"""Import the text of Facebook Business profile posts into the database.

The script reads ``profile_posts_1.json`` from the Facebook Business export,
collects the text of every post that carries at least one media attachment,
de-duplicates the resulting rows and passes them to ``documents_to_database``.
"""
import pandas as pd
import json
import os
from pathlib import Path

from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta

#%% In[ ]:

# Locate the project root relative to this file.
try:
    # Works when running as a script.
    script_dir = Path(__file__).parent.parent
except NameError:
    # __file__ is undefined in interactive environments; use the CWD instead.
    script_dir = Path().absolute()
project_root = script_dir

# Kept as a one-element list of str, as in the original script.
fb_data_path = [
    str(
        Path(project_root)
        / 'import_data' / 'data' / 'FacebookBusiness' / 'posts'
        / 'profile_posts_1.json'
    )
]

try:
    with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
        posts_json = json.loads(convert_encoding_meta(posts.read()))
except Exception as e:
    print(f"Error reading JSON file: {e}")
    # `exit()` is only injected by the `site` module; SystemExit is always
    # available and yields the same exit status.
    raise SystemExit(1)

#%% In[ ]:

# Column order of the records built below. Passing it to the DataFrame
# constructor guarantees the columns exist even when no record is collected.
RECORD_COLUMNS = [
    "network", "type", "index", "chemin", "texte", "creation_timestamp",
]

posts_medias = []
for post in posts_json:
    # Text: join every non-empty 'post' entry found in the post's data items.
    # A post without a 'data' key is treated as having no text.
    texte = "\n".join(
        item['post'] for item in (post.get('data') or []) if item.get('post')
    )
    # Posts whose text is empty or a single character are not imported.
    if len(texte) <= 1:
        continue

    # Attachments: one record per media item. A post without an
    # 'attachments' key simply contributes no record.
    for attachment in post.get('attachments') or []:
        for data_item in attachment.get('data') or []:
            media = data_item.get('media')
            if not media:
                continue
            posts_medias.append({
                "network": "FacebookBusiness",
                "type": "posts",
                "index": "rs_facebookbusiness_posts",
                "chemin": fb_data_path[0],
                "texte": texte,
                "creation_timestamp": media["creation_timestamp"],
            })

#%% In[ ]:

posts_medias_df = pd.DataFrame(posts_medias, columns=RECORD_COLUMNS)

#%% In[ ]:

if posts_medias_df.empty:
    # Nothing to de-duplicate or store.
    print("No Facebook Business post with media found; nothing to import.")
else:
    posts_medias_df = posts_medias_df.fillna(value="")
    # A post with several media items sharing a timestamp yields identical
    # rows; keep only the first of each (texte, creation_timestamp) pair.
    posts_medias_df = posts_medias_df.drop_duplicates(
        subset=['texte', 'creation_timestamp']
    )
    documents_to_database(posts_medias_df)