import datetime
import json

import pandas as pd

from utils.get_ids import get_idtypedocument, get_idreseausocial
from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta

# Loads the Facebook Business "profile posts" export, keeps one record per
# (post text, media) pair, and hands the de-duplicated records to
# documents_to_database().

# In[ ]:

# Candidate export files; only the first entry (profile posts) is read below.
fb_data_path = ['data/FacebookBusiness/posts/profile_posts_1.json',
                'data/FacebookBusiness/posts/uncategorized_photos.json',
                'data/FacebookBusiness/posts/videos.json']

# NOTE(review): convert_encoding_meta presumably repairs the export's text
# encoding before JSON parsing -- confirm against utils.convert_encoding_meta.
with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
    posts_json = json.loads(convert_encoding_meta(posts.read()))

# In[ ]:

# Column order of the collected records (matches the dict built below).
_RECORD_COLUMNS = ["network", "type", "index", "chemin", "texte",
                   "creation_timestamp"]

posts_medias = []
for post in posts_json:
    # Post text: every non-empty 'post' entry of the 'data' list, one per line.
    # `.get(...) or []` tolerates posts where the key is missing or null
    # instead of aborting the whole import with a KeyError.
    texte = "\n".join(
        item['post'] for item in (post.get('data') or []) if item.get('post')
    )

    # Posts whose text is empty or a single character are not kept.
    if len(texte) <= 1:
        continue

    # One record per media found in the post's attachments.
    for attachment in post.get('attachments') or []:
        for data_item in attachment.get('data') or []:
            media = data_item.get('media')
            if not media:
                continue
            posts_medias.append({"network": "FacebookBusiness",
                                 "type": "posts",
                                 "index": "rs_facebookbusiness_posts",
                                 "chemin": fb_data_path[0],
                                 "texte": texte,
                                 "creation_timestamp": media["creation_timestamp"]})

# In[ ]:

# Explicit columns keep the frame well-formed even when no record was
# collected (a bare DataFrame([]) would have no 'creation_timestamp' column).
posts_medias_df = pd.DataFrame(posts_medias, columns=_RECORD_COLUMNS)

# In[ ]:

# Epoch seconds -> ISO-8601 string (naive, in the machine's local timezone).
posts_medias_df['datepublication'] = posts_medias_df['creation_timestamp'].apply(
    lambda x: datetime.datetime.fromtimestamp(x).isoformat())

# In[ ]:

del posts_medias_df['creation_timestamp']

# In[ ]:

posts_medias_df.fillna(value="", inplace=True)

# In[ ]:

# A post with several media yields identical (text, date) rows when the media
# share a timestamp; keep a single one.
posts_medias_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)

# In[ ]:

# Nothing to store when the export contained no usable post.
if not posts_medias_df.empty:
    documents_to_database(posts_medias_df)