diff --git a/.env.template b/.env.template
index 0a79348..adc70d7 100644
--- a/.env.template
+++ b/.env.template
@@ -1 +1 @@
-OPENSEARCH_INITIAL_ADMIN_PASSWORD=
\ No newline at end of file
+TYPESENSE_API_KEY=
\ No newline at end of file
diff --git a/README.md b/README.md
index 184701e..7910cbd 100644
--- a/README.md
+++ b/README.md
@@ -71,9 +71,8 @@
 You can get a backup of your social media data. I've included this for you:
 
 ![ou-mettre-fichiers-reseaux.png](images/ou-mettre-fichiers-reseaux.png)
 
-- Run the file that creates the indexes in the search engine: [00_creer_reseauxsociaux.py](import_data/00_creer_reseauxsociaux.py)
+- Run the file that creates the collection in the search engine: [00_creer_reseauxsociaux.py](import_data/00_creer_reseauxsociaux.py)
 - Then run all the relevant Python files to load the data you downloaded.
-- For LinkedIn comments, run the R program [32_importation_linkedin_comments.R](import_data/32_importation_linkedin_comments.R) before the Python program [32_importation_linkedin_comments.py](import_data/32_importation_linkedin_comments.py)
 
-You should then be able to see all the data you loaded in OpenSearch Dashboards at http://localhost:5601. Go to Discover / Search and find insights.
+Once the imports finish, the loaded data is available through the Typesense API at http://localhost:8108.
diff --git a/docker-compose.yml b/docker-compose.yml
index 9a2f350..0e3b29f 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,68 +1,19 @@
 ---
 version: '3'
 services:
-  opensearch-node1:
-    image: opensearchproject/opensearch:latest
-    container_name: opensearch-node1
+  typesense:
+    image: typesense/typesense:27.1
+    container_name: typesense
     environment:
-      - cluster.name=opensearch-cluster
-      - node.name=opensearch-node1
-      - discovery.seed_hosts=opensearch-node1,opensearch-node2
-      - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2
-      - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping
-      - OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m # minimum and maximum Java heap size, recommend setting both to 50% of system RAM
-      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD} # Sets the demo admin user password when using demo configuration, required for OpenSearch 2.12 and higher
-    ulimits:
-      memlock:
-        soft: -1
-        hard: -1
-      nofile:
-        soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems
-        hard: 65536
+      - TYPESENSE_API_KEY=${TYPESENSE_API_KEY}
+      - TYPESENSE_DATA_DIR=/data
     volumes:
-      - opensearch-data1:/usr/share/opensearch/data
+      - typesense-data:/data
     ports:
-      - 9200:9200
-      - 9600:9600 # required for Performance Analyzer
+      - "8108:8108"
     networks:
-      - opensearch-net
-  opensearch-node2:
-    image: opensearchproject/opensearch:latest
-    container_name: opensearch-node2
-    environment:
-      - cluster.name=opensearch-cluster
-      - node.name=opensearch-node2
-      - discovery.seed_hosts=opensearch-node1,opensearch-node2
-      - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2
-      - bootstrap.memory_lock=true
-      - OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m
-      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD}
-    ulimits:
-      memlock:
-        soft: -1
-        hard: -1
-      nofile:
-        soft: 65536
-        hard: 65536
-    volumes:
-      - opensearch-data2:/usr/share/opensearch/data
-    networks:
-      - opensearch-net
-  opensearch-dashboards:
-    image: opensearchproject/opensearch-dashboards:latest
-    container_name: opensearch-dashboards
-    ports:
-      - 5601:5601
-    expose:
-      - '5601'
-    environment:
-      OPENSEARCH_HOSTS: '["https://opensearch-node1:9200","https://opensearch-node2:9200"]'
-    networks:
-      - opensearch-net
+      - typesense-net
 volumes:
-  opensearch-data1:
-  opensearch-data2:
+  typesense-data:
 networks:
-  opensearch-net:
+  typesense-net:
\ No newline at end of file
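Before running any importer against the new service, it is worth confirming the container answers. A minimal sketch, assuming the compose file above is up and `.env` carries `TYPESENSE_API_KEY`; the `/health` endpoint and `collections.retrieve()` are standard Typesense API calls, and nothing here is part of the diff itself:

```python
import os

import requests
import typesense
from dotenv import load_dotenv

load_dotenv()

# Liveness probe; /health requires no API key
print(requests.get("http://localhost:8108/health").json())  # expected: {'ok': True}

# Authenticated call mirroring utils/typesense_client.py from this changeset
client = typesense.Client({
    'nodes': [{'host': 'localhost', 'port': '8108', 'protocol': 'http'}],
    'api_key': os.getenv('TYPESENSE_API_KEY'),
    'connection_timeout_seconds': 2
})
print([c['name'] for c in client.collections.retrieve()])  # [] on a fresh volume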
diff --git a/import_data/00_creer_reseauxsociaux.py b/import_data/00_creer_reseauxsociaux.py
index ab98ded..d5ba767 100644
--- a/import_data/00_creer_reseauxsociaux.py
+++ b/import_data/00_creer_reseauxsociaux.py
@@ -1,18 +1,35 @@
-import requests
-import utils.config
-from utils.opensearch import opensearch_client
-from utils.reseau_social_data import reseau_social_data as rs_data
+from typesense.exceptions import TypesenseClientError, ObjectAlreadyExists
-
-# %%
-rs_data
-
+from utils.typesense_client import client
+
+# Create a collection
+try:
+    client.collections.create({
+        'name': 'social_media_posts',
+        'fields': [
+            {'name': 'id', 'type': 'string'},
+            {'name': 'network', 'type': 'string', 'facet': True},
+            {'name': 'type', 'type': 'string', 'facet': True},
+            {'name': 'index', 'type': 'string', 'facet': True},
+            {'name': 'chemin', 'type': 'string'},
+            {'name': 'texte', 'type': 'string'},
+            {'name': 'creation_timestamp', 'type': 'int64'},
+            {
+                'name': 'embedding',
+                'type': 'float[]',
+                'embed': {
+                    'from': ['texte'],
+                    'model_config': {
+                        'model_name': 'ts/multilingual-e5-small'
+                    }
+                }
+            }
+        ],
+        'default_sorting_field': 'creation_timestamp'
+    })
+    print("Collection 'social_media_posts' created successfully.")
+except ObjectAlreadyExists:
+    print("Collection 'social_media_posts' already exists. Skipping creation.")
+except TypesenseClientError as e:
+    print(f"Error creating collection: {str(e)}")
+    raise
-# %%
-opensearch_client.info()
-# %%
-for rs in rs_data:
-    nom = rs.get("nom")
-    for repertoire in rs.get("repertoires", []):
-        index_name = f"rs_{nom}_{repertoire}".lower()
-        opensearch_client.indices.create(index=index_name)
-        print(f"Index '{index_name}' created")
-
diff --git a/import_data/00_delete_collection.py b/import_data/00_delete_collection.py
new file mode 100644
index 0000000..37c4f66
--- /dev/null
+++ b/import_data/00_delete_collection.py
@@ -0,0 +1,3 @@
+# Use only when needed, to delete the 'social_media_posts' collection in Typesense
+from utils.typesense_client import client
+client.collections['social_media_posts'].delete()
\ No newline at end of file
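With the collection in place, the auto-embedded `embedding` field lets a single search call combine keyword and semantic matching. A sketch only, assuming the collection has been populated by the importers below; the query string and facet filter are invented, while `query_by` over an auto-embed field is Typesense's documented hybrid-search mechanism:

```python
from utils.typesense_client import client

results = client.collections['social_media_posts'].documents.search({
    'q': 'protection des données',        # hypothetical query
    'query_by': 'texte,embedding',        # keyword match on texte, vector match on embedding
    'filter_by': 'network:=LinkedIn',     # hypothetical facet filter
    'per_page': 5
})
for hit in results['hits']:
    doc = hit['document']
    print(doc['creation_timestamp'], doc['texte'][:80])
```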
diff --git a/import_data/11_importation_facebook_page_publications.py b/import_data/11_importation_facebook_page_publications.py
index 69b9696..da86afc 100644
--- a/import_data/11_importation_facebook_page_publications.py
+++ b/import_data/11_importation_facebook_page_publications.py
@@ -1,21 +1,34 @@
-import datetime
-
 import pandas as pd
 import json
+import os
+from pathlib import Path
 
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-fb_data_path = ['data/FacebookBusiness/posts/profile_posts_1.json',
-                'data/FacebookBusiness/posts/uncategorized_photos.json',
-                'data/FacebookBusiness/posts/videos.json']
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+fb_data_path = [os.path.join(project_root, 'import_data', 'data',
+                             'FacebookBusiness', 'posts', 'profile_posts_1.json')]
 
-with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
-    posts_json = json.loads(convert_encoding_meta(posts.read()))
+try:
+    with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
+        posts_json = json.loads(convert_encoding_meta(posts.read()))
+except Exception as e:
+    print(f"Error reading JSON file: {e}")
+    exit(1)
 
-# In[ ]:
+#%% In[ ]:
 posts_medias = []
 for post in posts_json:
     # data
@@ -39,21 +52,14 @@ for post in posts_json:
                                  "texte": texte,
                                  "creation_timestamp": media["creation_timestamp"]})
 
-# In[ ]:
+#%% In[ ]:
 posts_medias_df = pd.DataFrame(posts_medias)
 
-# In[ ]:
-posts_medias_df['datepublication'] = posts_medias_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
-
-# In[ ]:
-del posts_medias_df['creation_timestamp']
-
-# In[ ]:
+#%% In[ ]:
 posts_medias_df.fillna(value="", inplace=True)
 
-# In[ ]:
-posts_medias_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+posts_medias_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(posts_medias_df)
diff --git a/import_data/12_importation_facebook_profil_comments.py b/import_data/12_importation_facebook_profil_comments.py
index edc9859..6b06d6b 100644
--- a/import_data/12_importation_facebook_profil_comments.py
+++ b/import_data/12_importation_facebook_profil_comments.py
@@ -1,18 +1,28 @@
-import datetime
-
 import pandas as pd
 import json
+import os
+from pathlib import Path
 
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-fb_data_path = ['data/Facebook/comments_and_reactions/comments.json']
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+fb_data_path = [os.path.join(project_root, 'import_data', 'data', 'Facebook', 'comments_and_reactions', 'comments.json')]
 
 with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
     comments_json = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 facebook_comments = []
 for comment in comments_json['comments_v2']:
     if comment.get('data'):
@@ -26,18 +36,8 @@ for comment in comments_json['comments_v2']:
                                   "texte": comment["comment"],
                                   "creation_timestamp": comment["timestamp"]})
 
-# In[ ]:
+#%% In[ ]:
 facebook_comments_df = pd.DataFrame(facebook_comments)
 
-# In[ ]:
-facebook_comments_df['datepublication'] = facebook_comments_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
-
-# In[ ]:
-facebook_comments_df.fillna(value="", inplace=True)
-
-# In[ ]:
-del facebook_comments_df['creation_timestamp']
-
-# In[ ]:
+#%% In[ ]:
 documents_to_database(facebook_comments_df)
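The `raw-unicode-escape` read plus `convert_encoding_meta` in these importers targets Meta's habit of emitting UTF-8 bytes as `\uXXXX` escapes in its JSON exports. The utility's body is not shown in this diff, so the following illustrates the usual latin-1 round-trip repair rather than the repo's actual implementation; the sample string is invented:

```python
# Mojibake as produced by Facebook/Instagram exports: UTF-8 bytes decoded as latin-1
garbled = "J\u00c3\u00a9r\u00c3\u00b4me a publi\u00c3\u00a9"
repaired = garbled.encode("latin-1").decode("utf-8")
print(repaired)  # Jérôme a publié
```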
diff --git a/import_data/13_importation_facebook_profil_uncategorized_photos.py b/import_data/13_importation_facebook_profil_uncategorized_photos.py
index 5a97878..eed3501 100644
--- a/import_data/13_importation_facebook_profil_uncategorized_photos.py
+++ b/import_data/13_importation_facebook_profil_uncategorized_photos.py
@@ -1,44 +1,51 @@
-import datetime
-
 import pandas as pd
 import json
+import os
+from pathlib import Path
 
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-fb_data_path = ['data/Facebook/posts/your_uncategorized_photos.json']
-with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+fb_data_path = os.path.join(project_root, 'import_data', 'data', 'Facebook', 'posts', 'your_uncategorized_photos.json')
+
+with open(fb_data_path, "r", encoding="raw-unicode-escape") as posts:
     photos_json = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 facebook_photos = photos_json['other_photos_v2']
 
-# In[ ]:
+#%% In[ ]:
 facebook_photos_df = pd.DataFrame(facebook_photos)
 
-# In[ ]:
+#%% In[ ]:
 # Filter out posts without a description
 facebook_photos_df = facebook_photos_df[~facebook_photos_df['description'].isnull()]
 
-# In[ ]:
-facebook_photos_df['datepublication'] = facebook_photos_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
+#%% In[ ]:
 facebook_photos_df['index'] = "rs_facebook_posts"
 facebook_photos_df['network'] = "Facebook"
 facebook_photos_df['type'] = "posts"
-facebook_photos_df['chemin'] = fb_data_path[0]
+facebook_photos_df['chemin'] = fb_data_path
 
-# In[ ]:
+#%% In[ ]:
 facebook_photos_df.rename(columns={"description": "texte"}, inplace=True)
 
-# In[ ]:
-del facebook_photos_df['creation_timestamp']
+#%% In[ ]:
 del facebook_photos_df['media_metadata']
 
-# In[ ]:
+#%% In[ ]:
 facebook_photos_df.fillna(value="", inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(facebook_photos_df)
diff --git a/import_data/21_importation_instagram_publications.py b/import_data/21_importation_instagram_publications.py
index 7902b6a..28dffe8 100644
--- a/import_data/21_importation_instagram_publications.py
+++ b/import_data/21_importation_instagram_publications.py
@@ -1,17 +1,28 @@
-import datetime
-
 import pandas as pd
 import json
+import os
+from pathlib import Path
 
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/content/posts_1.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'content', 'posts_1.json')
+
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     posts_json = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 posts_medias = []
 for post in posts_json:
     medias = post['media']
@@ -45,25 +56,18 @@ for post in posts_json:
                                  "texte": title,
                                  "creation_timestamp": creation_timestamp})
 
-# In[ ]:
+#%% In[ ]:
 posts_medias_df = pd.DataFrame(posts_medias)
 
-# In[ ]:
-posts_medias_df['datepublication'] = posts_medias_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
-
-# In[ ]:
-del posts_medias_df['creation_timestamp']
-
-# In[ ]:
+#%% In[ ]:
 posts_medias_df.fillna(value="", inplace=True)
 
-# In[ ]:
-posts_medias_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+posts_medias_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 posts_medias_df = posts_medias_df[~posts_medias_df['texte'].str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(posts_medias_df)
diff --git a/import_data/22_importation_instagram_reels.py b/import_data/22_importation_instagram_reels.py
index 974e189..56d5ec7 100644
--- a/import_data/22_importation_instagram_reels.py
+++ b/import_data/22_importation_instagram_reels.py
@@ -1,49 +1,55 @@
-import datetime
-
 import pandas as pd
 import json
+import os
+from pathlib import Path
 
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/content/reels.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'content', 'reels.json')
+
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     reels_json = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 ig_reels_media = [x['media'][0] for x in reels_json['ig_reels_media']]
 
-# In[ ]:
+#%% In[ ]:
 ig_reels_df = pd.DataFrame(ig_reels_media)
 
-# In[ ]:
-ig_reels_df['datepublication'] = ig_reels_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
+#%% In[ ]:
 ig_reels_df['index'] = "rs_instagram_content"
 ig_reels_df['type'] = "reels"
 ig_reels_df['network'] = "Instagram"
 ig_reels_df['chemin'] = instagram_data_path
 
-# In[ ]:
+#%% In[ ]:
 ig_reels_df.rename(columns={"title": "texte"}, inplace=True)
 
-# In[ ]:
-del ig_reels_df['creation_timestamp']
+#%% In[ ]:
 del ig_reels_df['media_metadata']
 del ig_reels_df['cross_post_source']
 del ig_reels_df['dubbing_info']
 
-# In[ ]:
+#%% In[ ]:
 ig_reels_df.fillna(value="", inplace=True)
 
-# In[ ]:
-ig_reels_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+ig_reels_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 ig_reels_df = ig_reels_df[~ig_reels_df['texte'].str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(ig_reels_df)
diff --git a/import_data/23_importation_instagram_stories.py b/import_data/23_importation_instagram_stories.py
index 09702e0..b74739c 100644
--- a/import_data/23_importation_instagram_stories.py
+++ b/import_data/23_importation_instagram_stories.py
@@ -1,49 +1,52 @@
-import datetime
-
 import pandas as pd
 import json
+import os
+from pathlib import Path
 
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/content/stories.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'content', 'stories.json')
+
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     stories_json = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 ig_stories_df = pd.DataFrame(stories_json['ig_stories'])
 
-# In[ ]:
-ig_stories_df['datepublication'] = ig_stories_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
-
-# In[ ]:
+#%% In[ ]:
 ig_stories_df['index'] = "rs_instagram_content"
 ig_stories_df['type'] = "stories"
 ig_stories_df['network'] = "Instagram"
 ig_stories_df['chemin'] = instagram_data_path
 
-# In[ ]:
+#%% In[ ]:
 ig_stories_df.rename(columns={"title": "texte"}, inplace=True)
 
-# In[ ]:
-del ig_stories_df['creation_timestamp']
+#%% In[ ]:
 del ig_stories_df['media_metadata']
 del ig_stories_df['cross_post_source']
 del ig_stories_df['ai_stickers']
 del ig_stories_df['dubbing_info']
 
-# In[ ]:
+#%% In[ ]:
 ig_stories_df.fillna(value="", inplace=True)
 
-# In[ ]:
-ig_stories_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+ig_stories_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 ig_stories_df = ig_stories_df[~ig_stories_df['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(ig_stories_df)
diff --git a/import_data/24_importation_instagram_post_comments.py b/import_data/24_importation_instagram_post_comments.py
index 5292924..9721595 100644
--- a/import_data/24_importation_instagram_post_comments.py
+++ b/import_data/24_importation_instagram_post_comments.py
@@ -1,39 +1,48 @@
-import datetime
-
 import pandas as pd
 import json
+import os
+from pathlib import Path
 
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/comments/post_comments_1.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'comments', 'post_comments_1.json')
+
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     post_comments_1 = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 ig_comments = []
 for comment in post_comments_1:
     ig_comments.append({"texte": comment['string_map_data']['Comment']['value'],
-                        'datepublication': datetime.datetime.fromtimestamp(
-                            timestamp=comment['string_map_data']['Time']['timestamp']).isoformat(),
+                        'creation_timestamp': comment['string_map_data']['Time']['timestamp'],
                         "chemin": instagram_data_path,
                         "index": "rs_instagram_comments",
                         "type": "comments",
                         "network": "Instagram"})
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df = pd.DataFrame(ig_comments)
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df.fillna(value="", inplace=True)
 
-# In[ ]:
-ig_comments_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+ig_comments_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 ig_comments_df = ig_comments_df[~ig_comments_df['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(ig_comments_df)
diff --git a/import_data/25_importation_instagram_reels_comments.py b/import_data/25_importation_instagram_reels_comments.py
index 83bf0ea..5149616 100644
--- a/import_data/25_importation_instagram_reels_comments.py
+++ b/import_data/25_importation_instagram_reels_comments.py
@@ -1,40 +1,48 @@
-import datetime
-
 import pandas as pd
 import json
+import os
+from pathlib import Path
 
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/comments/reels_comments.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'comments', 'reels_comments.json')
+
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     reels_comments = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 ig_comments = []
 for comment in reels_comments['comments_reels_comments']:
     ig_comments.append({"texte": comment['string_map_data']['Comment']['value'],
-                        'datepublication': datetime.datetime.fromtimestamp(
-                            timestamp=comment['string_map_data']['Time']['timestamp']).isoformat(),
+                        'creation_timestamp': comment['string_map_data']['Time']['timestamp'],
                         "chemin": instagram_data_path,
                         "index": "rs_instagram_comments",
                         "type": "comments",
                         "network": "Instagram"})
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df = pd.DataFrame(ig_comments)
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df.fillna(value="", inplace=True)
 
-# In[ ]:
-ig_comments_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+ig_comments_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 ig_comments_df = ig_comments_df[~ig_comments_df['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(ig_comments_df)
diff --git a/import_data/26_importation_threads.py b/import_data/26_importation_threads.py
index 34f899d..0831a91 100644
--- a/import_data/26_importation_threads.py
+++ b/import_data/26_importation_threads.py
@@ -1,40 +1,49 @@
-import datetime
-
 import pandas as pd
 import json
+import os
+from pathlib import Path
 
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/threads/threads_and_replies.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'threads', 'threads_and_replies.json')
+
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     post_comments_1 = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 threads_comments = []
 for post in post_comments_1['text_post_app_text_posts']:
    for element in post['media']:
        threads_comments.append({"texte": element['title'],
-                                'datepublication': datetime.datetime.fromtimestamp(
-                                    timestamp=element['creation_timestamp']).isoformat(),
+                                'creation_timestamp': element['creation_timestamp'],
                                 "chemin": instagram_data_path,
                                 "index": "rs_instagram_threads",
                                 "type": "posts",
                                 "network": "Instagram"})
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df = pd.DataFrame(threads_comments)
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df.fillna(value="", inplace=True)
 
-# In[ ]:
-ig_comments_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+ig_comments_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 ig_comments_df = ig_comments_df[~ig_comments_df['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(ig_comments_df)
diff --git a/import_data/31_importation_linkedin_shares.py b/import_data/31_importation_linkedin_shares.py
index 1645b8a..52094b8 100644
--- a/import_data/31_importation_linkedin_shares.py
+++ b/import_data/31_importation_linkedin_shares.py
@@ -1,44 +1,58 @@
 import pandas as pd
 import datetime
+import os
+from pathlib import Path
+
 from utils.documents_to_database import documents_to_database
 
-# In[ ]:
-linkedin_data_path = "data/LinkedIn/shares/Shares.csv"
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+linkedin_data_path = os.path.join(project_root, 'import_data', 'data', 'LinkedIn', 'shares', 'Shares.csv')
+
 raw_shares = pd.read_csv(linkedin_data_path)
 
-# In[ ]:
+#%% In[ ]:
 raw_shares['index'] = "rs_linkedin_shares"
 raw_shares['type'] = "posts"
 raw_shares['network'] = "LinkedIn"
 raw_shares['chemin'] = linkedin_data_path
 
-# In[ ]:
-raw_shares["datepublication"] = raw_shares["Date"].apply(
-    lambda x: str(datetime.datetime.fromisoformat(x).isoformat()))
+#%% In[ ]:
+raw_shares["creation_timestamp"] = raw_shares["Date"].apply(
+    lambda x: int(datetime.datetime.fromisoformat(x).timestamp())
+)
 del raw_shares["Date"]
 
-# In[ ]:
+#%% In[ ]:
 raw_shares.rename(columns={"ShareLink": "uri", "ShareCommentary": "texte"}, inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 raw_shares["texte"] = raw_shares["texte"].apply(lambda x: str(x))
 
-# In[ ]:
+#%% In[ ]:
 del raw_shares["SharedUrl"]
 del raw_shares["MediaUrl"]
 del raw_shares["Visibility"]
 
-# In[ ]:
+#%% In[ ]:
 raw_shares.fillna(value="", inplace=True)
 
-# In[ ]:
-raw_shares.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+raw_shares.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 raw_shares = raw_shares[~raw_shares['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(raw_shares)
diff --git a/import_data/32_importation_linkedin_comments.R b/import_data/32_importation_linkedin_comments.R
deleted file mode 100644
index eb6c855..0000000
--- a/import_data/32_importation_linkedin_comments.R
+++ /dev/null
@@ -1,19 +0,0 @@
-linkedin_data_path <- "import_data/data/LinkedIn/comments/Comments.csv"
-library("readr")
-library("magrittr")
-library("dplyr")
-
-# Read CSV file
-ddd <- readr::read_delim(linkedin_data_path,
-                         escape_backslash = TRUE,
-                         trim_ws = TRUE,
-                         skip_empty_rows = FALSE,
-                         delim = ",")
-
-# Remove carriage returns
-ddd %>%
-  mutate(MessageFix = Message %>% stringr::str_replace_all(pattern = "[\r\n\t]+", replacement = " ")) %>%
-  select(-Message) -> ddd2
-
-# Save the cleaned data to a new CSV file
-ddd2 %>% write.csv("import_data/data/LinkedIn/comments/Comments-FIX.csv", row.names = FALSE)
diff --git a/import_data/32_importation_linkedin_comments.py b/import_data/32_importation_linkedin_comments.py
index 8e91cb2..e235afc 100644
--- a/import_data/32_importation_linkedin_comments.py
+++ b/import_data/32_importation_linkedin_comments.py
@@ -1,41 +1,58 @@
 import pandas as pd
 import datetime
+import os
+from pathlib import Path
+
 from utils.documents_to_database import documents_to_database
 
-# In[ ]:
-linkedin_data_path = "data/LinkedIn/comments/Comments-FIX.csv"
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
 
-# In[ ]:
-raw_comments_csv = pd.read_csv(linkedin_data_path, encoding='utf-8')
+project_root = script_dir
+linkedin_data_path = os.path.join(project_root, 'import_data', 'data', 'LinkedIn', 'comments', 'Comments.csv')
+
+#%% In[ ]:
+raw_comments_csv = pd.read_csv(linkedin_data_path,
+                               escapechar='\\',
+                               skipinitialspace=True)
+raw_comments_csv['MessageFix'] = raw_comments_csv['Message'].str.replace(r'[\r\n\t]+', ' ', regex=True)
+raw_comments_csv = raw_comments_csv.drop(columns=['Message'])
 raw_comments = raw_comments_csv[(raw_comments_csv['MessageFix'] != "")].drop_duplicates()
 
-# In[ ]:
+#%% In[ ]:
 raw_comments['index'] = "rs_linkedin_comments"
 raw_comments['type'] = "comments"
 raw_comments['network'] = "LinkedIn"
 raw_comments['chemin'] = linkedin_data_path
 
-# In[ ]:
-raw_comments["datepublication"] = raw_comments["Date"].apply(
-    lambda x: str(datetime.datetime.fromisoformat(str(x)).isoformat()))
+#%% In[ ]:
+raw_comments["creation_timestamp"] = raw_comments["Date"].apply(
+    lambda x: int(datetime.datetime.fromisoformat(str(x)).timestamp())
+)
 del raw_comments["Date"]
 
-# In[ ]:
+#%% In[ ]:
 raw_comments.rename(columns={"Link": "uri", "MessageFix": "texte"}, inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 raw_comments["chemin"] = linkedin_data_path
 
-# In[ ]:
+#%% In[ ]:
 raw_comments.fillna(value="", inplace=True)
 
-# In[ ]:
-raw_comments.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+raw_comments.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 raw_comments = raw_comments[~raw_comments['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(raw_comments)
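The pandas block above folds the deleted R script's cleanup into the importer itself. Here is a self-contained round trip on an invented CSV row showing the same `[\r\n\t]+` normalization; the column names follow the LinkedIn export, the content is hypothetical:

```python
import io

import pandas as pd

# Hypothetical Comments.csv excerpt with an embedded newline and tab in the quoted field
csv_text = 'Date,Link,Message\n2024-05-01 10:00:00,https://example.com/p1,"Line one\nLine two\tend"\n'

df = pd.read_csv(io.StringIO(csv_text), escapechar='\\', skipinitialspace=True)
df['MessageFix'] = df['Message'].str.replace(r'[\r\n\t]+', ' ', regex=True)
print(df['MessageFix'].iloc[0])  # Line one Line two end
```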
diff --git a/import_data/41_importation_wordpress.py b/import_data/41_importation_wordpress.py
index 1307bb9..a3e9110 100644
--- a/import_data/41_importation_wordpress.py
+++ b/import_data/41_importation_wordpress.py
@@ -5,29 +5,42 @@ import xmltodict
 import pandas as pd
 import markdownify
+import os
+from pathlib import Path
+
 from utils.documents_to_database import documents_to_database
 
-# In[ ]:
-wordpress_xml_path = "data/Wordpress/jevalideca/wordpress.xml"
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+wordpress_xml_path = os.path.join(project_root, 'import_data', 'data', 'Wordpress', 'jevalideca', 'wordpress.xml')
+
 with open(wordpress_xml_path, "r") as xml_file:
     wordpress_xml = xml_file.read()
 
-# In[ ]:
+#%% In[ ]:
 wordpress_dict = xmltodict.parse(wordpress_xml)
 
-# In[ ]:
+#%% In[ ]:
 items_df = pd.DataFrame(wordpress_dict['rss']['channel']['item'])
 
-# In[ ]:
+#%% In[ ]:
 items_df_filter = items_df[
     (items_df['wp:post_type'].isin(['page', 'post'])) &
     (items_df['wp:status'] == 'publish')].copy()
 
-# In[ ]:
-items_df_filter['datepublication'] = items_df_filter['wp:post_date'].apply(
-    lambda x: str(datetime.datetime.fromisoformat(x).isoformat()))
+#%% In[ ]:
+items_df_filter['creation_timestamp'] = items_df_filter['wp:post_date'].apply(
+    lambda x: int(datetime.datetime.fromisoformat(x).timestamp()))
 
-# In[ ]:
+#%% In[ ]:
 def wp_to_markdown(x):
     try:
         md_text = re.sub(r'\n+', ' ', markdownify.markdownify(x, heading_style='ATX')).strip()
@@ -38,25 +51,25 @@ def wp_to_markdown(x):
     return md_text
 
-# In[ ]:
+#%% In[ ]:
 items_df_filter['texte'] = items_df_filter['content:encoded'].apply(lambda x: wp_to_markdown(x))
 
-# In[ ]:
+#%% In[ ]:
 items_df_filter.rename(columns={"link": "uri", "wp:post_type": "type"}, inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 items_df_filter['index'] = "rs_wordpress_jevalideca"
 items_df_filter['network'] = "Wordpress"
 items_df_filter['chemin'] = wordpress_xml_path
 
-# In[ ]:
+#%% In[ ]:
 items_df_filter.fillna(value="", inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(items_df_filter[['title',
                                        'uri',
                                        'type',
-                                       'datepublication',
+                                       'creation_timestamp',
                                        'texte',
                                        'index',
                                        'network',
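For reference, `wp_to_markdown` flattens each post's HTML into single-line Markdown before indexing. A minimal run on an invented post body; `markdownify` and its `heading_style='ATX'` option are the library's real API:

```python
import re

import markdownify

html = "<h2>Un titre</h2><p>Premier paragraphe.</p><p>Deuxième paragraphe.</p>"
md_text = re.sub(r'\n+', ' ', markdownify.markdownify(html, heading_style='ATX')).strip()
print(md_text)  # ## Un titre Premier paragraphe. Deuxième paragraphe.
```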
diff --git a/import_data/requirements.txt b/import_data/requirements.txt
index d57dddc..f93e4de 100644
--- a/import_data/requirements.txt
+++ b/import_data/requirements.txt
@@ -4,3 +4,4 @@ requests==2.31.0
 xmltodict==0.13.0
 python_dotenv==1.0.1
 pyarrow==17.0.0
+typesense==0.21.0
diff --git a/import_data/run_all_imports.sh b/import_data/run_all_imports.sh
new file mode 100644
index 0000000..c70d27e
--- /dev/null
+++ b/import_data/run_all_imports.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Navigate to the directory containing the scripts
+cd "$(dirname "$0")" || exit
+
+# Find and execute all Python scripts matching the pattern
+for script in [0-9][0-9]_importation_*.py
+do
+    if [ -f "$script" ]; then
+        echo "Running $script..."
+        python3 "$script"
+        echo "Finished $script"
+        echo "--------------------"
+    fi
+done
+
+echo "All importation scripts have been executed."
\ No newline at end of file
diff --git a/import_data/utils/documents_to_database.py b/import_data/utils/documents_to_database.py
index aa22cdf..cbcdbe9 100644
--- a/import_data/utils/documents_to_database.py
+++ b/import_data/utils/documents_to_database.py
@@ -1,20 +1,11 @@
-import pandas as pd
-import requests
+import tqdm
 
-from utils.opensearch import opensearch_client
+from .typesense_client import client
 
-
-def documents_to_database(documents_list, os_client=opensearch_client):
-    # Check if opensearch is available
-    if not os_client.ping():
-        raise requests.exceptions.ConnectionError("Opensearch is not reachable")
-    # Check if the specified index exists
-    if not os_client.indices.exists(index=documents_list['index'].iloc[0]):
-        raise requests.exceptions.HTTPError(f"Index '{documents_list['index'].iloc[0]}' does not exist")
-    # Insert each document into opensearch index(es)
-    for document in documents_list.to_dict(orient='records'):
-        index_name = document.pop('index', None)
-        if not index_name:
-            raise ValueError("Document must have an 'index' field")
-        os_client.index(index=index_name,
-                        body=document)
+def documents_to_database(documents_list, ts_client=client):
+    try:
+        for document in tqdm.tqdm(documents_list.to_dict(orient='records')):
+            ts_client.collections['social_media_posts'].documents.create(document)
+        print(f"Successfully inserted {len(documents_list)} documents.")
+    except Exception as e:
+        print(f"Error inserting documents: {str(e)}")
\ No newline at end of file
diff --git a/import_data/utils/opensearch.py b/import_data/utils/opensearch.py
deleted file mode 100644
index 2c6d5a3..0000000
--- a/import_data/utils/opensearch.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import os
-import dotenv
-
-# Load environment variables from .env file
-dotenv.load_dotenv()
-
-# Connect to OpenSearch using the provided credentials and hostname/port.
-from opensearchpy import OpenSearch
-
-host = 'localhost'
-port = 9200
-auth = ('admin', os.getenv("OPENSEARCH_INITIAL_ADMIN_PASSWORD"))  # For testing only. Don't store credentials in code.
-
-# Create the client with SSL/TLS enabled, but hostname verification disabled.
-opensearch_client = OpenSearch(
-    hosts=[{'host': host, 'port': port}],
-    http_compress=True,  # enables gzip compression for request bodies
-    http_auth=auth,
-    use_ssl=True,
-    verify_certs=False,
-    ssl_assert_hostname=False,
-    ssl_show_warn=False
-)
diff --git a/import_data/utils/reseau_social_data.py b/import_data/utils/reseau_social_data.py
index ae08bda..e87bfec 100644
--- a/import_data/utils/reseau_social_data.py
+++ b/import_data/utils/reseau_social_data.py
@@ -11,4 +11,7 @@ reseau_social_data = [{"nom": "LinkedIn",
                       {"nom": "Facebook",
                        "repertoires": ["comments_and_reactions", "posts"]},
                       {"nom": "FacebookBusiness",
-                       "repertoires": ["posts"]}]
+                       "repertoires": ["posts"]},
+                      {"nom": "Podcast",
+                       "repertoires": ["shownotes", "audio"]}
+                      ]
diff --git a/import_data/utils/typesense_client.py b/import_data/utils/typesense_client.py
new file mode 100644
index 0000000..dc2f8c4
--- /dev/null
+++ b/import_data/utils/typesense_client.py
@@ -0,0 +1,15 @@
+import typesense
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+client = typesense.Client({
+    'nodes': [{
+        'host': 'localhost',
+        'port': '8108',
+        'protocol': 'http'
+    }],
+    'api_key': os.getenv('TYPESENSE_API_KEY'),
+    'connection_timeout_seconds': 2
+})
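One possible follow-up to this changeset: `documents_to_database` issues one HTTP request per document. typesense-py also exposes a bulk endpoint; a sketch of a batched variant under the same collection name (the batch size and function name are illustrative, not part of the diff):

```python
from utils.typesense_client import client

def documents_to_database_bulk(documents_list, batch_size=100):
    """Insert a DataFrame in batches; Typesense returns one status dict per document."""
    records = documents_list.to_dict(orient='records')
    for start in range(0, len(records), batch_size):
        batch = records[start:start + batch_size]
        results = client.collections['social_media_posts'].documents.import_(
            batch, {'action': 'upsert'})
        failed = [r for r in results if not r.get('success')]
        if failed:
            print(f"{len(failed)} documents failed in batch starting at row {start}")
```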