Import documents into Typesense

François Pelletier 2024-10-02 21:53:37 -04:00
parent f4acc32451
commit 7a74dbf413
24 changed files with 390 additions and 332 deletions

View file

@@ -1 +1 @@
-OPENSEARCH_INITIAL_ADMIN_PASSWORD=
+TYPESENSE_API_KEY=

View file

@@ -71,9 +71,7 @@ You can get a backup of your social media data. I put
 ![ou-mettre-fichiers-reseaux.png](images/ou-mettre-fichiers-reseaux.png)
 - Run the file that creates the indexes in the search engine: [00_creer_reseauxsociaux.py](import_data/00_creer_reseauxsociaux.py)
 - Then run all the relevant Python files to load the data you downloaded.
-- For LinkedIn comments, you must run the R program [32_importation_linkedin_comments.R](import_data/32_importation_linkedin_comments.R) before the Python program [32_importation_linkedin_comments.py](import_data/32_importation_linkedin_comments.py)
 Then you should be able to see all the data you loaded in OpenSearch Dashboards at http://localhost:5601. Go to Discover / Search and find insights.
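Note that the README line above still points to OpenSearch Dashboards on port 5601, while this commit moves storage to Typesense, which ships no dashboard. Not part of this commit: a minimal sketch for confirming the import worked by querying the collection directly, assuming the utils/typesense_client.py module added below and a hypothetical query term:

# Not part of this commit: quick check that documents landed in Typesense.
from utils.typesense_client import client

# Collection metadata; num_documents should be > 0 after the imports.
info = client.collections['social_media_posts'].retrieve()
print(f"{info['num_documents']} documents in 'social_media_posts'")

# Keyword search over the 'texte' field, faceted by network ('podcast' is a
# placeholder query).
results = client.collections['social_media_posts'].documents.search({
    'q': 'podcast',
    'query_by': 'texte',
    'facet_by': 'network'
})
print(f"{results['found']} hits")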

View file

@@ -1,68 +1,21 @@
 ---
 version: '3'
 services:
-  opensearch-node1:
-    image: opensearchproject/opensearch:latest
-    container_name: opensearch-node1
+  typesense:
+    image: typesense/typesense:27.1
+    container_name: typesense
     environment:
-      - cluster.name=opensearch-cluster
-      - node.name=opensearch-node1
-      - discovery.seed_hosts=opensearch-node1,opensearch-node2
-      - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2
-      - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping
-      - OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m # minimum and maximum Java heap size, recommend setting both to 50% of system RAM
-      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD} # Sets the demo admin user password when using demo configuration, required for OpenSearch 2.12 and higher
-    ulimits:
-      memlock:
-        soft: -1
-        hard: -1
-      nofile:
-        soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems
-        hard: 65536
+      - TYPESENSE_API_KEY=${TYPESENSE_API_KEY}
+      - TYPESENSE_DATA_DIR=/data
     volumes:
-      - opensearch-data1:/usr/share/opensearch/data
+      - typesense-data:/data
     ports:
-      - 9200:9200
-      - 9600:9600 # required for Performance Analyzer
+      - "8108:8108"
     networks:
-      - opensearch-net
-  opensearch-node2:
-    image: opensearchproject/opensearch:latest
-    container_name: opensearch-node2
-    environment:
-      - cluster.name=opensearch-cluster
-      - node.name=opensearch-node2
-      - discovery.seed_hosts=opensearch-node1,opensearch-node2
-      - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2
-      - bootstrap.memory_lock=true
-      - OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m
-      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD}
-    ulimits:
-      memlock:
-        soft: -1
-        hard: -1
-      nofile:
-        soft: 65536
-        hard: 65536
-    volumes:
-      - opensearch-data2:/usr/share/opensearch/data
-    networks:
-      - opensearch-net
-  opensearch-dashboards:
-    image: opensearchproject/opensearch-dashboards:latest
-    container_name: opensearch-dashboards
-    ports:
-      - 5601:5601
-    expose:
-      - '5601'
-    environment:
-      OPENSEARCH_HOSTS: '["https://opensearch-node1:9200","https://opensearch-node2:9200"]'
-    networks:
-      - opensearch-net
+      - typesense-net
 volumes:
-  opensearch-data1:
-  opensearch-data2:
+  typesense-data:
 networks:
-  opensearch-net:
+  typesense-net:
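Not part of this commit: once the container is up (docker compose up -d), Typesense exposes a /health endpoint on the mapped port; a minimal check, assuming the default localhost:8108 mapping above:

# Not part of this commit: verify the Typesense container responds.
import requests

resp = requests.get("http://localhost:8108/health", timeout=2)
print(resp.json())  # expected: {"ok": true}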

View file

@@ -1,18 +1,40 @@
-import requests
-import utils.config
-from utils.opensearch import opensearch_client
-from utils.reseau_social_data import reseau_social_data as rs_data
-
-# %%
-rs_data
-
-# %%
-opensearch_client.info()
-
-# %%
-for rs in rs_data:
-    nom = rs.get("nom")
-    for repertoire in rs.get("repertoires", []):
-        index_name = f"rs_{nom}_{repertoire}".lower()
-        opensearch_client.indices.create(index=index_name)
-        print(f"Index '{index_name}' created")
+from typesense.exceptions import TypesenseClientError, ObjectAlreadyExists
+
+from utils.typesense_client import client
+
+# Create a collection
+try:
+    client.collections.create({
+        'name': 'social_media_posts',
+        'fields': [
+            {'name': 'id', 'type': 'string'},
+            {'name': 'network', 'type': 'string', 'facet': True},
+            {'name': 'type', 'type': 'string', 'facet': True},
+            {'name': 'index', 'type': 'string', 'facet': True},
+            {'name': 'chemin', 'type': 'string'},
+            {'name': 'texte', 'type': 'string'},
+            {'name': 'creation_timestamp', 'type': 'int64'},
+            {
+                'name': 'embedding',
+                'type': 'float[]',
+                'embed': {
+                    'from': ['texte'],
+                    'model_config': {
+                        'model_name': 'ts/multilingual-e5-small'
+                    }
+                }
+            }
+        ],
+        'default_sorting_field': 'creation_timestamp'
+    })
+    print("Collection 'social_media_posts' created successfully.")
+except TypesenseClientError as e:
+    if isinstance(e, ObjectAlreadyExists):
+        print("Collection 'social_media_posts' already exists. Skipping creation.")
+    else:
+        print(f"Error creating collection: {str(e)}")
+        raise

View file

@@ -0,0 +1,3 @@
+# Use only if needed, to delete the 'social_media_posts' collection in Typesense
+from utils.typesense_client import client
+client.collections['social_media_posts'].delete()

View file

@@ -1,21 +1,34 @@
-import datetime
 import pandas as pd
 import json
+import os
+from pathlib import Path
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-fb_data_path = ['data/FacebookBusiness/posts/profile_posts_1.json',
-                'data/FacebookBusiness/posts/uncategorized_photos.json',
-                'data/FacebookBusiness/posts/videos.json']
-with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
-    posts_json = json.loads(convert_encoding_meta(posts.read()))
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+fb_data_path = [os.path.join(project_root, 'import_data', 'data', 'FacebookBusiness', 'posts', 'profile_posts_1.json')]
+try:
+    with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
+        posts_json = json.loads(convert_encoding_meta(posts.read()))
+except Exception as e:
+    print(f"Error reading JSON file: {e}")
+    exit(1)
 
-# In[ ]:
+#%% In[ ]:
 posts_medias = []
 for post in posts_json:
     # data
@@ -39,21 +52,14 @@ for post in posts_json:
                              "texte": texte,
                              "creation_timestamp": media["creation_timestamp"]})
 
-# In[ ]:
+#%% In[ ]:
 posts_medias_df = pd.DataFrame(posts_medias)
 
-# In[ ]:
-posts_medias_df['datepublication'] = posts_medias_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
-
-# In[ ]:
-del posts_medias_df['creation_timestamp']
-
-# In[ ]:
+#%% In[ ]:
 posts_medias_df.fillna(value="", inplace=True)
 
-# In[ ]:
-posts_medias_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+posts_medias_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(posts_medias_df)

View file

@@ -1,18 +1,28 @@
-import datetime
 import pandas as pd
 import json
+import os
+from pathlib import Path
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-fb_data_path = ['data/Facebook/comments_and_reactions/comments.json']
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+fb_data_path = [os.path.join(project_root, 'import_data', 'data', 'Facebook', 'comments_and_reactions', 'comments.json')]
 with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
     comments_json = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 facebook_comments = []
 for comment in comments_json['comments_v2']:
     if comment.get('data'):
@@ -26,18 +36,8 @@ for comment in comments_json['comments_v2']:
                               "texte": comment["comment"],
                               "creation_timestamp": comment["timestamp"]})
 
-# In[ ]:
+#%% In[ ]:
 facebook_comments_df = pd.DataFrame(facebook_comments)
 
-# In[ ]:
-facebook_comments_df['datepublication'] = facebook_comments_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
-
-# In[ ]:
-facebook_comments_df.fillna(value="", inplace=True)
-
-# In[ ]:
-del facebook_comments_df['creation_timestamp']
-
-# In[ ]:
+#%% In[ ]:
 documents_to_database(facebook_comments_df)

View file

@@ -1,44 +1,51 @@
-import datetime
 import pandas as pd
 import json
+import os
+from pathlib import Path
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-fb_data_path = ['data/Facebook/posts/your_uncategorized_photos.json']
-with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+fb_data_path = os.path.join(project_root, 'import_data', 'data', 'Facebook', 'posts', 'your_uncategorized_photos.json')
+with open(fb_data_path, "r", encoding="raw-unicode-escape") as posts:
     photos_json = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 facebook_photos = photos_json['other_photos_v2']
 
-# In[ ]:
+#%% In[ ]:
 facebook_photos_df = pd.DataFrame(facebook_photos)
 
-# In[ ]:
+#%% In[ ]:
 # Filter out posts without a description
 facebook_photos_df = facebook_photos_df[~facebook_photos_df['description'].isnull()]
 
-# In[ ]:
-facebook_photos_df['datepublication'] = facebook_photos_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
+#%% In[ ]:
 facebook_photos_df['index'] = "rs_facebook_posts"
 facebook_photos_df['network'] = "Facebook"
 facebook_photos_df['type'] = "posts"
-facebook_photos_df['chemin'] = fb_data_path[0]
+facebook_photos_df['chemin'] = fb_data_path
 
-# In[ ]:
+#%% In[ ]:
 facebook_photos_df.rename(columns={"description": "texte"}, inplace=True)
 
-# In[ ]:
-del facebook_photos_df['creation_timestamp']
+#%% In[ ]:
 del facebook_photos_df['media_metadata']
 
-# In[ ]:
+#%% In[ ]:
 facebook_photos_df.fillna(value="", inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(facebook_photos_df)

View file

@@ -1,17 +1,28 @@
-import datetime
 import pandas as pd
 import json
+import os
+from pathlib import Path
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/content/posts_1.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'content', 'posts_1.json')
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     posts_json = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 posts_medias = []
 for post in posts_json:
     medias = post['media']
@@ -45,25 +56,18 @@ for post in posts_json:
                              "texte": title,
                              "creation_timestamp": creation_timestamp})
 
-# In[ ]:
+#%% In[ ]:
 posts_medias_df = pd.DataFrame(posts_medias)
 
-# In[ ]:
-posts_medias_df['datepublication'] = posts_medias_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
-
-# In[ ]:
-del posts_medias_df['creation_timestamp']
-
-# In[ ]:
+#%% In[ ]:
 posts_medias_df.fillna(value="", inplace=True)
 
-# In[ ]:
-posts_medias_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+posts_medias_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 posts_medias_df = posts_medias_df[~posts_medias_df['texte'].str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(posts_medias_df)

View file

@@ -1,49 +1,55 @@
-import datetime
 import pandas as pd
 import json
+import os
+from pathlib import Path
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/content/reels.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'content', 'reels.json')
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     reels_json = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 ig_reels_media = [x['media'][0] for x in reels_json['ig_reels_media']]
 
-# In[ ]:
+#%% In[ ]:
 ig_reels_df = pd.DataFrame(ig_reels_media)
 
-# In[ ]:
-ig_reels_df['datepublication'] = ig_reels_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
+#%% In[ ]:
 ig_reels_df['index'] = "rs_instagram_content"
 ig_reels_df['type'] = "reels"
 ig_reels_df['network'] = "Instagram"
 ig_reels_df['chemin'] = instagram_data_path
 
-# In[ ]:
+#%% In[ ]:
 ig_reels_df.rename(columns={"title": "texte"}, inplace=True)
 
-# In[ ]:
-del ig_reels_df['creation_timestamp']
+#%% In[ ]:
 del ig_reels_df['media_metadata']
 del ig_reels_df['cross_post_source']
 del ig_reels_df['dubbing_info']
 
-# In[ ]:
+#%% In[ ]:
 ig_reels_df.fillna(value="", inplace=True)
 
-# In[ ]:
-ig_reels_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+ig_reels_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 ig_reels_df = ig_reels_df[~ig_reels_df['texte'].str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(ig_reels_df)

View file

@@ -1,49 +1,52 @@
-import datetime
 import pandas as pd
 import json
+import os
+from pathlib import Path
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/content/stories.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'content', 'stories.json')
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     stories_json = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 ig_stories_df = pd.DataFrame(stories_json['ig_stories'])
 
-# In[ ]:
-ig_stories_df['datepublication'] = ig_stories_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
-
-# In[ ]:
+#%% In[ ]:
 ig_stories_df['index'] = "rs_instagram_content"
 ig_stories_df['type'] = "stories"
 ig_stories_df['network'] = "Instagram"
 ig_stories_df['chemin'] = instagram_data_path
 
-# In[ ]:
+#%% In[ ]:
 ig_stories_df.rename(columns={"title": "texte"}, inplace=True)
 
-# In[ ]:
-del ig_stories_df['creation_timestamp']
+#%% In[ ]:
 del ig_stories_df['media_metadata']
 del ig_stories_df['cross_post_source']
 del ig_stories_df['ai_stickers']
 del ig_stories_df['dubbing_info']
 
-# In[ ]:
+#%% In[ ]:
 ig_stories_df.fillna(value="", inplace=True)
 
-# In[ ]:
-ig_stories_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
 ig_stories_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 ig_stories_df = ig_stories_df[~ig_stories_df['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(ig_stories_df)

View file

@@ -1,39 +1,48 @@
-import datetime
 import pandas as pd
 import json
+import os
+from pathlib import Path
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/comments/post_comments_1.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'comments', 'post_comments_1.json')
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     post_comments_1 = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 ig_comments = []
 for comment in post_comments_1:
     ig_comments.append({"texte": comment['string_map_data']['Comment']['value'],
-                        'datepublication': datetime.datetime.fromtimestamp(
-                            timestamp=comment['string_map_data']['Time']['timestamp']).isoformat(),
+                        'creation_timestamp': comment['string_map_data']['Time']['timestamp'],
                         "chemin": instagram_data_path,
                         "index": "rs_instagram_comments",
                         "type": "comments",
                         "network": "Instagram"})
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df = pd.DataFrame(ig_comments)
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df.fillna(value="", inplace=True)
 
-# In[ ]:
-ig_comments_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+ig_comments_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 ig_comments_df = ig_comments_df[~ig_comments_df['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(ig_comments_df)

View file

@@ -1,40 +1,48 @@
-import datetime
 import pandas as pd
 import json
+import os
+from pathlib import Path
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/comments/reels_comments.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'comments', 'reels_comments.json')
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     reels_comments = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 ig_comments = []
 for comment in reels_comments['comments_reels_comments']:
     ig_comments.append({"texte": comment['string_map_data']['Comment']['value'],
-                        'datepublication': datetime.datetime.fromtimestamp(
-                            timestamp=comment['string_map_data']['Time']['timestamp']).isoformat(),
+                        'creation_timestamp': comment['string_map_data']['Time']['timestamp'],
                         "chemin": instagram_data_path,
                         "index": "rs_instagram_comments",
                         "type": "comments",
                         "network": "Instagram"})
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df = pd.DataFrame(ig_comments)
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df.fillna(value="", inplace=True)
 
-# In[ ]:
-ig_comments_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+ig_comments_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 ig_comments_df = ig_comments_df[~ig_comments_df['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(ig_comments_df)

View file

@@ -1,40 +1,49 @@
-import datetime
 import pandas as pd
 import json
+import os
+from pathlib import Path
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/threads/threads_and_replies.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'threads', 'threads_and_replies.json')
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     post_comments_1 = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 threads_comments = []
 for post in post_comments_1['text_post_app_text_posts']:
     for element in post['media']:
         threads_comments.append({"texte": element['title'],
-                                 'datepublication': datetime.datetime.fromtimestamp(
-                                     timestamp=element['creation_timestamp']).isoformat(),
+                                 'creation_timestamp': element['creation_timestamp'],
                                  "chemin": instagram_data_path,
                                  "index": "rs_instagram_threads",
                                  "type": "posts",
                                  "network": "Instagram"})
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df = pd.DataFrame(threads_comments)
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df.fillna(value="", inplace=True)
 
-# In[ ]:
-ig_comments_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+ig_comments_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 ig_comments_df = ig_comments_df[~ig_comments_df['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(ig_comments_df)

View file

@@ -1,44 +1,58 @@
 import pandas as pd
 import datetime
+import os
+from pathlib import Path
 from utils.documents_to_database import documents_to_database
 
-# In[ ]:
-linkedin_data_path = "data/LinkedIn/shares/Shares.csv"
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+linkedin_data_path = os.path.join(project_root, 'import_data', 'data', 'LinkedIn', 'shares', 'Shares.csv')
 raw_shares = pd.read_csv(linkedin_data_path)
 
-# In[ ]:
+#%% In[ ]:
 raw_shares['index'] = "rs_linkedin_shares"
 raw_shares['type'] = "posts"
 raw_shares['network'] = "LinkedIn"
 raw_shares['chemin'] = linkedin_data_path
 
-# In[ ]:
-raw_shares["datepublication"] = raw_shares["Date"].apply(
-    lambda x: str(datetime.datetime.fromisoformat(x).isoformat()))
+#%% In[ ]:
+raw_shares["creation_timestamp"] = raw_shares["Date"].apply(
+    lambda x: int(datetime.datetime.fromisoformat(x).timestamp())
+)
 del raw_shares["Date"]
 
-# In[ ]:
+#%% In[ ]:
 raw_shares.rename(columns={"ShareLink": "uri", "ShareCommentary": "texte"}, inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 raw_shares["texte"] = raw_shares["texte"].apply(lambda x: str(x))
 
-# In[ ]:
+#%% In[ ]:
 del raw_shares["SharedUrl"]
 del raw_shares["MediaUrl"]
 del raw_shares["Visibility"]
 
-# In[ ]:
+#%% In[ ]:
 raw_shares.fillna(value="", inplace=True)
 
-# In[ ]:
-raw_shares.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+raw_shares.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 raw_shares = raw_shares[~raw_shares['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(raw_shares)

View file

@@ -1,19 +0,0 @@
-linkedin_data_path <- "import_data/data/LinkedIn/comments/Comments.csv"
-library("readr")
-library("magrittr")
-library("dplyr")
-
-# Read CSV file
-ddd <- readr::read_delim(linkedin_data_path,
-                         escape_backslash = TRUE,
-                         trim_ws = TRUE,
-                         skip_empty_rows = FALSE,
-                         delim = ",")
-
-# Remove carriage returns
-ddd %>%
-  mutate(MessageFix = Message %>% stringr::str_replace_all(pattern = "[\r\n\t]+", replacement = " ")) %>%
-  select(-Message) -> ddd2
-
-# Save the cleaned data to a new CSV file
-ddd2 %>% write.csv("import_data/data/LinkedIn/comments/Comments-FIX.csv", row.names = FALSE)

View file

@@ -1,41 +1,58 @@
 import pandas as pd
 import datetime
+import os
+from pathlib import Path
 from utils.documents_to_database import documents_to_database
 
-# In[ ]:
-linkedin_data_path = "data/LinkedIn/comments/Comments-FIX.csv"
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+linkedin_data_path = os.path.join(project_root, 'import_data', 'data', 'LinkedIn', 'comments', 'Comments.csv')
 
-# In[ ]:
-raw_comments_csv = pd.read_csv(linkedin_data_path, encoding='utf-8')
+#%% In[ ]:
+raw_comments_csv = pd.read_csv(linkedin_data_path,
+                               escapechar='\\',
+                               skipinitialspace=True)
+raw_comments_csv['MessageFix'] = raw_comments_csv['Message'].str.replace(r'[\r\n\t]+', ' ', regex=True)
+raw_comments_csv = raw_comments_csv.drop(columns=['Message'])
 raw_comments = raw_comments_csv[(raw_comments_csv['MessageFix'] != "")].drop_duplicates()
 
-# In[ ]:
+#%% In[ ]:
 raw_comments['index'] = "rs_linkedin_comments"
 raw_comments['type'] = "comments"
 raw_comments['network'] = "LinkedIn"
 raw_comments['chemin'] = linkedin_data_path
 
-# In[ ]:
-raw_comments["datepublication"] = raw_comments["Date"].apply(
-    lambda x: str(datetime.datetime.fromisoformat(str(x)).isoformat()))
+#%% In[ ]:
+raw_comments["creation_timestamp"] = raw_comments["Date"].apply(
+    lambda x: int(datetime.datetime.fromisoformat(x).timestamp())
+)
 del raw_comments["Date"]
 
-# In[ ]:
+#%% In[ ]:
 raw_comments.rename(columns={"Link": "uri", "MessageFix": "texte"}, inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 raw_comments["chemin"] = linkedin_data_path
 
-# In[ ]:
+#%% In[ ]:
 raw_comments.fillna(value="", inplace=True)
 
-# In[ ]:
-raw_comments.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+raw_comments.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 raw_comments = raw_comments[~raw_comments['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(raw_comments)

View file

@@ -5,29 +5,42 @@ import xmltodict
 import pandas as pd
 import markdownify
+import os
+from pathlib import Path
 from utils.documents_to_database import documents_to_database
 
-# In[ ]:
-wordpress_xml_path = "data/Wordpress/jevalideca/wordpress.xml"
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+wordpress_xml_path = os.path.join(project_root, 'import_data', 'data', 'Wordpress', 'jevalideca', 'wordpress.xml')
 with open(wordpress_xml_path, "r") as xml_file:
     wordpress_xml = xml_file.read()
 
-# In[ ]:
+#%% In[ ]:
 wordpress_dict = xmltodict.parse(wordpress_xml)
 
-# In[ ]:
+#%% In[ ]:
 items_df = pd.DataFrame(wordpress_dict['rss']['channel']['item'])
 
-# In[ ]:
+#%% In[ ]:
 items_df_filter = items_df[
     (items_df['wp:post_type'].isin(['page', 'post'])) & (items_df['wp:status'] == 'publish')].copy()
 
-# In[ ]:
-items_df_filter['datepublication'] = items_df_filter['wp:post_date'].apply(
-    lambda x: str(datetime.datetime.fromisoformat(x).isoformat()))
+#%% In[ ]:
+items_df_filter['creation_timestamp'] = items_df_filter['wp:post_date'].apply(
+    lambda x: int(datetime.datetime.fromisoformat(x).timestamp()))
 
-# In[ ]:
+#%% In[ ]:
 def wp_to_markdown(x):
     try:
         md_text = re.sub(r'\n+', ' ', markdownify.markdownify(x, heading_style='ATX')).strip()
@@ -38,25 +51,25 @@ def wp_to_markdown(x):
     return md_text
 
-# In[ ]:
+#%% In[ ]:
 items_df_filter['texte'] = items_df_filter['content:encoded'].apply(lambda x: wp_to_markdown(x))
 
-# In[ ]:
+#%% In[ ]:
 items_df_filter.rename(columns={"link": "uri", "wp:post_type": "type"}, inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 items_df_filter['index'] = "rs_wordpress_jevalideca"
 items_df_filter['network'] = "Wordpress"
 items_df_filter['chemin'] = wordpress_xml_path
 
-# In[ ]:
+#%% In[ ]:
 items_df_filter.fillna(value="", inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(items_df_filter[['title',
                                        'uri',
                                        'type',
-                                       'datepublication',
+                                       'creation_timestamp',
                                        'texte',
                                        'index',
                                        'network',

View file

@@ -4,3 +4,4 @@ requests==2.31.0
 xmltodict==0.13.0
 python_dotenv==1.0.1
 pyarrow==17.0.0
+typesense==0.21.0

View file

@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Navigate to the directory containing the scripts
+cd "$(dirname "$0")" || exit
+
+# Find and execute all Python scripts matching the pattern
+for script in [0-9][0-9]_importation_*.py
+do
+    if [ -f "$script" ]; then
+        echo "Running $script..."
+        python3 "$script"
+        echo "Finished $script"
+        echo "--------------------"
+    fi
+done
+
+echo "All importation scripts have been executed."

View file

@@ -1,20 +1,11 @@
-import pandas as pd
-import requests
+import tqdm
 
-from utils.opensearch import opensearch_client
+from .typesense_client import client
 
 
-def documents_to_database(documents_list, os_client=opensearch_client):
-    # Check if opensearch is available
-    if not os_client.ping():
-        raise requests.exceptions.ConnectionError("Opensearch is not reachable")
-    # Check if the specified index exists
-    if not os_client.indices.exists(index=documents_list['index'].iloc[0]):
-        raise requests.exceptions.HTTPError(f"Index '{documents_list['index'].iloc[0]}' does not exist")
-    # Insert each document into opensearch index(es)
-    for document in documents_list.to_dict(orient='records'):
-        index_name = document.pop('index', None)
-        if not index_name:
-            raise ValueError("Document must have an 'index' field")
-        os_client.index(index=index_name,
-                        body=document)
+def documents_to_database(documents_list, os_client=client):
+    try:
+        for document in tqdm.tqdm(documents_list.to_dict(orient='records')):
+            os_client.collections['social_media_posts'].documents.create(document)
+        print(f"Successfully inserted {len(documents_list)} documents.")
+    except Exception as e:
+        print(f"Error inserting documents: {str(e)}")
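Not part of this commit: creating documents one at a time costs one HTTP round trip per record. typesense-python also exposes the bulk import endpoint via documents.import_; a hedged sketch of a batched variant (the function name and batch size are illustrative):

# Not part of this commit: a batched variant using Typesense's bulk import
# endpoint, which cuts per-document HTTP overhead. Sketch only.
from utils.typesense_client import client

def documents_to_database_bulk(documents_list, ts_client=client, batch_size=100):
    records = documents_list.to_dict(orient='records')
    for start in range(0, len(records), batch_size):
        batch = records[start:start + batch_size]
        results = ts_client.collections['social_media_posts'].documents.import_(
            batch, {'action': 'create'})
        # import_ returns one status dict per document; surface failures
        for res in results:
            if not res.get('success'):
                print(f"Failed to import: {res.get('error')}")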

View file

@@ -1,22 +0,0 @@
-import os
-import dotenv
-
-# Load environment variables from .env file
-dotenv.load_dotenv()
-
-# Connect to OpenSearch using the provided credentials and hostname/port.
-from opensearchpy import OpenSearch
-
-host = 'localhost'
-port = 9200
-auth = ('admin', os.getenv("OPENSEARCH_INITIAL_ADMIN_PASSWORD"))  # For testing only. Don't store credentials in code.
-
-# Create the client with SSL/TLS enabled, but hostname verification disabled.
-opensearch_client = OpenSearch(
-    hosts=[{'host': host, 'port': port}],
-    http_compress=True,  # enables gzip compression for request bodies
-    http_auth=auth,
-    use_ssl=True,
-    verify_certs=False,
-    ssl_assert_hostname=False,
-    ssl_show_warn=False
-)

View file

@@ -11,4 +11,7 @@ reseau_social_data = [{"nom": "LinkedIn",
                       {"nom": "Facebook",
                        "repertoires": ["comments_and_reactions", "posts"]},
                       {"nom": "FacebookBusiness",
-                       "repertoires": ["posts"]}]
+                       "repertoires": ["posts"]},
+                      {"nom": "Podcast",
+                       "repertoires": ["shownotes", "audio"]}
+                      ]

View file

@@ -0,0 +1,15 @@
+import typesense
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+client = typesense.Client({
+    'nodes': [{
+        'host': 'localhost',
+        'port': '8108',
+        'protocol': 'http'
+    }],
+    'api_key': os.getenv('TYPESENSE_API_KEY'),
+    'connection_timeout_seconds': 2
+})
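Not part of this commit: this client is shared by all the import scripts above, so a quick sanity check that the API key and node settings match the docker-compose service can save a failed run; a minimal sketch:

# Not part of this commit: sanity check for the shared client.
from utils.typesense_client import client

# Lists existing collections; raises typesense.exceptions.RequestUnauthorized
# if TYPESENSE_API_KEY does not match the server's key.
for collection in client.collections.retrieve():
    print(collection['name'], collection['num_documents'])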