Import documents into Typesense
parent f4acc32451 · commit 7a74dbf413
24 changed files with 390 additions and 332 deletions
@@ -1 +1 @@
-OPENSEARCH_INITIAL_ADMIN_PASSWORD=
+TYPESENSE_API_KEY=
@@ -71,9 +71,7 @@ You can get a backup of your social media data. I've put
![ou-mettre-fichiers-reseaux.png](images/ou-mettre-fichiers-reseaux.png)

- Run the file that creates the indexes in the search engine: [00_creer_reseauxsociaux.py](import_data/00_creer_reseauxsociaux.py)
- Then run all the relevant Python files to load the data you downloaded.
- For LinkedIn comments, the R program [32_importation_linkedin_comments.R](import_data/32_importation_linkedin_comments.R) must be run before the Python program [32_importation_linkedin_comments.py](import_data/32_importation_linkedin_comments.py)

You should then be able to see all the data you loaded in OpenSearch Dashboards at http://localhost:5601. Go to Discover / Search and find insights.
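Once the import scripts have run, a quick way to confirm that documents actually landed in Typesense is to read the collection stats. A minimal sketch, assuming the compose defaults from this commit (localhost:8108, key in .env) and the collection created by 00_creer_reseauxsociaux.py:

```python
from utils.typesense_client import client

# Retrieve the collection schema; 'num_documents' counts the indexed posts
stats = client.collections['social_media_posts'].retrieve()
print(stats['num_documents'])
```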
@@ -1,68 +1,21 @@
 ---
 version: '3'
 services:
-  opensearch-node1:
-    image: opensearchproject/opensearch:latest
-    container_name: opensearch-node1
+  typesense:
+    image: typesense/typesense:27.1
+    container_name: typesense
     environment:
-      - cluster.name=opensearch-cluster
-      - node.name=opensearch-node1
-      - discovery.seed_hosts=opensearch-node1,opensearch-node2
-      - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2
-      - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping
-      - OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m # minimum and maximum Java heap size, recommend setting both to 50% of system RAM
-      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD} # Sets the demo admin user password when using demo configuration, required for OpenSearch 2.12 and higher
-    ulimits:
-      memlock:
-        soft: -1
-        hard: -1
-      nofile:
-        soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems
-        hard: 65536
+      - TYPESENSE_API_KEY=${TYPESENSE_API_KEY}
+      - TYPESENSE_DATA_DIR=/data
     volumes:
-      - opensearch-data1:/usr/share/opensearch/data
+      - typesense-data:/data
     ports:
-      - 9200:9200
-      - 9600:9600 # required for Performance Analyzer
+      - "8108:8108"
     networks:
-      - opensearch-net
-  opensearch-node2:
-    image: opensearchproject/opensearch:latest
-    container_name: opensearch-node2
-    environment:
-      - cluster.name=opensearch-cluster
-      - node.name=opensearch-node2
-      - discovery.seed_hosts=opensearch-node1,opensearch-node2
-      - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2
-      - bootstrap.memory_lock=true
-      - OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m
-      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD}
-    ulimits:
-      memlock:
-        soft: -1
-        hard: -1
-      nofile:
-        soft: 65536
-        hard: 65536
-    volumes:
-      - opensearch-data2:/usr/share/opensearch/data
-    networks:
-      - opensearch-net
-  opensearch-dashboards:
-    image: opensearchproject/opensearch-dashboards:latest
-    container_name: opensearch-dashboards
-    ports:
-      - 5601:5601
-    expose:
-      - '5601'
-    environment:
-      OPENSEARCH_HOSTS: '["https://opensearch-node1:9200","https://opensearch-node2:9200"]'
-    networks:
-      - opensearch-net
+      - typesense-net

 volumes:
-  opensearch-data1:
-  opensearch-data2:
+  typesense-data:

 networks:
-  opensearch-net:
+  typesense-net:
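Before running the import scripts, it is worth checking that the Typesense container answers on the mapped port. A small probe (the /health endpoint requires no API key):

```python
import requests

# Liveness probe against the container started by docker compose;
# prints {'ok': True} once the node is ready
print(requests.get("http://localhost:8108/health", timeout=2).json())
```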
@@ -1,18 +1,40 @@
import requests
import utils.config
from utils.opensearch import opensearch_client
from utils.reseau_social_data import reseau_social_data as rs_data
from typesense.exceptions import TypesenseClientError, ObjectAlreadyExists

# %%
rs_data
from utils.typesense_client import client

# Create a collection
try:
    client.collections.create({
        'name': 'social_media_posts',
        'fields': [
            {'name': 'id', 'type': 'string'},
            {'name': 'network', 'type': 'string', 'facet': True},
            {'name': 'type', 'type': 'string', 'facet': True},
            {'name': 'index', 'type': 'string', 'facet': True},
            {'name': 'chemin', 'type': 'string'},
            {'name': 'texte', 'type': 'string'},
            {'name': 'creation_timestamp', 'type': 'int64'},
            {
                "name": "embedding",
                "type": "float[]",
                "embed": {
                    "from": [
                        "texte"
                    ],
                    "model_config": {
                        "model_name": "ts/multilingual-e5-small"
                    }
                }
            }
        ],
        'default_sorting_field': 'creation_timestamp'
    })
    print("Collection 'social_media_posts' created successfully.")
except TypesenseClientError as e:
    # ObjectAlreadyExists is a TypesenseClientError subclass; check the instance type
    # (comparing the instance to the class with == is always False)
    if isinstance(e, ObjectAlreadyExists):
        print("Collection 'social_media_posts' already exists. Skipping creation.")
    else:
        print(f"Error creating collection: {str(e)}")
        raise

# %%
opensearch_client.info()

# %%
for rs in rs_data:
    nom = rs.get("nom")
    for repertoire in rs.get("repertoires", []):
        index_name = f"rs_{nom}_{repertoire}".lower()
        opensearch_client.indices.create(index=index_name)
        print(f"Index '{index_name}' créé")
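The embedding field in the schema above asks Typesense to compute ts/multilingual-e5-small vectors from texte at indexing time, which enables semantic search alongside keyword search. A sketch of a hybrid query against the new collection (the query string is arbitrary):

```python
from utils.typesense_client import client

# Listing `embedding` in query_by alongside `texte` makes Typesense blend
# keyword matches with nearest-neighbour matches over the auto-generated vectors
results = client.collections['social_media_posts'].documents.search({
    'q': 'science des données',
    'query_by': 'texte,embedding',
    'per_page': 5,
})
for hit in results['hits']:
    print(hit['document']['network'], hit['document']['texte'][:80])
```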
import_data/00_delete_collection.py (new file, 3 lines)
@@ -0,0 +1,3 @@
# Use only when needed, to delete the 'social_media_posts' collection in Typesense
from utils.typesense_client import client
client.collections['social_media_posts'].delete()
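The bare delete call raises if the collection does not exist. A guarded variant, a sketch assuming typesense-python's ObjectNotFound exception, keeps the cleanup script rerunnable:

```python
from typesense.exceptions import ObjectNotFound
from utils.typesense_client import client

try:
    client.collections['social_media_posts'].delete()
    print("Collection deleted.")
except ObjectNotFound:
    print("Collection does not exist; nothing to delete.")
```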
@@ -1,21 +1,34 @@
import datetime

import pandas as pd
import json
import os
from pathlib import Path

from utils.get_ids import get_idtypedocument, get_idreseausocial
from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta

# In[ ]:
fb_data_path = ['data/FacebookBusiness/posts/profile_posts_1.json',
                'data/FacebookBusiness/posts/uncategorized_photos.json',
                'data/FacebookBusiness/posts/videos.json']
#%% In[ ]:

#%% In[ ]:

# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
fb_data_path = [os.path.join(project_root, 'import_data', 'data', 'FacebookBusiness', 'posts', 'profile_posts_1.json')]

try:
    with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
        posts_json = json.loads(convert_encoding_meta(posts.read()))
except Exception as e:
    print(f"Error reading JSON file: {e}")
    exit(1)

# In[ ]:
#%% In[ ]:
posts_medias = []
for post in posts_json:
    # data

@@ -39,21 +52,14 @@ for post in posts_json:
                             "texte": texte,
                             "creation_timestamp": media["creation_timestamp"]})

# In[ ]:
#%% In[ ]:
posts_medias_df = pd.DataFrame(posts_medias)

# In[ ]:
posts_medias_df['datepublication'] = posts_medias_df['creation_timestamp'].apply(
    lambda x: datetime.datetime.fromtimestamp(x).isoformat())

# In[ ]:
del posts_medias_df['creation_timestamp']

# In[ ]:
#%% In[ ]:
posts_medias_df.fillna(value="", inplace=True)

# In[ ]:
posts_medias_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
#%% In[ ]:
posts_medias_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)

# In[ ]:
#%% In[ ]:
documents_to_database(posts_medias_df)
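These scripts read Meta's JSON exports with encoding="raw-unicode-escape" and pass the text through convert_encoding_meta because the exports escape each UTF-8 byte as its own \u00xx sequence, which garbles accented characters. An illustration of the underlying problem and the usual re-decoding trick (convert_encoding_meta is the repo's own utility; this standalone round-trip is only an assumption about the idea behind it):

```python
import json

# Meta exports encode 'é' (UTF-8 bytes C3 A9) as two separate escapes: \u00c3\u00a9
raw = '{"texte": "d\\u00c3\\u00a9j\\u00c3\\u00a0 vu"}'
mojibake = json.loads(raw)["texte"]                 # 'dÃ©jÃ\xa0 vu'
fixed = mojibake.encode("latin-1").decode("utf-8")  # 'déjà vu'
print(fixed)
```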
@@ -1,18 +1,28 @@
import datetime

import pandas as pd
import json
import os
from pathlib import Path

from utils.get_ids import get_idtypedocument, get_idreseausocial
from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta

# In[ ]:
fb_data_path = ['data/Facebook/comments_and_reactions/comments.json']
#%% In[ ]:

# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
fb_data_path = [os.path.join(project_root, 'import_data', 'data', 'Facebook', 'comments_and_reactions', 'comments.json')]

with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
    comments_json = json.loads(convert_encoding_meta(posts.read()))

# In[ ]:
#%% In[ ]:
facebook_comments = []
for comment in comments_json['comments_v2']:
    if comment.get('data'):

@@ -26,18 +36,8 @@ for comment in comments_json['comments_v2']:
        "texte": comment["comment"],
        "creation_timestamp": comment["timestamp"]})

# In[ ]:
#%% In[ ]:
facebook_comments_df = pd.DataFrame(facebook_comments)

# In[ ]:
facebook_comments_df['datepublication'] = facebook_comments_df['creation_timestamp'].apply(
    lambda x: datetime.datetime.fromtimestamp(x).isoformat())

# In[ ]:
facebook_comments_df.fillna(value="", inplace=True)

# In[ ]:
del facebook_comments_df['creation_timestamp']

# In[ ]:
#%% In[ ]:
documents_to_database(facebook_comments_df)
@@ -1,44 +1,51 @@
import datetime

import pandas as pd
import json
import os
from pathlib import Path

from utils.get_ids import get_idtypedocument, get_idreseausocial
from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta

# In[ ]:
fb_data_path = ['data/Facebook/posts/your_uncategorized_photos.json']
with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
#%% In[ ]:

# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
fb_data_path = os.path.join(project_root, 'import_data', 'data', 'Facebook', 'posts', 'your_uncategorized_photos.json')

with open(fb_data_path, "r", encoding="raw-unicode-escape") as posts:
    photos_json = json.loads(convert_encoding_meta(posts.read()))

# In[ ]:
#%% In[ ]:
facebook_photos = photos_json['other_photos_v2']

# In[ ]:
#%% In[ ]:
facebook_photos_df = pd.DataFrame(facebook_photos)

# In[ ]:
#%% In[ ]:
# Filter out posts without a description
facebook_photos_df = facebook_photos_df[~facebook_photos_df['description'].isnull()]

# In[ ]:
facebook_photos_df['datepublication'] = facebook_photos_df['creation_timestamp'].apply(
    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
#%% In[ ]:
facebook_photos_df['index'] = "rs_facebook_posts"
facebook_photos_df['network'] = "Facebook"
facebook_photos_df['type'] = "posts"
facebook_photos_df['chemin'] = fb_data_path[0]
facebook_photos_df['chemin'] = fb_data_path

# In[ ]:
#%% In[ ]:
facebook_photos_df.rename(columns={"description": "texte"}, inplace=True)

# In[ ]:
del facebook_photos_df['creation_timestamp']
#%% In[ ]:
del facebook_photos_df['media_metadata']

# In[ ]:
#%% In[ ]:
facebook_photos_df.fillna(value="", inplace=True)

# In[ ]:
#%% In[ ]:
documents_to_database(facebook_photos_df)
@@ -1,17 +1,28 @@
import datetime

import pandas as pd
import json
import os
from pathlib import Path

from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta

# In[ ]:
instagram_data_path = 'data/Instagram/content/posts_1.json'
#%% In[ ]:
# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'content', 'posts_1.json')


with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
    posts_json = json.loads(convert_encoding_meta(posts.read()))

# In[ ]:
#%% In[ ]:
posts_medias = []
for post in posts_json:
    medias = post['media']

@@ -45,25 +56,18 @@ for post in posts_json:
        "texte": title,
        "creation_timestamp": creation_timestamp})

# In[ ]:
#%% In[ ]:
posts_medias_df = pd.DataFrame(posts_medias)

# In[ ]:
posts_medias_df['datepublication'] = posts_medias_df['creation_timestamp'].apply(
    lambda x: datetime.datetime.fromtimestamp(x).isoformat())

# In[ ]:
del posts_medias_df['creation_timestamp']

# In[ ]:
#%% In[ ]:
posts_medias_df.fillna(value="", inplace=True)

# In[ ]:
posts_medias_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
#%% In[ ]:
posts_medias_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)

# In[ ]:
#%% In[ ]:
# Filter empty texte
posts_medias_df = posts_medias_df[~posts_medias_df['texte'].str.strip().eq('')]

# In[ ]:
#%% In[ ]:
documents_to_database(posts_medias_df)
@@ -1,49 +1,55 @@
import datetime

import pandas as pd
import json
import os
from pathlib import Path

from utils.get_ids import get_idtypedocument, get_idreseausocial
from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta

# In[ ]:
instagram_data_path = 'data/Instagram/content/reels.json'
#%% In[ ]:
# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'content', 'reels.json')

with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
    reels_json = json.loads(convert_encoding_meta(posts.read()))

# In[ ]:
#%% In[ ]:
ig_reels_media = [x['media'][0] for x in reels_json['ig_reels_media']]

# In[ ]:
#%% In[ ]:
ig_reels_df = pd.DataFrame(ig_reels_media)

# In[ ]:
ig_reels_df['datepublication'] = ig_reels_df['creation_timestamp'].apply(
    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
#%% In[ ]:
ig_reels_df['index'] = "rs_instagram_content"
ig_reels_df['type'] = "reels"
ig_reels_df['network'] = "Instagram"
ig_reels_df['chemin'] = instagram_data_path

# In[ ]:
#%% In[ ]:
ig_reels_df.rename(columns={"title": "texte"}, inplace=True)

# In[ ]:
del ig_reels_df['creation_timestamp']
#%% In[ ]:
del ig_reels_df['media_metadata']
del ig_reels_df['cross_post_source']
del ig_reels_df['dubbing_info']

# In[ ]:
#%% In[ ]:
ig_reels_df.fillna(value="", inplace=True)

# In[ ]:
ig_reels_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
#%% In[ ]:
ig_reels_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)

# In[ ]:
#%% In[ ]:
# Filter empty texte
ig_reels_df = ig_reels_df[~ig_reels_df['texte'].str.strip().eq('')]

# In[ ]:
#%% In[ ]:
documents_to_database(ig_reels_df)
@@ -1,49 +1,52 @@
import datetime

import pandas as pd
import json
import os
from pathlib import Path

from utils.get_ids import get_idtypedocument, get_idreseausocial
from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta

# In[ ]:
instagram_data_path = 'data/Instagram/content/stories.json'
#%% In[ ]:
# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'content', 'stories.json')
with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
    stories_json = json.loads(convert_encoding_meta(posts.read()))

# In[ ]:
#%% In[ ]:
ig_stories_df = pd.DataFrame(stories_json['ig_stories'])

# In[ ]:
ig_stories_df['datepublication'] = ig_stories_df['creation_timestamp'].apply(
    lambda x: datetime.datetime.fromtimestamp(x).isoformat())

# In[ ]:
#%% In[ ]:
ig_stories_df['index'] = "rs_instagram_content"
ig_stories_df['type'] = "stories"
ig_stories_df['network'] = "Instagram"
ig_stories_df['chemin'] = instagram_data_path

# In[ ]:
#%% In[ ]:
ig_stories_df.rename(columns={"title": "texte"}, inplace=True)

# In[ ]:
del ig_stories_df['creation_timestamp']
#%% In[ ]:
del ig_stories_df['media_metadata']
del ig_stories_df['cross_post_source']
del ig_stories_df['ai_stickers']
del ig_stories_df['dubbing_info']

# In[ ]:
#%% In[ ]:
ig_stories_df.fillna(value="", inplace=True)

# In[ ]:
ig_stories_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
#%% In[ ]:
ig_stories_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)

# In[ ]:
#%% In[ ]:
# Filter empty texte
ig_stories_df = ig_stories_df[~ig_stories_df['texte'].str.strip('\n').str.strip().eq('')]

# In[ ]:
#%% In[ ]:
documents_to_database(ig_stories_df)
@@ -1,39 +1,48 @@
import datetime

import pandas as pd
import json
import os
from pathlib import Path

from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta

# In[ ]:
instagram_data_path = 'data/Instagram/comments/post_comments_1.json'
#%% In[ ]:
# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'comments', 'post_comments_1.json')

with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
    post_comments_1 = json.loads(convert_encoding_meta(posts.read()))

# In[ ]:
#%% In[ ]:
ig_comments = []
for comment in post_comments_1:
    ig_comments.append({"texte": comment['string_map_data']['Comment']['value'],
                        'datepublication': datetime.datetime.fromtimestamp(
                            timestamp=comment['string_map_data']['Time']['timestamp']).isoformat(),
                        'creation_timestamp': comment['string_map_data']['Time']['timestamp'],
                        "chemin": instagram_data_path,
                        "index": "rs_instagram_comments",
                        "type": "comments",
                        "network": "Instagram"})

# In[ ]:
#%% In[ ]:
ig_comments_df = pd.DataFrame(ig_comments)

# In[ ]:
#%% In[ ]:
ig_comments_df.fillna(value="", inplace=True)

# In[ ]:
ig_comments_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
#%% In[ ]:
ig_comments_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)

# In[ ]:
#%% In[ ]:
# Filter empty texte
ig_comments_df = ig_comments_df[~ig_comments_df['texte'].str.strip('\n').str.strip().eq('')]

# In[ ]:
#%% In[ ]:
documents_to_database(ig_comments_df)
@@ -1,40 +1,48 @@
import datetime

import pandas as pd
import json
import os
from pathlib import Path

from utils.get_ids import get_idtypedocument, get_idreseausocial
from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta

# In[ ]:
instagram_data_path = 'data/Instagram/comments/reels_comments.json'
#%% In[ ]:
# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'comments', 'reels_comments.json')

with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
    reels_comments = json.loads(convert_encoding_meta(posts.read()))

# In[ ]:
#%% In[ ]:
ig_comments = []
for comment in reels_comments['comments_reels_comments']:
    ig_comments.append({"texte": comment['string_map_data']['Comment']['value'],
                        'datepublication': datetime.datetime.fromtimestamp(
                            timestamp=comment['string_map_data']['Time']['timestamp']).isoformat(),
                        'creation_timestamp': comment['string_map_data']['Time']['timestamp'],
                        "chemin": instagram_data_path,
                        "index": "rs_instagram_comments",
                        "type": "comments",
                        "network": "Instagram"})

# In[ ]:
#%% In[ ]:
ig_comments_df = pd.DataFrame(ig_comments)

# In[ ]:
#%% In[ ]:
ig_comments_df.fillna(value="", inplace=True)

# In[ ]:
ig_comments_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
#%% In[ ]:
ig_comments_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)

# In[ ]:
#%% In[ ]:
# Filter empty texte
ig_comments_df = ig_comments_df[~ig_comments_df['texte'].str.strip('\n').str.strip().eq('')]

# In[ ]:
#%% In[ ]:
documents_to_database(ig_comments_df)
@@ -1,40 +1,49 @@
import datetime

import pandas as pd
import json
import os
from pathlib import Path

from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta

# In[ ]:
instagram_data_path = 'data/Instagram/threads/threads_and_replies.json'
#%% In[ ]:
# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'threads', 'threads_and_replies.json')

with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
    post_comments_1 = json.loads(convert_encoding_meta(posts.read()))

# In[ ]:
#%% In[ ]:
threads_comments = []
for post in post_comments_1['text_post_app_text_posts']:
    for element in post['media']:
        threads_comments.append({"texte": element['title'],
                                 'datepublication': datetime.datetime.fromtimestamp(
                                     timestamp=element['creation_timestamp']).isoformat(),
                                 'creation_timestamp': element['creation_timestamp'],
                                 "chemin": instagram_data_path,
                                 "index": "rs_instagram_threads",
                                 "type": "posts",
                                 "network": "Instagram"})

# In[ ]:
#%% In[ ]:
ig_comments_df = pd.DataFrame(threads_comments)

# In[ ]:
#%% In[ ]:
ig_comments_df.fillna(value="", inplace=True)

# In[ ]:
ig_comments_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
#%% In[ ]:
ig_comments_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)

# In[ ]:
#%% In[ ]:
# Filter empty texte
ig_comments_df = ig_comments_df[~ig_comments_df['texte'].str.strip('\n').str.strip().eq('')]

# In[ ]:
#%% In[ ]:
documents_to_database(ig_comments_df)
@@ -1,44 +1,58 @@
import pandas as pd
import datetime

import os
from pathlib import Path

from utils.documents_to_database import documents_to_database

# In[ ]:
linkedin_data_path = "data/LinkedIn/shares/Shares.csv"
#%% In[ ]:
# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
linkedin_data_path = os.path.join(project_root, 'import_data', 'data', 'LinkedIn', 'shares', 'Shares.csv')

raw_shares = pd.read_csv(linkedin_data_path)

# In[ ]:
#%% In[ ]:
raw_shares['index'] = "rs_linkedin_shares"
raw_shares['type'] = "posts"
raw_shares['network'] = "LinkedIn"
raw_shares['chemin'] = linkedin_data_path

# In[ ]:
raw_shares["datepublication"] = raw_shares["Date"].apply(
    lambda x: str(datetime.datetime.fromisoformat(x).isoformat()))
#%% In[ ]:
raw_shares["creation_timestamp"] = raw_shares["Date"].apply(
    lambda x: int(datetime.datetime.fromisoformat(x).timestamp())
)
del raw_shares["Date"]

# In[ ]:
#%% In[ ]:
raw_shares.rename(columns={"ShareLink": "uri", "ShareCommentary": "texte"}, inplace=True)

# In[ ]:
#%% In[ ]:
raw_shares["texte"] = raw_shares["texte"].apply(lambda x: str(x))

# In[ ]:
#%% In[ ]:
del raw_shares["SharedUrl"]
del raw_shares["MediaUrl"]
del raw_shares["Visibility"]

# In[ ]:
#%% In[ ]:
raw_shares.fillna(value="", inplace=True)

# In[ ]:
raw_shares.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
#%% In[ ]:
raw_shares.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)

# In[ ]:
#%% In[ ]:
# Filter empty texte
raw_shares = raw_shares[~raw_shares['texte'].str.strip('\n').str.strip().eq('')]

# In[ ]:
#%% In[ ]:
documents_to_database(raw_shares)
@@ -1,19 +0,0 @@
linkedin_data_path <- "import_data/data/LinkedIn/comments/Comments.csv"
library("readr")
library("magrittr")
library("dplyr")

# Read CSV file
ddd <- readr::read_delim(linkedin_data_path,
                         escape_backslash = TRUE,
                         trim_ws = TRUE,
                         skip_empty_rows = FALSE,
                         delim = ",")

# Remove carriage returns
ddd %>%
  mutate(MessageFix = Message %>% stringr::str_replace_all(pattern = "[\r\n\t]+", replacement = " ")) %>%
  select(-Message) -> ddd2

# Save the cleaned data to a new CSV file
ddd2 %>% write.csv("import_data/data/LinkedIn/comments/Comments-FIX.csv", row.names = FALSE)
@@ -1,41 +1,58 @@
import pandas as pd
import datetime

import os
from pathlib import Path

from utils.documents_to_database import documents_to_database

# In[ ]:
linkedin_data_path = "data/LinkedIn/comments/Comments-FIX.csv"
#%% In[ ]:
# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

# In[ ]:
raw_comments_csv = pd.read_csv(linkedin_data_path, encoding='utf-8')
project_root = script_dir
linkedin_data_path = os.path.join(project_root, 'import_data', 'data', 'LinkedIn', 'comments', 'Comments.csv')

#%% In[ ]:
raw_comments_csv = pd.read_csv(linkedin_data_path,
                               escapechar='\\',
                               skipinitialspace=True)
raw_comments_csv['MessageFix'] = raw_comments_csv['Message'].str.replace(r'[\r\n\t]+', ' ', regex=True)
raw_comments_csv = raw_comments_csv.drop(columns=['Message'])
raw_comments = raw_comments_csv[(raw_comments_csv['MessageFix'] != "")].drop_duplicates()

# In[ ]:
#%% In[ ]:
raw_comments['index'] = "rs_linkedin_comments"
raw_comments['type'] = "comments"
raw_comments['network'] = "LinkedIn"
raw_comments['chemin'] = linkedin_data_path

# In[ ]:
raw_comments["datepublication"] = raw_comments["Date"].apply(
    lambda x: str(datetime.datetime.fromisoformat(str(x)).isoformat()))
#%% In[ ]:
raw_comments["creation_timestamp"] = raw_comments["Date"].apply(
    lambda x: int(datetime.datetime.fromisoformat(x).timestamp())
)
del raw_comments["Date"]

# In[ ]:
#%% In[ ]:
raw_comments.rename(columns={"Link": "uri", "MessageFix": "texte"}, inplace=True)

# In[ ]:
#%% In[ ]:
raw_comments["chemin"] = linkedin_data_path

# In[ ]:
#%% In[ ]:
raw_comments.fillna(value="", inplace=True)

# In[ ]:
raw_comments.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
#%% In[ ]:
raw_comments.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)

# In[ ]:
#%% In[ ]:
# Filter empty texte
raw_comments = raw_comments[~raw_comments['texte'].str.strip('\n').str.strip().eq('')]

# In[ ]:
#%% In[ ]:
documents_to_database(raw_comments)
@@ -5,29 +5,42 @@ import xmltodict
import pandas as pd
import markdownify

import os
from pathlib import Path

from utils.documents_to_database import documents_to_database

# In[ ]:
wordpress_xml_path = "data/Wordpress/jevalideca/wordpress.xml"
#%% In[ ]:
# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
wordpress_xml_path = os.path.join(project_root, 'import_data', 'data', 'Wordpress', 'jevalideca', 'wordpress.xml')

with open(wordpress_xml_path, "r") as xml_file:
    wordpress_xml = xml_file.read()

# In[ ]:
#%% In[ ]:
wordpress_dict = xmltodict.parse(wordpress_xml)

# In[ ]:
#%% In[ ]:
items_df = pd.DataFrame(wordpress_dict['rss']['channel']['item'])

# In[ ]:
#%% In[ ]:
items_df_filter = items_df[
    (items_df['wp:post_type'].isin(['page', 'post'])) & (items_df['wp:status'] == 'publish')].copy()

# In[ ]:
items_df_filter['datepublication'] = items_df_filter['wp:post_date'].apply(
    lambda x: str(datetime.datetime.fromisoformat(x).isoformat()))
#%% In[ ]:
items_df_filter['creation_timestamp'] = items_df_filter['wp:post_date'].apply(
    lambda x: int(datetime.datetime.fromisoformat(x).timestamp()))

# In[ ]:
#%% In[ ]:
def wp_to_markdown(x):
    try:
        md_text = re.sub(r'\n+', ' ', markdownify.markdownify(x, heading_style='ATX')).strip()

@@ -38,25 +51,25 @@ def wp_to_markdown(x):
    return md_text

# In[ ]:
#%% In[ ]:
items_df_filter['texte'] = items_df_filter['content:encoded'].apply(lambda x: wp_to_markdown(x))

# In[ ]:
#%% In[ ]:
items_df_filter.rename(columns={"link": "uri", "wp:post_type": "type"}, inplace=True)

# In[ ]:
#%% In[ ]:
items_df_filter['index'] = "rs_wordpress_jevalideca"
items_df_filter['network'] = "Wordpress"
items_df_filter['chemin'] = wordpress_xml_path

# In[ ]:
#%% In[ ]:
items_df_filter.fillna(value="", inplace=True)

# In[ ]:
#%% In[ ]:
documents_to_database(items_df_filter[['title',
                                       'uri',
                                       'type',
                                       'datepublication',
                                       'creation_timestamp',
                                       'texte',
                                       'index',
                                       'network',
@@ -4,3 +4,4 @@ requests==2.31.0
xmltodict==0.13.0
python_dotenv==1.0.1
pyarrow==17.0.0
typesense==0.21.0
import_data/run_all_imports.sh (new file, 17 lines)
@@ -0,0 +1,17 @@
#!/bin/bash

# Navigate to the directory containing the scripts
cd "$(dirname "$0")" || exit

# Find and execute all Python scripts matching the pattern
for script in [0-9][0-9]_importation_*.py
do
    if [ -f "$script" ]; then
        echo "Running $script..."
        python3 "$script"
        echo "Finished $script"
        echo "--------------------"
    fi
done

echo "All importation scripts have been executed."
@@ -1,20 +1,11 @@
import pandas as pd
import requests
import tqdm

from utils.opensearch import opensearch_client
from .typesense_client import client


def documents_to_database(documents_list, os_client=opensearch_client):
    # Check if opensearch is available
    if not os_client.ping():
        raise requests.exceptions.ConnectionError("Opensearch is not reachable")
    # Check if the specified index exists
    if not os_client.indices.exists(index=documents_list['index'].iloc[0]):
        raise requests.exceptions.HTTPError(f"Index '{documents_list['index'].iloc[0]}' does not exist")
    # Insert each document into opensearch index(es)
    for document in documents_list.to_dict(orient='records'):
        index_name = document.pop('index', None)
        if not index_name:
            raise ValueError("Document must have an 'index' field")
        os_client.index(index=index_name,
                        body=document)
def documents_to_database(documents_list, os_client=client):
    try:
        for document in tqdm.tqdm(documents_list.to_dict(orient='records')):
            os_client.collections['social_media_posts'].documents.create(document)
        print(f"Successfully inserted {len(documents_list)} documents.")
    except Exception as e:
        print(f"Error inserting documents: {str(e)}")
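Creating documents one at a time costs a network round trip per row. typesense-python also exposes a bulk endpoint; a sketch of the same helper using documents.import_ (the batch size is an arbitrary choice):

```python
def documents_to_database_bulk(documents_list, ts_client=client, batch_size=100):
    # Push documents through Typesense's bulk import endpoint in batches
    records = documents_list.to_dict(orient='records')
    for start in range(0, len(records), batch_size):
        batch = records[start:start + batch_size]
        results = ts_client.collections['social_media_posts'].documents.import_(
            batch, {'action': 'create'})
        # import_ returns one result dict per document, with a 'success' flag
        failed = [r for r in results if not r.get('success')]
        if failed:
            print(f"{len(failed)} documents failed in batch starting at {start}")
```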
@@ -1,22 +0,0 @@
import os
import dotenv

# Load environment variables from .env file
dotenv.load_dotenv()

# Connect to OpenSearch using the provided credentials and hostname/port.
from opensearchpy import OpenSearch

host = 'localhost'
port = 9200
auth = ('admin', os.getenv("OPENSEARCH_INITIAL_ADMIN_PASSWORD"))  # For testing only. Don't store credentials in code.
# Create the client with SSL/TLS enabled, but hostname verification disabled.
opensearch_client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_compress=True,  # enables gzip compression for request bodies
    http_auth=auth,
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False
)
@@ -11,4 +11,7 @@ reseau_social_data = [{"nom": "LinkedIn",
                      {"nom": "Facebook",
                       "repertoires": ["comments_and_reactions", "posts"]},
                      {"nom": "FacebookBusiness",
                       "repertoires": ["posts"]}]
                       "repertoires": ["posts"]},
                      {"nom": "Podcast",
                       "repertoires": ["shownotes", "audio"]}
                      ]
import_data/utils/typesense_client.py (new file, 15 lines)
@@ -0,0 +1,15 @@
import typesense
import os
from dotenv import load_dotenv

load_dotenv()

client = typesense.Client({
    'nodes': [{
        'host': 'localhost',
        'port': '8108',
        'protocol': 'http'
    }],
    'api_key': os.getenv('TYPESENSE_API_KEY'),
    'connection_timeout_seconds': 2
})
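One caveat with this client configuration: with auto-embedding enabled on the collection, the first imports can take noticeably longer than two seconds while the embedding model is fetched and run, so raising connection_timeout_seconds here is a reasonable adjustment.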