Import documents into Typesense

François Pelletier 2024-10-02 21:53:37 -04:00
parent f4acc32451
commit 7a74dbf413
24 changed files with 390 additions and 332 deletions

View file

@@ -1 +1 @@
-OPENSEARCH_INITIAL_ADMIN_PASSWORD=
+TYPESENSE_API_KEY=

View file

@@ -71,9 +71,7 @@ You can get a backup of your social media data. I put
 ![ou-mettre-fichiers-reseaux.png](images/ou-mettre-fichiers-reseaux.png)
 - Run the file that creates the indexes in the search engine: [00_creer_reseauxsociaux.py](import_data/00_creer_reseauxsociaux.py)
 - Then run all the relevant Python files to load the data you downloaded.
-- For LinkedIn comments, you must run the R program [32_importation_linkedin_comments.R](import_data/32_importation_linkedin_comments.R) before the Python program [32_importation_linkedin_comments.py](import_data/32_importation_linkedin_comments.py)
 Then you should be able to see all the data you loaded in OpenSearch Dashboards at http://localhost:5601. Go to Discover / Search and find insights.
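Note that the README line above still points to OpenSearch Dashboards on port 5601, while this commit moves storage to Typesense, which ships no dashboard. Not part of this commit: a minimal sketch for confirming the import worked by querying the collection directly, assuming the utils/typesense_client.py module added below and a hypothetical query term:

# Not part of this commit: quick check that documents landed in Typesense.
from utils.typesense_client import client

# Collection metadata; num_documents should be > 0 after the imports.
info = client.collections['social_media_posts'].retrieve()
print(f"{info['num_documents']} documents in 'social_media_posts'")

# Keyword search over the 'texte' field, faceted by network ('podcast' is a
# placeholder query).
results = client.collections['social_media_posts'].documents.search({
    'q': 'podcast',
    'query_by': 'texte',
    'facet_by': 'network'
})
print(f"{results['found']} hits")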

View file

@@ -1,68 +1,21 @@
 ---
 version: '3'
 services:
-  opensearch-node1:
-    image: opensearchproject/opensearch:latest
-    container_name: opensearch-node1
+  typesense:
+    image: typesense/typesense:27.1
+    container_name: typesense
     environment:
-      - cluster.name=opensearch-cluster
-      - node.name=opensearch-node1
-      - discovery.seed_hosts=opensearch-node1,opensearch-node2
-      - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2
-      - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping
-      - OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m # minimum and maximum Java heap size, recommend setting both to 50% of system RAM
-      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD} # Sets the demo admin user password when using demo configuration, required for OpenSearch 2.12 and higher
-    ulimits:
-      memlock:
-        soft: -1
-        hard: -1
-      nofile:
-        soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems
-        hard: 65536
+      - TYPESENSE_API_KEY=${TYPESENSE_API_KEY}
+      - TYPESENSE_DATA_DIR=/data
     volumes:
-      - opensearch-data1:/usr/share/opensearch/data
+      - typesense-data:/data
     ports:
-      - 9200:9200
-      - 9600:9600 # required for Performance Analyzer
+      - "8108:8108"
     networks:
-      - opensearch-net
-  opensearch-node2:
-    image: opensearchproject/opensearch:latest
-    container_name: opensearch-node2
-    environment:
-      - cluster.name=opensearch-cluster
-      - node.name=opensearch-node2
-      - discovery.seed_hosts=opensearch-node1,opensearch-node2
-      - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2
-      - bootstrap.memory_lock=true
-      - OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m
-      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD}
-    ulimits:
-      memlock:
-        soft: -1
-        hard: -1
-      nofile:
-        soft: 65536
-        hard: 65536
-    volumes:
-      - opensearch-data2:/usr/share/opensearch/data
-    networks:
-      - opensearch-net
-  opensearch-dashboards:
-    image: opensearchproject/opensearch-dashboards:latest
-    container_name: opensearch-dashboards
-    ports:
-      - 5601:5601
-    expose:
-      - '5601'
-    environment:
-      OPENSEARCH_HOSTS: '["https://opensearch-node1:9200","https://opensearch-node2:9200"]'
-    networks:
-      - opensearch-net
+      - typesense-net
 volumes:
-  opensearch-data1:
-  opensearch-data2:
+  typesense-data:
 networks:
-  opensearch-net:
+  typesense-net:
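Not part of this commit: once the container is up (docker compose up -d), Typesense exposes a /health endpoint on the mapped port; a minimal check, assuming the default localhost:8108 mapping above:

# Not part of this commit: verify the Typesense container responds.
import requests

resp = requests.get("http://localhost:8108/health", timeout=2)
print(resp.json())  # expected: {"ok": true}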

View file

@@ -1,18 +1,40 @@
-import requests
-import utils.config
-from utils.opensearch import opensearch_client
-from utils.reseau_social_data import reseau_social_data as rs_data
-
-# %%
-rs_data
-
-# %%
-opensearch_client.info()
-
-# %%
-for rs in rs_data:
-    nom = rs.get("nom")
-    for repertoire in rs.get("repertoires", []):
-        index_name = f"rs_{nom}_{repertoire}".lower()
-        opensearch_client.indices.create(index=index_name)
-        print(f"Index '{index_name}' created")
+from typesense.exceptions import TypesenseClientError, ObjectAlreadyExists
+
+from utils.typesense_client import client
+
+# Create a collection
+try:
+    client.collections.create({
+        'name': 'social_media_posts',
+        'fields': [
+            {'name': 'id', 'type': 'string'},
+            {'name': 'network', 'type': 'string', 'facet': True},
+            {'name': 'type', 'type': 'string', 'facet': True},
+            {'name': 'index', 'type': 'string', 'facet': True},
+            {'name': 'chemin', 'type': 'string'},
+            {'name': 'texte', 'type': 'string'},
+            {'name': 'creation_timestamp', 'type': 'int64'},
+            {
+                'name': 'embedding',
+                'type': 'float[]',
+                'embed': {
+                    'from': ['texte'],
+                    'model_config': {
+                        'model_name': 'ts/multilingual-e5-small'
+                    }
+                }
+            }
+        ],
+        'default_sorting_field': 'creation_timestamp'
+    })
+    print("Collection 'social_media_posts' created successfully.")
+except TypesenseClientError as e:
+    if isinstance(e, ObjectAlreadyExists):
+        print("Collection 'social_media_posts' already exists. Skipping creation.")
+    else:
+        print(f"Error creating collection: {str(e)}")
+        raise

View file

@@ -0,0 +1,3 @@
+# Use only if needed, to delete the 'social_media_posts' collection in Typesense
+from utils.typesense_client import client
+client.collections['social_media_posts'].delete()

View file

@@ -1,21 +1,34 @@
-import datetime
 import pandas as pd
 import json
+import os
+from pathlib import Path
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-fb_data_path = ['data/FacebookBusiness/posts/profile_posts_1.json',
-                'data/FacebookBusiness/posts/uncategorized_photos.json',
-                'data/FacebookBusiness/posts/videos.json']
-with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
-    posts_json = json.loads(convert_encoding_meta(posts.read()))
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+fb_data_path = [os.path.join(project_root, 'import_data', 'data', 'FacebookBusiness', 'posts', 'profile_posts_1.json')]
+try:
+    with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
+        posts_json = json.loads(convert_encoding_meta(posts.read()))
+except Exception as e:
+    print(f"Error reading JSON file: {e}")
+    exit(1)
 
-# In[ ]:
+#%% In[ ]:
 posts_medias = []
 for post in posts_json:
     # data
@@ -39,21 +52,14 @@ for post in posts_json:
                              "texte": texte,
                              "creation_timestamp": media["creation_timestamp"]})
 
-# In[ ]:
+#%% In[ ]:
 posts_medias_df = pd.DataFrame(posts_medias)
 
-# In[ ]:
-posts_medias_df['datepublication'] = posts_medias_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
-
-# In[ ]:
-del posts_medias_df['creation_timestamp']
-
-# In[ ]:
+#%% In[ ]:
 posts_medias_df.fillna(value="", inplace=True)
 
-# In[ ]:
-posts_medias_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+posts_medias_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(posts_medias_df)

View file

@@ -1,18 +1,28 @@
-import datetime
 import pandas as pd
 import json
+import os
+from pathlib import Path
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-fb_data_path = ['data/Facebook/comments_and_reactions/comments.json']
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+fb_data_path = [os.path.join(project_root, 'import_data', 'data', 'Facebook', 'comments_and_reactions', 'comments.json')]
 with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
     comments_json = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 facebook_comments = []
 for comment in comments_json['comments_v2']:
     if comment.get('data'):
@@ -26,18 +36,8 @@ for comment in comments_json['comments_v2']:
                               "texte": comment["comment"],
                               "creation_timestamp": comment["timestamp"]})
 
-# In[ ]:
+#%% In[ ]:
 facebook_comments_df = pd.DataFrame(facebook_comments)
 
-# In[ ]:
-facebook_comments_df['datepublication'] = facebook_comments_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
-
-# In[ ]:
-facebook_comments_df.fillna(value="", inplace=True)
-
-# In[ ]:
-del facebook_comments_df['creation_timestamp']
-
-# In[ ]:
+#%% In[ ]:
 documents_to_database(facebook_comments_df)

View file

@@ -1,44 +1,51 @@
-import datetime
 import pandas as pd
 import json
+import os
+from pathlib import Path
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-fb_data_path = ['data/Facebook/posts/your_uncategorized_photos.json']
-with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+fb_data_path = os.path.join(project_root, 'import_data', 'data', 'Facebook', 'posts', 'your_uncategorized_photos.json')
+with open(fb_data_path, "r", encoding="raw-unicode-escape") as posts:
     photos_json = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 facebook_photos = photos_json['other_photos_v2']
 
-# In[ ]:
+#%% In[ ]:
 facebook_photos_df = pd.DataFrame(facebook_photos)
 
-# In[ ]:
+#%% In[ ]:
 # Filter out posts without a description
 facebook_photos_df = facebook_photos_df[~facebook_photos_df['description'].isnull()]
 
-# In[ ]:
-facebook_photos_df['datepublication'] = facebook_photos_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
+#%% In[ ]:
 facebook_photos_df['index'] = "rs_facebook_posts"
 facebook_photos_df['network'] = "Facebook"
 facebook_photos_df['type'] = "posts"
-facebook_photos_df['chemin'] = fb_data_path[0]
+facebook_photos_df['chemin'] = fb_data_path
 
-# In[ ]:
+#%% In[ ]:
 facebook_photos_df.rename(columns={"description": "texte"}, inplace=True)
 
-# In[ ]:
-del facebook_photos_df['creation_timestamp']
+#%% In[ ]:
 del facebook_photos_df['media_metadata']
 
-# In[ ]:
+#%% In[ ]:
 facebook_photos_df.fillna(value="", inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(facebook_photos_df)

View file

@@ -1,17 +1,28 @@
-import datetime
 import pandas as pd
 import json
+import os
+from pathlib import Path
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/content/posts_1.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'content', 'posts_1.json')
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     posts_json = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 posts_medias = []
 for post in posts_json:
     medias = post['media']
@@ -45,25 +56,18 @@ for post in posts_json:
                              "texte": title,
                              "creation_timestamp": creation_timestamp})
 
-# In[ ]:
+#%% In[ ]:
 posts_medias_df = pd.DataFrame(posts_medias)
 
-# In[ ]:
-posts_medias_df['datepublication'] = posts_medias_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
-
-# In[ ]:
-del posts_medias_df['creation_timestamp']
-
-# In[ ]:
+#%% In[ ]:
 posts_medias_df.fillna(value="", inplace=True)
 
-# In[ ]:
-posts_medias_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+posts_medias_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 posts_medias_df = posts_medias_df[~posts_medias_df['texte'].str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(posts_medias_df)

View file

@@ -1,49 +1,55 @@
-import datetime
 import pandas as pd
 import json
+import os
+from pathlib import Path
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/content/reels.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'content', 'reels.json')
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     reels_json = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 ig_reels_media = [x['media'][0] for x in reels_json['ig_reels_media']]
 
-# In[ ]:
+#%% In[ ]:
 ig_reels_df = pd.DataFrame(ig_reels_media)
 
-# In[ ]:
-ig_reels_df['datepublication'] = ig_reels_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
+#%% In[ ]:
 ig_reels_df['index'] = "rs_instagram_content"
 ig_reels_df['type'] = "reels"
 ig_reels_df['network'] = "Instagram"
 ig_reels_df['chemin'] = instagram_data_path
 
-# In[ ]:
+#%% In[ ]:
 ig_reels_df.rename(columns={"title": "texte"}, inplace=True)
 
-# In[ ]:
-del ig_reels_df['creation_timestamp']
+#%% In[ ]:
 del ig_reels_df['media_metadata']
 del ig_reels_df['cross_post_source']
 del ig_reels_df['dubbing_info']
 
-# In[ ]:
+#%% In[ ]:
 ig_reels_df.fillna(value="", inplace=True)
 
-# In[ ]:
-ig_reels_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+ig_reels_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 ig_reels_df = ig_reels_df[~ig_reels_df['texte'].str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(ig_reels_df)

View file

@@ -1,49 +1,52 @@
-import datetime
 import pandas as pd
 import json
+import os
+from pathlib import Path
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/content/stories.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'content', 'stories.json')
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     stories_json = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 ig_stories_df = pd.DataFrame(stories_json['ig_stories'])
 
-# In[ ]:
-ig_stories_df['datepublication'] = ig_stories_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
-
-# In[ ]:
+#%% In[ ]:
 ig_stories_df['index'] = "rs_instagram_content"
 ig_stories_df['type'] = "stories"
 ig_stories_df['network'] = "Instagram"
 ig_stories_df['chemin'] = instagram_data_path
 
-# In[ ]:
+#%% In[ ]:
 ig_stories_df.rename(columns={"title": "texte"}, inplace=True)
 
-# In[ ]:
-del ig_stories_df['creation_timestamp']
+#%% In[ ]:
 del ig_stories_df['media_metadata']
 del ig_stories_df['cross_post_source']
 del ig_stories_df['ai_stickers']
 del ig_stories_df['dubbing_info']
 
-# In[ ]:
+#%% In[ ]:
 ig_stories_df.fillna(value="", inplace=True)
 
-# In[ ]:
-ig_stories_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
 ig_stories_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 ig_stories_df = ig_stories_df[~ig_stories_df['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(ig_stories_df)

View file

@@ -1,39 +1,48 @@
-import datetime
 import pandas as pd
 import json
+import os
+from pathlib import Path
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/comments/post_comments_1.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'comments', 'post_comments_1.json')
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     post_comments_1 = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 ig_comments = []
 for comment in post_comments_1:
     ig_comments.append({"texte": comment['string_map_data']['Comment']['value'],
-                        'datepublication': datetime.datetime.fromtimestamp(
-                            timestamp=comment['string_map_data']['Time']['timestamp']).isoformat(),
+                        'creation_timestamp': comment['string_map_data']['Time']['timestamp'],
                         "chemin": instagram_data_path,
                         "index": "rs_instagram_comments",
                         "type": "comments",
                         "network": "Instagram"})
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df = pd.DataFrame(ig_comments)
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df.fillna(value="", inplace=True)
 
-# In[ ]:
-ig_comments_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+ig_comments_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 ig_comments_df = ig_comments_df[~ig_comments_df['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(ig_comments_df)

View file

@@ -1,40 +1,48 @@
-import datetime
 import pandas as pd
 import json
+import os
+from pathlib import Path
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/comments/reels_comments.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'comments', 'reels_comments.json')
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     reels_comments = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 ig_comments = []
 for comment in reels_comments['comments_reels_comments']:
     ig_comments.append({"texte": comment['string_map_data']['Comment']['value'],
-                        'datepublication': datetime.datetime.fromtimestamp(
-                            timestamp=comment['string_map_data']['Time']['timestamp']).isoformat(),
+                        'creation_timestamp': comment['string_map_data']['Time']['timestamp'],
                         "chemin": instagram_data_path,
                         "index": "rs_instagram_comments",
                         "type": "comments",
                         "network": "Instagram"})
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df = pd.DataFrame(ig_comments)
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df.fillna(value="", inplace=True)
 
-# In[ ]:
-ig_comments_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+ig_comments_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 ig_comments_df = ig_comments_df[~ig_comments_df['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(ig_comments_df)

View file

@@ -1,40 +1,49 @@
-import datetime
 import pandas as pd
 import json
+import os
+from pathlib import Path
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/threads/threads_and_replies.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'threads', 'threads_and_replies.json')
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     post_comments_1 = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 threads_comments = []
 for post in post_comments_1['text_post_app_text_posts']:
     for element in post['media']:
         threads_comments.append({"texte": element['title'],
-                                 'datepublication': datetime.datetime.fromtimestamp(
-                                     timestamp=element['creation_timestamp']).isoformat(),
+                                 'creation_timestamp': element['creation_timestamp'],
                                  "chemin": instagram_data_path,
                                  "index": "rs_instagram_threads",
                                  "type": "posts",
                                  "network": "Instagram"})
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df = pd.DataFrame(threads_comments)
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df.fillna(value="", inplace=True)
 
-# In[ ]:
-ig_comments_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+ig_comments_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 ig_comments_df = ig_comments_df[~ig_comments_df['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(ig_comments_df)

View file

@@ -1,44 +1,58 @@
 import pandas as pd
 import datetime
+import os
+from pathlib import Path
 from utils.documents_to_database import documents_to_database
 
-# In[ ]:
-linkedin_data_path = "data/LinkedIn/shares/Shares.csv"
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+linkedin_data_path = os.path.join(project_root, 'import_data', 'data', 'LinkedIn', 'shares', 'Shares.csv')
 raw_shares = pd.read_csv(linkedin_data_path)
 
-# In[ ]:
+#%% In[ ]:
 raw_shares['index'] = "rs_linkedin_shares"
 raw_shares['type'] = "posts"
 raw_shares['network'] = "LinkedIn"
 raw_shares['chemin'] = linkedin_data_path
 
-# In[ ]:
-raw_shares["datepublication"] = raw_shares["Date"].apply(
-    lambda x: str(datetime.datetime.fromisoformat(x).isoformat()))
+#%% In[ ]:
+raw_shares["creation_timestamp"] = raw_shares["Date"].apply(
+    lambda x: int(datetime.datetime.fromisoformat(x).timestamp())
+)
 del raw_shares["Date"]
 
-# In[ ]:
+#%% In[ ]:
 raw_shares.rename(columns={"ShareLink": "uri", "ShareCommentary": "texte"}, inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 raw_shares["texte"] = raw_shares["texte"].apply(lambda x: str(x))
 
-# In[ ]:
+#%% In[ ]:
 del raw_shares["SharedUrl"]
 del raw_shares["MediaUrl"]
 del raw_shares["Visibility"]
 
-# In[ ]:
+#%% In[ ]:
 raw_shares.fillna(value="", inplace=True)
 
-# In[ ]:
-raw_shares.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+raw_shares.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 raw_shares = raw_shares[~raw_shares['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(raw_shares)

View file

@@ -1,19 +0,0 @@
-linkedin_data_path <- "import_data/data/LinkedIn/comments/Comments.csv"
-library("readr")
-library("magrittr")
-library("dplyr")
-
-# Read CSV file
-ddd <- readr::read_delim(linkedin_data_path,
-                         escape_backslash = TRUE,
-                         trim_ws = TRUE,
-                         skip_empty_rows = FALSE,
-                         delim = ",")
-
-# Remove carriage returns
-ddd %>%
-  mutate(MessageFix = Message %>% stringr::str_replace_all(pattern = "[\r\n\t]+", replacement = " ")) %>%
-  select(-Message) -> ddd2
-
-# Save the cleaned data to a new CSV file
-ddd2 %>% write.csv("import_data/data/LinkedIn/comments/Comments-FIX.csv", row.names = FALSE)

View file

@@ -1,41 +1,58 @@
 import pandas as pd
 import datetime
+import os
+from pathlib import Path
 from utils.documents_to_database import documents_to_database
 
-# In[ ]:
-linkedin_data_path = "data/LinkedIn/comments/Comments-FIX.csv"
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+linkedin_data_path = os.path.join(project_root, 'import_data', 'data', 'LinkedIn', 'comments', 'Comments.csv')
 
-# In[ ]:
-raw_comments_csv = pd.read_csv(linkedin_data_path, encoding='utf-8')
+#%% In[ ]:
+raw_comments_csv = pd.read_csv(linkedin_data_path,
+                               escapechar='\\',
+                               skipinitialspace=True)
+raw_comments_csv['MessageFix'] = raw_comments_csv['Message'].str.replace(r'[\r\n\t]+', ' ', regex=True)
+raw_comments_csv = raw_comments_csv.drop(columns=['Message'])
 raw_comments = raw_comments_csv[(raw_comments_csv['MessageFix'] != "")].drop_duplicates()
 
-# In[ ]:
+#%% In[ ]:
 raw_comments['index'] = "rs_linkedin_comments"
 raw_comments['type'] = "comments"
 raw_comments['network'] = "LinkedIn"
 raw_comments['chemin'] = linkedin_data_path
 
-# In[ ]:
-raw_comments["datepublication"] = raw_comments["Date"].apply(
-    lambda x: str(datetime.datetime.fromisoformat(str(x)).isoformat()))
+#%% In[ ]:
+raw_comments["creation_timestamp"] = raw_comments["Date"].apply(
+    lambda x: int(datetime.datetime.fromisoformat(x).timestamp())
+)
 del raw_comments["Date"]
 
-# In[ ]:
+#%% In[ ]:
 raw_comments.rename(columns={"Link": "uri", "MessageFix": "texte"}, inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 raw_comments["chemin"] = linkedin_data_path
 
-# In[ ]:
+#%% In[ ]:
 raw_comments.fillna(value="", inplace=True)
 
-# In[ ]:
-raw_comments.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+raw_comments.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 raw_comments = raw_comments[~raw_comments['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(raw_comments)

View file

@@ -5,29 +5,42 @@ import xmltodict
 import pandas as pd
 import markdownify
+import os
+from pathlib import Path
 from utils.documents_to_database import documents_to_database
 
-# In[ ]:
-wordpress_xml_path = "data/Wordpress/jevalideca/wordpress.xml"
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+project_root = script_dir
+wordpress_xml_path = os.path.join(project_root, 'import_data', 'data', 'Wordpress', 'jevalideca', 'wordpress.xml')
 with open(wordpress_xml_path, "r") as xml_file:
     wordpress_xml = xml_file.read()
 
-# In[ ]:
+#%% In[ ]:
 wordpress_dict = xmltodict.parse(wordpress_xml)
 
-# In[ ]:
+#%% In[ ]:
 items_df = pd.DataFrame(wordpress_dict['rss']['channel']['item'])
 
-# In[ ]:
+#%% In[ ]:
 items_df_filter = items_df[
     (items_df['wp:post_type'].isin(['page', 'post'])) & (items_df['wp:status'] == 'publish')].copy()
 
-# In[ ]:
-items_df_filter['datepublication'] = items_df_filter['wp:post_date'].apply(
-    lambda x: str(datetime.datetime.fromisoformat(x).isoformat()))
+#%% In[ ]:
+items_df_filter['creation_timestamp'] = items_df_filter['wp:post_date'].apply(
+    lambda x: int(datetime.datetime.fromisoformat(x).timestamp()))
 
-# In[ ]:
+#%% In[ ]:
 def wp_to_markdown(x):
     try:
         md_text = re.sub(r'\n+', ' ', markdownify.markdownify(x, heading_style='ATX')).strip()
@@ -38,25 +51,25 @@ def wp_to_markdown(x):
     return md_text
 
-# In[ ]:
+#%% In[ ]:
 items_df_filter['texte'] = items_df_filter['content:encoded'].apply(lambda x: wp_to_markdown(x))
 
-# In[ ]:
+#%% In[ ]:
 items_df_filter.rename(columns={"link": "uri", "wp:post_type": "type"}, inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 items_df_filter['index'] = "rs_wordpress_jevalideca"
 items_df_filter['network'] = "Wordpress"
 items_df_filter['chemin'] = wordpress_xml_path
 
-# In[ ]:
+#%% In[ ]:
 items_df_filter.fillna(value="", inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(items_df_filter[['title',
                                        'uri',
                                        'type',
-                                       'datepublication',
+                                       'creation_timestamp',
                                        'texte',
                                        'index',
                                        'network',

View file

@@ -4,3 +4,4 @@ requests==2.31.0
 xmltodict==0.13.0
 python_dotenv==1.0.1
 pyarrow==17.0.0
+typesense==0.21.0

View file

@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Navigate to the directory containing the scripts
+cd "$(dirname "$0")" || exit
+
+# Find and execute all Python scripts matching the pattern
+for script in [0-9][0-9]_importation_*.py
+do
+    if [ -f "$script" ]; then
+        echo "Running $script..."
+        python3 "$script"
+        echo "Finished $script"
+        echo "--------------------"
+    fi
+done
+
+echo "All importation scripts have been executed."

View file

@@ -1,20 +1,11 @@
-import pandas as pd
-import requests
+import tqdm
 
-from utils.opensearch import opensearch_client
+from .typesense_client import client
 
 
-def documents_to_database(documents_list, os_client=opensearch_client):
-    # Check if opensearch is available
-    if not os_client.ping():
-        raise requests.exceptions.ConnectionError("Opensearch is not reachable")
-    # Check if the specified index exists
-    if not os_client.indices.exists(index=documents_list['index'].iloc[0]):
-        raise requests.exceptions.HTTPError(f"Index '{documents_list['index'].iloc[0]}' does not exist")
-    # Insert each document into opensearch index(es)
-    for document in documents_list.to_dict(orient='records'):
-        index_name = document.pop('index', None)
-        if not index_name:
-            raise ValueError("Document must have an 'index' field")
-        os_client.index(index=index_name,
-                        body=document)
+def documents_to_database(documents_list, os_client=client):
+    try:
+        for document in tqdm.tqdm(documents_list.to_dict(orient='records')):
+            os_client.collections['social_media_posts'].documents.create(document)
+        print(f"Successfully inserted {len(documents_list)} documents.")
+    except Exception as e:
+        print(f"Error inserting documents: {str(e)}")
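Not part of this commit: creating documents one at a time costs one HTTP round trip per record. typesense-python also exposes the bulk import endpoint via documents.import_; a hedged sketch of a batched variant (the function name and batch size are illustrative):

# Not part of this commit: a batched variant using Typesense's bulk import
# endpoint, which cuts per-document HTTP overhead. Sketch only.
from utils.typesense_client import client

def documents_to_database_bulk(documents_list, ts_client=client, batch_size=100):
    records = documents_list.to_dict(orient='records')
    for start in range(0, len(records), batch_size):
        batch = records[start:start + batch_size]
        results = ts_client.collections['social_media_posts'].documents.import_(
            batch, {'action': 'create'})
        # import_ returns one status dict per document; surface failures
        for res in results:
            if not res.get('success'):
                print(f"Failed to import: {res.get('error')}")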

View file

@@ -1,22 +0,0 @@
-import os
-import dotenv
-
-# Load environment variables from .env file
-dotenv.load_dotenv()
-
-# Connect to OpenSearch using the provided credentials and hostname/port.
-from opensearchpy import OpenSearch
-
-host = 'localhost'
-port = 9200
-auth = ('admin', os.getenv("OPENSEARCH_INITIAL_ADMIN_PASSWORD"))  # For testing only. Don't store credentials in code.
-
-# Create the client with SSL/TLS enabled, but hostname verification disabled.
-opensearch_client = OpenSearch(
-    hosts=[{'host': host, 'port': port}],
-    http_compress=True,  # enables gzip compression for request bodies
-    http_auth=auth,
-    use_ssl=True,
-    verify_certs=False,
-    ssl_assert_hostname=False,
-    ssl_show_warn=False
-)

View file

@@ -11,4 +11,7 @@ reseau_social_data = [{"nom": "LinkedIn",
                       {"nom": "Facebook",
                        "repertoires": ["comments_and_reactions", "posts"]},
                       {"nom": "FacebookBusiness",
-                       "repertoires": ["posts"]}]
+                       "repertoires": ["posts"]},
+                      {"nom": "Podcast",
+                       "repertoires": ["shownotes", "audio"]}
+                      ]

View file

@@ -0,0 +1,15 @@
+import typesense
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+client = typesense.Client({
+    'nodes': [{
+        'host': 'localhost',
+        'port': '8108',
+        'protocol': 'http'
+    }],
+    'api_key': os.getenv('TYPESENSE_API_KEY'),
+    'connection_timeout_seconds': 2
+})
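Not part of this commit: this client is shared by all the import scripts above, so a quick sanity check that the API key and node settings match the docker-compose service can save a failed run; a minimal sketch:

# Not part of this commit: sanity check for the shared client.
from utils.typesense_client import client

# Lists existing collections; raises typesense.exceptions.RequestUnauthorized
# if TYPESENSE_API_KEY does not match the server's key.
for collection in client.collections.retrieve():
    print(collection['name'], collection['num_documents'])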