Importing documents into Typesense

parent f4acc32451
commit 7a74dbf413

24 changed files with 390 additions and 332 deletions
@@ -1 +1 @@
OPENSEARCH_INITIAL_ADMIN_PASSWORD=
TYPESENSE_API_KEY=

@@ -71,9 +71,7 @@ You can get a backup of your social media data. I've put

![ou-mettre-fichiers-reseaux.png](images/ou-mettre-fichiers-reseaux.png)

- Run the file that creates the indexes in the search engine: [00_creer_reseauxsociaux.py](import_data/00_creer_reseauxsociaux.py)

- Then run all the relevant Python files to load the data you downloaded.
- For LinkedIn comments, run the R program [32_importation_linkedin_comments.R](import_data/32_importation_linkedin_comments.R) before the Python program [32_importation_linkedin_comments.py](import_data/32_importation_linkedin_comments.py)

You should then be able to see all the data you loaded in OpenSearch Dashboards at http://localhost:5601. Go to Discover / Search and find insights.

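To make the order of operations above concrete, here is a minimal driver sketch. It is not part of this commit; it assumes the scripts live in `import_data/` and that Python 3 is available. It creates the collection/indexes first and then runs each numbered import script, much like the `run_all_imports.sh` script added further down in this commit:

```python
# Editorial sketch only: run the collection/index creation script, then every
# numbered import script, mirroring the steps listed in the README above.
import pathlib
import subprocess

import_dir = pathlib.Path("import_data")  # assumed location of the scripts

# 1. Create the collection/indexes in the search engine
subprocess.run(["python3", "00_creer_reseauxsociaux.py"], cwd=import_dir, check=True)

# 2. Run each data-loading script in numeric order.
#    Note: per the README, 32_importation_linkedin_comments.R must be run
#    before 32_importation_linkedin_comments.py.
for script in sorted(import_dir.glob("[0-9][0-9]_importation_*.py")):
    print(f"Running {script.name}...")
    subprocess.run(["python3", script.name], cwd=import_dir, check=True)
```
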
@@ -1,68 +1,21 @@
---
version: '3'
services:
  opensearch-node1:
    image: opensearchproject/opensearch:latest
    container_name: opensearch-node1
  typesense:
    image: typesense/typesense:27.1
    container_name: typesense
    environment:
      - cluster.name=opensearch-cluster
      - node.name=opensearch-node1
      - discovery.seed_hosts=opensearch-node1,opensearch-node2
      - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2
      - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping
      - OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m # minimum and maximum Java heap size, recommend setting both to 50% of system RAM
      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD} # Sets the demo admin user password when using demo configuration, required for OpenSearch 2.12 and higher
    ulimits:
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems
        hard: 65536
      - TYPESENSE_API_KEY=${TYPESENSE_API_KEY}
      - TYPESENSE_DATA_DIR=/data
    volumes:
      - opensearch-data1:/usr/share/opensearch/data
      - typesense-data:/data
    ports:
      - 9200:9200
      - 9600:9600 # required for Performance Analyzer
      - "8108:8108"
    networks:
      - opensearch-net
  opensearch-node2:
    image: opensearchproject/opensearch:latest
    container_name: opensearch-node2
    environment:
      - cluster.name=opensearch-cluster
      - node.name=opensearch-node2
      - discovery.seed_hosts=opensearch-node1,opensearch-node2
      - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2
      - bootstrap.memory_lock=true
      - OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m
      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD}
    ulimits:
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 65536
        hard: 65536
    volumes:
      - opensearch-data2:/usr/share/opensearch/data
    networks:
      - opensearch-net
  opensearch-dashboards:
    image: opensearchproject/opensearch-dashboards:latest
    container_name: opensearch-dashboards
    ports:
      - 5601:5601
    expose:
      - '5601'
    environment:
      OPENSEARCH_HOSTS: '["https://opensearch-node1:9200","https://opensearch-node2:9200"]'
    networks:
      - opensearch-net
      - typesense-net

volumes:
  opensearch-data1:
  opensearch-data2:
  typesense-data:

networks:
  opensearch-net:
  typesense-net:

@@ -1,18 +1,40 @@
import requests
import utils.config
from utils.opensearch import opensearch_client
from utils.reseau_social_data import reseau_social_data as rs_data
from typesense.exceptions import TypesenseClientError, ObjectAlreadyExists

# %%
rs_data
from utils.typesense_client import client

# Create a collection
try:
    client.collections.create({
        'name': 'social_media_posts',
        'fields': [
            {'name': 'id', 'type': 'string'},
            {'name': 'network', 'type': 'string', 'facet': True},
            {'name': 'type', 'type': 'string', 'facet': True},
            {'name': 'index', 'type': 'string', 'facet': True},
            {'name': 'chemin', 'type': 'string'},
            {'name': 'texte', 'type': 'string'},
            {'name': 'creation_timestamp', 'type': 'int64'},
            {
                "name": "embedding",
                "type": "float[]",
                "embed": {
                    "from": [
                        "texte"
                    ],
                    "model_config": {
                        "model_name": "ts/multilingual-e5-small"
                    }
                }
            }
        ],
        'default_sorting_field': 'creation_timestamp'
    })
    print("Collection 'social_media_posts' created successfully.")
except TypesenseClientError as e:
    if isinstance(e, ObjectAlreadyExists):
        print("Collection 'social_media_posts' already exists. Skipping creation.")
    else:
        print(f"Error creating collection: {str(e)}")
        raise

# %%
opensearch_client.info()

# %%
for rs in rs_data:
    nom = rs.get("nom")
    for repertoire in rs.get("repertoires", []):
        index_name = f"rs_{nom}_{repertoire}".lower()
        opensearch_client.indices.create(index=index_name)
        print(f"Index '{index_name}' créé")

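For reference only (not part of the diff): once the collection above exists and documents have been imported, it can be queried through the same client. A small search sketch, with made-up query and filter values:

```python
# Illustrative search against the 'social_media_posts' collection created above.
# The query text and filter value are examples, not taken from the repository.
from utils.typesense_client import client

results = client.collections['social_media_posts'].documents.search({
    'q': 'typesense',                      # free-text query (example)
    'query_by': 'texte',                   # search the main text field
    'filter_by': 'network:Instagram',      # facet filter on 'network' (example)
    'sort_by': 'creation_timestamp:desc',  # newest first (the default sorting field)
})

print(f"{results['found']} matching documents")
for hit in results['hits']:
    print(hit['document']['texte'][:80])
```
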
import_data/00_delete_collection.py (new file, 3 lines)
@@ -0,0 +1,3 @@
# Use only when needed, to delete the 'social_media_posts' collection in Typesense
from utils.typesense_client import client
client.collections['social_media_posts'].delete()

@@ -1,21 +1,34 @@
import datetime

import pandas as pd
import json
import os
from pathlib import Path

from utils.get_ids import get_idtypedocument, get_idreseausocial
from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta

# In[ ]:
fb_data_path = ['data/FacebookBusiness/posts/profile_posts_1.json',
                'data/FacebookBusiness/posts/uncategorized_photos.json',
                'data/FacebookBusiness/posts/videos.json']
#%% In[ ]:

with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
#%% In[ ]:

# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
fb_data_path = [os.path.join(project_root, 'import_data', 'data', 'FacebookBusiness', 'posts', 'profile_posts_1.json')]

try:
    with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
        posts_json = json.loads(convert_encoding_meta(posts.read()))
except Exception as e:
    print(f"Error reading JSON file: {e}")
    exit(1)

# In[ ]:
#%% In[ ]:
posts_medias = []
for post in posts_json:
    # data

@@ -39,21 +52,14 @@ for post in posts_json:
                             "texte": texte,
                             "creation_timestamp": media["creation_timestamp"]})

# In[ ]:
#%% In[ ]:
posts_medias_df = pd.DataFrame(posts_medias)

# In[ ]:
posts_medias_df['datepublication'] = posts_medias_df['creation_timestamp'].apply(
    lambda x: datetime.datetime.fromtimestamp(x).isoformat())

# In[ ]:
del posts_medias_df['creation_timestamp']

# In[ ]:
#%% In[ ]:
posts_medias_df.fillna(value="", inplace=True)

# In[ ]:
posts_medias_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
#%% In[ ]:
posts_medias_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)

# In[ ]:
#%% In[ ]:
documents_to_database(posts_medias_df)

@@ -1,18 +1,28 @@
import datetime

import pandas as pd
import json
import os
from pathlib import Path

from utils.get_ids import get_idtypedocument, get_idreseausocial
from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta

# In[ ]:
fb_data_path = ['data/Facebook/comments_and_reactions/comments.json']
#%% In[ ]:

# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
fb_data_path = [os.path.join(project_root, 'import_data', 'data', 'Facebook', 'comments_and_reactions', 'comments.json')]

with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
    comments_json = json.loads(convert_encoding_meta(posts.read()))

# In[ ]:
#%% In[ ]:
facebook_comments = []
for comment in comments_json['comments_v2']:
    if comment.get('data'):

@@ -26,18 +36,8 @@ for comment in comments_json['comments_v2']:
                                  "texte": comment["comment"],
                                  "creation_timestamp": comment["timestamp"]})

# In[ ]:
#%% In[ ]:
facebook_comments_df = pd.DataFrame(facebook_comments)

# In[ ]:
facebook_comments_df['datepublication'] = facebook_comments_df['creation_timestamp'].apply(
    lambda x: datetime.datetime.fromtimestamp(x).isoformat())

# In[ ]:
facebook_comments_df.fillna(value="", inplace=True)

# In[ ]:
del facebook_comments_df['creation_timestamp']

# In[ ]:
#%% In[ ]:
documents_to_database(facebook_comments_df)

@@ -1,44 +1,51 @@
import datetime

import pandas as pd
import json
import os
from pathlib import Path

from utils.get_ids import get_idtypedocument, get_idreseausocial
from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta

# In[ ]:
fb_data_path = ['data/Facebook/posts/your_uncategorized_photos.json']
with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
#%% In[ ]:

# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
fb_data_path = os.path.join(project_root, 'import_data', 'data', 'Facebook', 'posts', 'your_uncategorized_photos.json')

with open(fb_data_path, "r", encoding="raw-unicode-escape") as posts:
    photos_json = json.loads(convert_encoding_meta(posts.read()))

# In[ ]:
#%% In[ ]:
facebook_photos = photos_json['other_photos_v2']

# In[ ]:
#%% In[ ]:
facebook_photos_df = pd.DataFrame(facebook_photos)

# In[ ]:
#%% In[ ]:
# Filter out posts without a description
facebook_photos_df = facebook_photos_df[~facebook_photos_df['description'].isnull()]

# In[ ]:
facebook_photos_df['datepublication'] = facebook_photos_df['creation_timestamp'].apply(
    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
#%% In[ ]:
facebook_photos_df['index'] = "rs_facebook_posts"
facebook_photos_df['network'] = "Facebook"
facebook_photos_df['type'] = "posts"
facebook_photos_df['chemin'] = fb_data_path[0]
facebook_photos_df['chemin'] = fb_data_path

# In[ ]:
#%% In[ ]:
facebook_photos_df.rename(columns={"description": "texte"}, inplace=True)

# In[ ]:
del facebook_photos_df['creation_timestamp']
#%% In[ ]:
del facebook_photos_df['media_metadata']

# In[ ]:
#%% In[ ]:
facebook_photos_df.fillna(value="", inplace=True)

# In[ ]:
#%% In[ ]:
documents_to_database(facebook_photos_df)

@@ -1,17 +1,28 @@
import datetime

import pandas as pd
import json
import os
from pathlib import Path

from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta

# In[ ]:
instagram_data_path = 'data/Instagram/content/posts_1.json'
#%% In[ ]:
# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'content', 'posts_1.json')


with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
    posts_json = json.loads(convert_encoding_meta(posts.read()))

# In[ ]:
#%% In[ ]:
posts_medias = []
for post in posts_json:
    medias = post['media']

@@ -45,25 +56,18 @@ for post in posts_json:
                             "texte": title,
                             "creation_timestamp": creation_timestamp})

# In[ ]:
#%% In[ ]:
posts_medias_df = pd.DataFrame(posts_medias)

# In[ ]:
posts_medias_df['datepublication'] = posts_medias_df['creation_timestamp'].apply(
    lambda x: datetime.datetime.fromtimestamp(x).isoformat())

# In[ ]:
del posts_medias_df['creation_timestamp']

# In[ ]:
#%% In[ ]:
posts_medias_df.fillna(value="", inplace=True)

# In[ ]:
posts_medias_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
#%% In[ ]:
posts_medias_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)

# In[ ]:
#%% In[ ]:
# Filter empty texte
posts_medias_df = posts_medias_df[~posts_medias_df['texte'].str.strip().eq('')]

# In[ ]:
#%% In[ ]:
documents_to_database(posts_medias_df)

@@ -1,49 +1,55 @@
import datetime

import pandas as pd
import json
import os
from pathlib import Path

from utils.get_ids import get_idtypedocument, get_idreseausocial
from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta

# In[ ]:
instagram_data_path = 'data/Instagram/content/reels.json'
#%% In[ ]:
# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'content', 'reels.json')

with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
    reels_json = json.loads(convert_encoding_meta(posts.read()))

# In[ ]:
#%% In[ ]:
ig_reels_media = [x['media'][0] for x in reels_json['ig_reels_media']]

# In[ ]:
#%% In[ ]:
ig_reels_df = pd.DataFrame(ig_reels_media)

# In[ ]:
ig_reels_df['datepublication'] = ig_reels_df['creation_timestamp'].apply(
    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
#%% In[ ]:
ig_reels_df['index'] = "rs_instagram_content"
ig_reels_df['type'] = "reels"
ig_reels_df['network'] = "Instagram"
ig_reels_df['chemin'] = instagram_data_path

# In[ ]:
#%% In[ ]:
ig_reels_df.rename(columns={"title": "texte"}, inplace=True)

# In[ ]:
del ig_reels_df['creation_timestamp']
#%% In[ ]:
del ig_reels_df['media_metadata']
del ig_reels_df['cross_post_source']
del ig_reels_df['dubbing_info']

# In[ ]:
#%% In[ ]:
ig_reels_df.fillna(value="", inplace=True)

# In[ ]:
ig_reels_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
#%% In[ ]:
ig_reels_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)

# In[ ]:
#%% In[ ]:
# Filter empty texte
ig_reels_df = ig_reels_df[~ig_reels_df['texte'].str.strip().eq('')]

# In[ ]:
#%% In[ ]:
documents_to_database(ig_reels_df)

@@ -1,49 +1,52 @@
import datetime

import pandas as pd
import json
import os
from pathlib import Path

from utils.get_ids import get_idtypedocument, get_idreseausocial
from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta

# In[ ]:
instagram_data_path = 'data/Instagram/content/stories.json'
#%% In[ ]:
# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'content', 'stories.json')
with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
    stories_json = json.loads(convert_encoding_meta(posts.read()))

# In[ ]:
#%% In[ ]:
ig_stories_df = pd.DataFrame(stories_json['ig_stories'])

# In[ ]:
ig_stories_df['datepublication'] = ig_stories_df['creation_timestamp'].apply(
    lambda x: datetime.datetime.fromtimestamp(x).isoformat())

# In[ ]:
#%% In[ ]:
ig_stories_df['index'] = "rs_instagram_content"
ig_stories_df['type'] = "stories"
ig_stories_df['network'] = "Instagram"
ig_stories_df['chemin'] = instagram_data_path

# In[ ]:
#%% In[ ]:
ig_stories_df.rename(columns={"title": "texte"}, inplace=True)

# In[ ]:
del ig_stories_df['creation_timestamp']
#%% In[ ]:
del ig_stories_df['media_metadata']
del ig_stories_df['cross_post_source']
del ig_stories_df['ai_stickers']
del ig_stories_df['dubbing_info']

# In[ ]:
#%% In[ ]:
ig_stories_df.fillna(value="", inplace=True)

# In[ ]:
ig_stories_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
#%% In[ ]:
ig_stories_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)

# In[ ]:
#%% In[ ]:
# Filter empty texte
ig_stories_df = ig_stories_df[~ig_stories_df['texte'].str.strip('\n').str.strip().eq('')]

# In[ ]:
#%% In[ ]:
documents_to_database(ig_stories_df)

@@ -1,39 +1,48 @@
import datetime

import pandas as pd
import json
import os
from pathlib import Path

from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta

# In[ ]:
instagram_data_path = 'data/Instagram/comments/post_comments_1.json'
#%% In[ ]:
# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'comments', 'post_comments_1.json')

with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
    post_comments_1 = json.loads(convert_encoding_meta(posts.read()))

# In[ ]:
#%% In[ ]:
ig_comments = []
for comment in post_comments_1:
    ig_comments.append({"texte": comment['string_map_data']['Comment']['value'],
                        'datepublication': datetime.datetime.fromtimestamp(
                            timestamp=comment['string_map_data']['Time']['timestamp']).isoformat(),
                        'creation_timestamp': comment['string_map_data']['Time']['timestamp'],
                        "chemin": instagram_data_path,
                        "index": "rs_instagram_comments",
                        "type": "comments",
                        "network": "Instagram"})

# In[ ]:
#%% In[ ]:
ig_comments_df = pd.DataFrame(ig_comments)

# In[ ]:
#%% In[ ]:
ig_comments_df.fillna(value="", inplace=True)

# In[ ]:
ig_comments_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
#%% In[ ]:
ig_comments_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)

# In[ ]:
#%% In[ ]:
# Filter empty texte
ig_comments_df = ig_comments_df[~ig_comments_df['texte'].str.strip('\n').str.strip().eq('')]

# In[ ]:
#%% In[ ]:
documents_to_database(ig_comments_df)

@@ -1,40 +1,48 @@
import datetime

import pandas as pd
import json
import os
from pathlib import Path

from utils.get_ids import get_idtypedocument, get_idreseausocial
from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta

# In[ ]:
instagram_data_path = 'data/Instagram/comments/reels_comments.json'
#%% In[ ]:
# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'comments', 'reels_comments.json')

with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
    reels_comments = json.loads(convert_encoding_meta(posts.read()))

# In[ ]:
#%% In[ ]:
ig_comments = []
for comment in reels_comments['comments_reels_comments']:
    ig_comments.append({"texte": comment['string_map_data']['Comment']['value'],
                        'datepublication': datetime.datetime.fromtimestamp(
                            timestamp=comment['string_map_data']['Time']['timestamp']).isoformat(),
                        'creation_timestamp': comment['string_map_data']['Time']['timestamp'],
                        "chemin": instagram_data_path,
                        "index": "rs_instagram_comments",
                        "type": "comments",
                        "network": "Instagram"})

# In[ ]:
#%% In[ ]:
ig_comments_df = pd.DataFrame(ig_comments)

# In[ ]:
#%% In[ ]:
ig_comments_df.fillna(value="", inplace=True)

# In[ ]:
ig_comments_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
#%% In[ ]:
ig_comments_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)

# In[ ]:
#%% In[ ]:
# Filter empty texte
ig_comments_df = ig_comments_df[~ig_comments_df['texte'].str.strip('\n').str.strip().eq('')]

# In[ ]:
#%% In[ ]:
documents_to_database(ig_comments_df)

@@ -1,40 +1,49 @@
import datetime

import pandas as pd
import json
import os
from pathlib import Path

from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta

# In[ ]:
instagram_data_path = 'data/Instagram/threads/threads_and_replies.json'
#%% In[ ]:
# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'threads', 'threads_and_replies.json')

with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
    post_comments_1 = json.loads(convert_encoding_meta(posts.read()))

# In[ ]:
#%% In[ ]:
threads_comments = []
for post in post_comments_1['text_post_app_text_posts']:
    for element in post['media']:
        threads_comments.append({"texte": element['title'],
                                 'datepublication': datetime.datetime.fromtimestamp(
                                     timestamp=element['creation_timestamp']).isoformat(),
                                 'creation_timestamp': element['creation_timestamp'],
                                 "chemin": instagram_data_path,
                                 "index": "rs_instagram_threads",
                                 "type": "posts",
                                 "network": "Instagram"})

# In[ ]:
#%% In[ ]:
ig_comments_df = pd.DataFrame(threads_comments)

# In[ ]:
#%% In[ ]:
ig_comments_df.fillna(value="", inplace=True)

# In[ ]:
ig_comments_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
#%% In[ ]:
ig_comments_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)

# In[ ]:
#%% In[ ]:
# Filter empty texte
ig_comments_df = ig_comments_df[~ig_comments_df['texte'].str.strip('\n').str.strip().eq('')]

# In[ ]:
#%% In[ ]:
documents_to_database(ig_comments_df)

@@ -1,44 +1,58 @@
import pandas as pd
import datetime

import os
from pathlib import Path

from utils.documents_to_database import documents_to_database

# In[ ]:
linkedin_data_path = "data/LinkedIn/shares/Shares.csv"
#%% In[ ]:
# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
linkedin_data_path = os.path.join(project_root, 'import_data', 'data', 'LinkedIn', 'shares', 'Shares.csv')

raw_shares = pd.read_csv(linkedin_data_path)

# In[ ]:
#%% In[ ]:
raw_shares['index'] = "rs_linkedin_shares"
raw_shares['type'] = "posts"
raw_shares['network'] = "LinkedIn"
raw_shares['chemin'] = linkedin_data_path

# In[ ]:
raw_shares["datepublication"] = raw_shares["Date"].apply(
    lambda x: str(datetime.datetime.fromisoformat(x).isoformat()))
#%% In[ ]:
raw_shares["creation_timestamp"] = raw_shares["Date"].apply(
    lambda x: int(datetime.datetime.fromisoformat(x).timestamp())
)
del raw_shares["Date"]

# In[ ]:
#%% In[ ]:
raw_shares.rename(columns={"ShareLink": "uri", "ShareCommentary": "texte"}, inplace=True)

# In[ ]:
#%% In[ ]:
raw_shares["texte"] = raw_shares["texte"].apply(lambda x: str(x))

# In[ ]:
#%% In[ ]:
del raw_shares["SharedUrl"]
del raw_shares["MediaUrl"]
del raw_shares["Visibility"]

# In[ ]:
#%% In[ ]:
raw_shares.fillna(value="", inplace=True)

# In[ ]:
raw_shares.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
#%% In[ ]:
raw_shares.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)

# In[ ]:
#%% In[ ]:
# Filter empty texte
raw_shares = raw_shares[~raw_shares['texte'].str.strip('\n').str.strip().eq('')]

# In[ ]:
#%% In[ ]:
documents_to_database(raw_shares)

@@ -1,19 +0,0 @@
linkedin_data_path <- "import_data/data/LinkedIn/comments/Comments.csv"
library("readr")
library("magrittr")
library("dplyr")

# Read CSV file
ddd <- readr::read_delim(linkedin_data_path,
                         escape_backslash = TRUE,
                         trim_ws = TRUE,
                         skip_empty_rows = FALSE,
                         delim = ",")

# Remove carriage returns
ddd %>%
  mutate(MessageFix = Message %>% stringr::str_replace_all(pattern = "[\r\n\t]+", replacement = " ")) %>%
  select(-Message) -> ddd2

# Save the cleaned data to a new CSV file
ddd2 %>% write.csv("import_data/data/LinkedIn/comments/Comments-FIX.csv", row.names = FALSE)

@@ -1,41 +1,58 @@
import pandas as pd
import datetime

import os
from pathlib import Path

from utils.documents_to_database import documents_to_database

# In[ ]:
linkedin_data_path = "data/LinkedIn/comments/Comments-FIX.csv"
#%% In[ ]:
# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

# In[ ]:
raw_comments_csv = pd.read_csv(linkedin_data_path, encoding='utf-8')
project_root = script_dir
linkedin_data_path = os.path.join(project_root, 'import_data', 'data', 'LinkedIn', 'comments', 'Comments.csv')

#%% In[ ]:
raw_comments_csv = pd.read_csv(linkedin_data_path,
                               escapechar='\\',
                               skipinitialspace=True)
raw_comments_csv['MessageFix'] = raw_comments_csv['Message'].str.replace(r'[\r\n\t]+', ' ', regex=True)
raw_comments_csv = raw_comments_csv.drop(columns=['Message'])
raw_comments = raw_comments_csv[(raw_comments_csv['MessageFix'] != "")].drop_duplicates()

# In[ ]:
#%% In[ ]:
raw_comments['index'] = "rs_linkedin_comments"
raw_comments['type'] = "comments"
raw_comments['network'] = "LinkedIn"
raw_comments['chemin'] = linkedin_data_path

# In[ ]:
raw_comments["datepublication"] = raw_comments["Date"].apply(
    lambda x: str(datetime.datetime.fromisoformat(str(x)).isoformat()))
#%% In[ ]:
raw_comments["creation_timestamp"] = raw_comments["Date"].apply(
    lambda x: int(datetime.datetime.fromisoformat(x).timestamp())
)
del raw_comments["Date"]

# In[ ]:
#%% In[ ]:
raw_comments.rename(columns={"Link": "uri", "MessageFix": "texte"}, inplace=True)

# In[ ]:
#%% In[ ]:
raw_comments["chemin"] = linkedin_data_path

# In[ ]:
#%% In[ ]:
raw_comments.fillna(value="", inplace=True)

# In[ ]:
raw_comments.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
#%% In[ ]:
raw_comments.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)

# In[ ]:
#%% In[ ]:
# Filter empty texte
raw_comments = raw_comments[~raw_comments['texte'].str.strip('\n').str.strip().eq('')]

# In[ ]:
#%% In[ ]:
documents_to_database(raw_comments)

@@ -5,29 +5,42 @@ import xmltodict
import pandas as pd
import markdownify

import os
from pathlib import Path

from utils.documents_to_database import documents_to_database

# In[ ]:
wordpress_xml_path = "data/Wordpress/jevalideca/wordpress.xml"
#%% In[ ]:
# Get the current file's directory
try:
    # This will work when running as a script
    script_dir = Path(__file__).parent.parent
except NameError:
    # This will work in interactive environments
    script_dir = Path().absolute()

project_root = script_dir
wordpress_xml_path = os.path.join(project_root, 'import_data', 'data', 'Wordpress', 'jevalideca', 'wordpress.xml')

with open(wordpress_xml_path, "r") as xml_file:
    wordpress_xml = xml_file.read()

# In[ ]:
#%% In[ ]:
wordpress_dict = xmltodict.parse(wordpress_xml)

# In[ ]:
#%% In[ ]:
items_df = pd.DataFrame(wordpress_dict['rss']['channel']['item'])

# In[ ]:
#%% In[ ]:
items_df_filter = items_df[
    (items_df['wp:post_type'].isin(['page', 'post'])) & (items_df['wp:status'] == 'publish')].copy()

# In[ ]:
items_df_filter['datepublication'] = items_df_filter['wp:post_date'].apply(
    lambda x: str(datetime.datetime.fromisoformat(x).isoformat()))
#%% In[ ]:
items_df_filter['creation_timestamp'] = items_df_filter['wp:post_date'].apply(
    lambda x: int(datetime.datetime.fromisoformat(x).timestamp()))


# In[ ]:
#%% In[ ]:
def wp_to_markdown(x):
    try:
        md_text = re.sub(r'\n+', ' ', markdownify.markdownify(x, heading_style='ATX')).strip()

@@ -38,25 +51,25 @@ def wp_to_markdown(x):
    return md_text


# In[ ]:
#%% In[ ]:
items_df_filter['texte'] = items_df_filter['content:encoded'].apply(lambda x: wp_to_markdown(x))

# In[ ]:
#%% In[ ]:
items_df_filter.rename(columns={"link": "uri", "wp:post_type": "type"}, inplace=True)

# In[ ]:
#%% In[ ]:
items_df_filter['index'] = "rs_wordpress_jevalideca"
items_df_filter['network'] = "Wordpress"
items_df_filter['chemin'] = wordpress_xml_path

# In[ ]:
#%% In[ ]:
items_df_filter.fillna(value="", inplace=True)

# In[ ]:
#%% In[ ]:
documents_to_database(items_df_filter[['title',
                                       'uri',
                                       'type',
                                       'datepublication',
                                       'creation_timestamp',
                                       'texte',
                                       'index',
                                       'network',

@@ -4,3 +4,4 @@ requests==2.31.0
xmltodict==0.13.0
python_dotenv==1.0.1
pyarrow==17.0.0
typesense==0.21.0

import_data/run_all_imports.sh (new file, 17 lines)
@@ -0,0 +1,17 @@
#!/bin/bash

# Navigate to the directory containing the scripts
cd "$(dirname "$0")" || exit

# Find and execute all Python scripts matching the pattern
for script in [0-9][0-9]_importation_*.py
do
    if [ -f "$script" ]; then
        echo "Running $script..."
        python3 "$script"
        echo "Finished $script"
        echo "--------------------"
    fi
done

echo "All importation scripts have been executed."

@@ -1,20 +1,11 @@
import pandas as pd
import requests
import tqdm

from utils.opensearch import opensearch_client
from .typesense_client import client


def documents_to_database(documents_list, os_client=opensearch_client):
    # Check if opensearch is available
    if not os_client.ping():
        raise requests.exceptions.ConnectionError("Opensearch is not reachable")
    # Check if the specified index exists
    if not os_client.indices.exists(index=documents_list['index'].iloc[0]):
        raise requests.exceptions.HTTPError(f"Index '{documents_list['index'].iloc[0]}' does not exist")
    # Insert each document into opensearch index(es)
    for document in documents_list.to_dict(orient='records'):
        index_name = document.pop('index', None)
        if not index_name:
            raise ValueError("Document must have an 'index' field")
        os_client.index(index=index_name,
                        body=document)
def documents_to_database(documents_list, os_client=client):
    try:
        for document in tqdm.tqdm(documents_list.to_dict(orient='records')):
            os_client.collections['social_media_posts'].documents.create(document)
        print(f"Successfully inserted {len(documents_list)} documents.")
    except Exception as e:
        print(f"Error inserting documents: {str(e)}")

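A hypothetical usage sketch for the new Typesense-backed helper (the row below is invented, but its columns follow the `social_media_posts` schema defined earlier in this commit):

```python
# Invented example row; the columns follow the 'social_media_posts' schema
# (network, type, index, chemin, texte, creation_timestamp).
import pandas as pd

from utils.documents_to_database import documents_to_database

example_df = pd.DataFrame([{
    "network": "Instagram",
    "type": "posts",
    "index": "rs_instagram_content",
    "chemin": "data/Instagram/content/posts_1.json",
    "texte": "Example post text",
    "creation_timestamp": 1700000000,
}])

documents_to_database(example_df)  # inserts each row into the Typesense collection
```
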
@@ -1,22 +0,0 @@
import os
import dotenv

# Load environment variables from .env file
dotenv.load_dotenv()

# Connect to OpenSearch using the provided credentials and hostname/port.
from opensearchpy import OpenSearch

host = 'localhost'
port = 9200
auth = ('admin', os.getenv("OPENSEARCH_INITIAL_ADMIN_PASSWORD"))  # For testing only. Don't store credentials in code.
# Create the client with SSL/TLS enabled, but hostname verification disabled.
opensearch_client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_compress=True,  # enables gzip compression for request bodies
    http_auth=auth,
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False
)

@@ -11,4 +11,7 @@ reseau_social_data = [{"nom": "LinkedIn",
                      {"nom": "Facebook",
                       "repertoires": ["comments_and_reactions", "posts"]},
                      {"nom": "FacebookBusiness",
                       "repertoires": ["posts"]}]
                       "repertoires": ["posts"]},
                      {"nom": "Podcast",
                       "repertoires": ["shownotes", "audio"]}
                      ]

import_data/utils/typesense_client.py (new file, 15 lines)
@@ -0,0 +1,15 @@
import typesense
import os
from dotenv import load_dotenv

load_dotenv()

client = typesense.Client({
    'nodes': [{
        'host': 'localhost',
        'port': '8108',
        'protocol': 'http'
    }],
    'api_key': os.getenv('TYPESENSE_API_KEY'),
    'connection_timeout_seconds': 2
})
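A quick, illustrative way to verify that this client can reach the Typesense node from docker-compose.yml (assuming the container is running and TYPESENSE_API_KEY is set in .env) is to list the collections:

```python
# Illustrative connectivity check: list the collections on the local Typesense node.
# Assumes the 'typesense' container from docker-compose.yml is up and the API key is loaded.
from utils.typesense_client import client

for collection in client.collections.retrieve():
    print(collection['name'], collection.get('num_documents'))
```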