Import documents into Typesense

parent f4acc32451
commit 7a74dbf413
24 changed files with 390 additions and 332 deletions
@@ -1 +1 @@
-OPENSEARCH_INITIAL_ADMIN_PASSWORD=
+TYPESENSE_API_KEY=
@@ -71,9 +71,7 @@ You can get a backup of your social media data. I've put
 ![ou-mettre-fichiers-reseaux.png](images/ou-mettre-fichiers-reseaux.png)
 
 - Run the file that creates the indexes in the search engine: [00_creer_reseauxsociaux.py](import_data/00_creer_reseauxsociaux.py)
 
 - Then run all the relevant Python files to load the data you downloaded.
-- For LinkedIn comments, run the R program [32_importation_linkedin_comments.R](import_data/32_importation_linkedin_comments.R) before the Python program [32_importation_linkedin_comments.py](import_data/32_importation_linkedin_comments.py)
-
 
 Then you should be able to see all the data you loaded in OpenSearch Dashboards at http://localhost:5601. Go to Discover / Search and find insights.
@@ -1,68 +1,21 @@
 ---
 version: '3'
 services:
-  opensearch-node1:
-    image: opensearchproject/opensearch:latest
-    container_name: opensearch-node1
+  typesense:
+    image: typesense/typesense:27.1
+    container_name: typesense
     environment:
-      - cluster.name=opensearch-cluster
-      - node.name=opensearch-node1
-      - discovery.seed_hosts=opensearch-node1,opensearch-node2
-      - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2
-      - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping
-      - OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m # minimum and maximum Java heap size, recommend setting both to 50% of system RAM
-      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD} # Sets the demo admin user password when using demo configuration, required for OpenSearch 2.12 and higher
-    ulimits:
-      memlock:
-        soft: -1
-        hard: -1
-      nofile:
-        soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems
-        hard: 65536
+      - TYPESENSE_API_KEY=${TYPESENSE_API_KEY}
+      - TYPESENSE_DATA_DIR=/data
     volumes:
-      - opensearch-data1:/usr/share/opensearch/data
+      - typesense-data:/data
    ports:
-      - 9200:9200
-      - 9600:9600 # required for Performance Analyzer
+      - "8108:8108"
     networks:
-      - opensearch-net
-  opensearch-node2:
-    image: opensearchproject/opensearch:latest
-    container_name: opensearch-node2
-    environment:
-      - cluster.name=opensearch-cluster
-      - node.name=opensearch-node2
-      - discovery.seed_hosts=opensearch-node1,opensearch-node2
-      - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2
-      - bootstrap.memory_lock=true
-      - OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m
-      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD}
-    ulimits:
-      memlock:
-        soft: -1
-        hard: -1
-      nofile:
-        soft: 65536
-        hard: 65536
-    volumes:
-      - opensearch-data2:/usr/share/opensearch/data
-    networks:
-      - opensearch-net
-  opensearch-dashboards:
-    image: opensearchproject/opensearch-dashboards:latest
-    container_name: opensearch-dashboards
-    ports:
-      - 5601:5601
-    expose:
-      - '5601'
-    environment:
-      OPENSEARCH_HOSTS: '["https://opensearch-node1:9200","https://opensearch-node2:9200"]'
-    networks:
-      - opensearch-net
+      - typesense-net
 
 volumes:
-  opensearch-data1:
-  opensearch-data2:
+  typesense-data:
 
 networks:
-  opensearch-net:
+  typesense-net:
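The compose file above replaces the two-node OpenSearch cluster (plus Dashboards) with a single Typesense node. A quick way to confirm the container is reachable before running the import scripts is to query its health endpoint; a minimal sketch, assuming the service is running locally on the mapped port 8108:

```python
import requests

# Typesense answers on the API port mapped in docker-compose (8108).
# The /health endpoint needs no API key and returns {"ok": true} when ready.
response = requests.get("http://localhost:8108/health", timeout=5)
print(response.json())
```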
@@ -1,18 +1,40 @@
-import requests
-import utils.config
-from utils.opensearch import opensearch_client
-from utils.reseau_social_data import reseau_social_data as rs_data
+from typesense.exceptions import TypesenseClientError, ObjectAlreadyExists
 
-# %%
-rs_data
-
-# %%
-opensearch_client.info()
-
-# %%
-for rs in rs_data:
-    nom = rs.get("nom")
-    for repertoire in rs.get("repertoires", []):
-        index_name = f"rs_{nom}_{repertoire}".lower()
-        opensearch_client.indices.create(index=index_name)
-        print(f"Index '{index_name}' created")
+from utils.typesense_client import client
+
+# Create a collection
+try:
+    client.collections.create({
+        'name': 'social_media_posts',
+        'fields': [
+            {'name': 'id', 'type': 'string'},
+            {'name': 'network', 'type': 'string', 'facet': True},
+            {'name': 'type', 'type': 'string', 'facet': True},
+            {'name': 'index', 'type': 'string', 'facet': True},
+            {'name': 'chemin', 'type': 'string'},
+            {'name': 'texte', 'type': 'string'},
+            {'name': 'creation_timestamp', 'type': 'int64'},
+            {
+                "name": "embedding",
+                "type": "float[]",
+                "embed": {
+                    "from": [
+                        "texte"
+                    ],
+                    "model_config": {
+                        "model_name": "ts/multilingual-e5-small"
+                    }
+                }
+            }
+        ],
+        'default_sorting_field': 'creation_timestamp'
+    })
+    print("Collection 'social_media_posts' created successfully.")
+except TypesenseClientError as e:
+    if isinstance(e, ObjectAlreadyExists):
+        print("Collection 'social_media_posts' already exists. Skipping creation.")
+    else:
+        print(f"Error creating collection: {str(e)}")
+        raise
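The new schema is the heart of the migration: one `social_media_posts` collection replaces the per-network OpenSearch indexes, with the old index name demoted to a faceted `index` field, and an auto-embedded `embedding` field generated from `texte` by the built-in `ts/multilingual-e5-small` model. That embedding makes semantic or hybrid queries possible without a separate pipeline. A hedged sketch of such a query (the parameters follow Typesense's documented hybrid-search convention; the query string and filter are illustrative, not part of this commit):

```python
from utils.typesense_client import client

# Listing both a text field and the embedding field in query_by asks
# Typesense to run keyword and vector search and fuse the results.
results = client.collections['social_media_posts'].documents.search({
    'q': 'protection des données',        # illustrative query
    'query_by': 'texte,embedding',
    'filter_by': 'network:=LinkedIn',     # faceted field from the schema
    'per_page': 5,
})
for hit in results['hits']:
    print(hit['document']['texte'][:80])
```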
import_data/00_delete_collection.py (new file, 3 lines)
@@ -0,0 +1,3 @@
+# Use only when needed: deletes the 'social_media_posts' collection in Typesense
+from utils.typesense_client import client
+client.collections['social_media_posts'].delete()
@@ -1,21 +1,34 @@
-import datetime
-
 import pandas as pd
 import json
+import os
+from pathlib import Path
 
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-fb_data_path = ['data/FacebookBusiness/posts/profile_posts_1.json',
-                'data/FacebookBusiness/posts/uncategorized_photos.json',
-                'data/FacebookBusiness/posts/videos.json']
-
-with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
-    posts_json = json.loads(convert_encoding_meta(posts.read()))
-
-# In[ ]:
+#%% In[ ]:
+
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+fb_data_path = [os.path.join(project_root, 'import_data', 'data', 'FacebookBusiness', 'posts', 'profile_posts_1.json')]
+
+try:
+    with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
+        posts_json = json.loads(convert_encoding_meta(posts.read()))
+except Exception as e:
+    print(f"Error reading JSON file: {e}")
+    exit(1)
+
+#%% In[ ]:
 posts_medias = []
 for post in posts_json:
     # data
@@ -39,21 +52,14 @@ for post in posts_json:
                             "texte": texte,
                             "creation_timestamp": media["creation_timestamp"]})
 
-# In[ ]:
+#%% In[ ]:
 posts_medias_df = pd.DataFrame(posts_medias)
 
-# In[ ]:
-posts_medias_df['datepublication'] = posts_medias_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
-
-# In[ ]:
-del posts_medias_df['creation_timestamp']
-
-# In[ ]:
+#%% In[ ]:
 posts_medias_df.fillna(value="", inplace=True)
 
-# In[ ]:
-posts_medias_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+posts_medias_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(posts_medias_df)
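This importer (and the ones that follow) reads Meta's JSON exports with `encoding="raw-unicode-escape"` and passes the text through `convert_encoding_meta` before `json.loads`. The reason is that Facebook escapes each UTF-8 byte separately, so accented characters arrive garbled. The helper's body isn't shown in this diff; the sketch below reproduces the general repair idea on a made-up sample, not the project's exact implementation:

```python
import json

# Facebook writes "é" as the two escaped UTF-8 bytes \u00c3 \u00a9.
garbled = r'{"texte": "r\u00c3\u00a9seau"}'

# Turn the escapes back into single bytes, then decode those bytes as UTF-8.
fixed = (garbled.encode("latin-1")
                .decode("unicode-escape")
                .encode("latin-1")
                .decode("utf-8"))
print(json.loads(fixed)["texte"])  # -> réseau
```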
@@ -1,18 +1,28 @@
-import datetime
-
 import pandas as pd
 import json
+import os
+from pathlib import Path
 
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-fb_data_path = ['data/Facebook/comments_and_reactions/comments.json']
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+fb_data_path = [os.path.join(project_root, 'import_data', 'data', 'Facebook', 'comments_and_reactions', 'comments.json')]
+
 with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
     comments_json = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 facebook_comments = []
 for comment in comments_json['comments_v2']:
     if comment.get('data'):
@@ -26,18 +36,8 @@ for comment in comments_json['comments_v2']:
                                   "texte": comment["comment"],
                                   "creation_timestamp": comment["timestamp"]})
 
-# In[ ]:
+#%% In[ ]:
 facebook_comments_df = pd.DataFrame(facebook_comments)
 
-# In[ ]:
-facebook_comments_df['datepublication'] = facebook_comments_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
-
-# In[ ]:
-facebook_comments_df.fillna(value="", inplace=True)
-
-# In[ ]:
-del facebook_comments_df['creation_timestamp']
-
-# In[ ]:
+#%% In[ ]:
 documents_to_database(facebook_comments_df)
@@ -1,44 +1,51 @@
-import datetime
-
 import pandas as pd
 import json
+import os
+from pathlib import Path
 
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-fb_data_path = ['data/Facebook/posts/your_uncategorized_photos.json']
-with open(fb_data_path[0], "r", encoding="raw-unicode-escape") as posts:
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+fb_data_path = os.path.join(project_root, 'import_data', 'data', 'Facebook', 'posts', 'your_uncategorized_photos.json')
+
+with open(fb_data_path, "r", encoding="raw-unicode-escape") as posts:
     photos_json = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 facebook_photos = photos_json['other_photos_v2']
 
-# In[ ]:
+#%% In[ ]:
 facebook_photos_df = pd.DataFrame(facebook_photos)
 
-# In[ ]:
+#%% In[ ]:
 # Filter out posts without a description
 facebook_photos_df = facebook_photos_df[~facebook_photos_df['description'].isnull()]
 
-# In[ ]:
-facebook_photos_df['datepublication'] = facebook_photos_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
+#%% In[ ]:
 facebook_photos_df['index'] = "rs_facebook_posts"
 facebook_photos_df['network'] = "Facebook"
 facebook_photos_df['type'] = "posts"
-facebook_photos_df['chemin'] = fb_data_path[0]
+facebook_photos_df['chemin'] = fb_data_path
 
-# In[ ]:
+#%% In[ ]:
 facebook_photos_df.rename(columns={"description": "texte"}, inplace=True)
 
-# In[ ]:
-del facebook_photos_df['creation_timestamp']
+#%% In[ ]:
 del facebook_photos_df['media_metadata']
 
-# In[ ]:
+#%% In[ ]:
 facebook_photos_df.fillna(value="", inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(facebook_photos_df)
@@ -1,17 +1,28 @@
-import datetime
-
 import pandas as pd
 import json
+import os
+from pathlib import Path
 
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/content/posts_1.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'content', 'posts_1.json')
+
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     posts_json = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 posts_medias = []
 for post in posts_json:
     medias = post['media']
@@ -45,25 +56,18 @@ for post in posts_json:
                             "texte": title,
                             "creation_timestamp": creation_timestamp})
 
-# In[ ]:
+#%% In[ ]:
 posts_medias_df = pd.DataFrame(posts_medias)
 
-# In[ ]:
-posts_medias_df['datepublication'] = posts_medias_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
-
-# In[ ]:
-del posts_medias_df['creation_timestamp']
-
-# In[ ]:
+#%% In[ ]:
 posts_medias_df.fillna(value="", inplace=True)
 
-# In[ ]:
-posts_medias_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+posts_medias_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 posts_medias_df = posts_medias_df[~posts_medias_df['texte'].str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(posts_medias_df)
@@ -1,49 +1,55 @@
-import datetime
-
 import pandas as pd
 import json
+import os
+from pathlib import Path
 
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/content/reels.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'content', 'reels.json')
+
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     reels_json = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 ig_reels_media = [x['media'][0] for x in reels_json['ig_reels_media']]
 
-# In[ ]:
+#%% In[ ]:
 ig_reels_df = pd.DataFrame(ig_reels_media)
 
-# In[ ]:
-ig_reels_df['datepublication'] = ig_reels_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
+#%% In[ ]:
 ig_reels_df['index'] = "rs_instagram_content"
 ig_reels_df['type'] = "reels"
 ig_reels_df['network'] = "Instagram"
 ig_reels_df['chemin'] = instagram_data_path
 
-# In[ ]:
+#%% In[ ]:
 ig_reels_df.rename(columns={"title": "texte"}, inplace=True)
 
-# In[ ]:
-del ig_reels_df['creation_timestamp']
+#%% In[ ]:
 del ig_reels_df['media_metadata']
 del ig_reels_df['cross_post_source']
 del ig_reels_df['dubbing_info']
 
-# In[ ]:
+#%% In[ ]:
 ig_reels_df.fillna(value="", inplace=True)
 
-# In[ ]:
-ig_reels_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+ig_reels_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 ig_reels_df = ig_reels_df[~ig_reels_df['texte'].str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(ig_reels_df)
@@ -1,49 +1,52 @@
-import datetime
-
 import pandas as pd
 import json
+import os
+from pathlib import Path
 
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/content/stories.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'content', 'stories.json')
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     stories_json = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 ig_stories_df = pd.DataFrame(stories_json['ig_stories'])
 
-# In[ ]:
-ig_stories_df['datepublication'] = ig_stories_df['creation_timestamp'].apply(
-    lambda x: datetime.datetime.fromtimestamp(x).isoformat())
-
-# In[ ]:
+#%% In[ ]:
 ig_stories_df['index'] = "rs_instagram_content"
 ig_stories_df['type'] = "stories"
 ig_stories_df['network'] = "Instagram"
 ig_stories_df['chemin'] = instagram_data_path
 
-# In[ ]:
+#%% In[ ]:
 ig_stories_df.rename(columns={"title": "texte"}, inplace=True)
 
-# In[ ]:
-del ig_stories_df['creation_timestamp']
+#%% In[ ]:
 del ig_stories_df['media_metadata']
 del ig_stories_df['cross_post_source']
 del ig_stories_df['ai_stickers']
 del ig_stories_df['dubbing_info']
 
-# In[ ]:
+#%% In[ ]:
 ig_stories_df.fillna(value="", inplace=True)
 
-# In[ ]:
-ig_stories_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+ig_stories_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 ig_stories_df = ig_stories_df[~ig_stories_df['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(ig_stories_df)
@@ -1,39 +1,48 @@
-import datetime
-
 import pandas as pd
 import json
+import os
+from pathlib import Path
 
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/comments/post_comments_1.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'comments', 'post_comments_1.json')
+
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     post_comments_1 = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 ig_comments = []
 for comment in post_comments_1:
     ig_comments.append({"texte": comment['string_map_data']['Comment']['value'],
-                        'datepublication': datetime.datetime.fromtimestamp(
-                            timestamp=comment['string_map_data']['Time']['timestamp']).isoformat(),
+                        'creation_timestamp': comment['string_map_data']['Time']['timestamp'],
                        "chemin": instagram_data_path,
                        "index": "rs_instagram_comments",
                        "type": "comments",
                        "network": "Instagram"})
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df = pd.DataFrame(ig_comments)
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df.fillna(value="", inplace=True)
 
-# In[ ]:
-ig_comments_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+ig_comments_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 ig_comments_df = ig_comments_df[~ig_comments_df['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(ig_comments_df)
@@ -1,40 +1,48 @@
-import datetime
-
 import pandas as pd
 import json
+import os
+from pathlib import Path
 
-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/comments/reels_comments.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'comments', 'reels_comments.json')
+
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     reels_comments = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 ig_comments = []
 for comment in reels_comments['comments_reels_comments']:
     ig_comments.append({"texte": comment['string_map_data']['Comment']['value'],
-                        'datepublication': datetime.datetime.fromtimestamp(
-                            timestamp=comment['string_map_data']['Time']['timestamp']).isoformat(),
+                        'creation_timestamp': comment['string_map_data']['Time']['timestamp'],
                        "chemin": instagram_data_path,
                        "index": "rs_instagram_comments",
                        "type": "comments",
                        "network": "Instagram"})
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df = pd.DataFrame(ig_comments)
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df.fillna(value="", inplace=True)
 
-# In[ ]:
-ig_comments_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+ig_comments_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 ig_comments_df = ig_comments_df[~ig_comments_df['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(ig_comments_df)
@@ -1,40 +1,49 @@
-import datetime
-
 import pandas as pd
 import json
+import os
+from pathlib import Path
 
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta
 
-# In[ ]:
-instagram_data_path = 'data/Instagram/threads/threads_and_replies.json'
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'threads', 'threads_and_replies.json')
+
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     post_comments_1 = json.loads(convert_encoding_meta(posts.read()))
 
-# In[ ]:
+#%% In[ ]:
 threads_comments = []
 for post in post_comments_1['text_post_app_text_posts']:
     for element in post['media']:
         threads_comments.append({"texte": element['title'],
-                                 'datepublication': datetime.datetime.fromtimestamp(
-                                     timestamp=element['creation_timestamp']).isoformat(),
+                                 'creation_timestamp': element['creation_timestamp'],
                                 "chemin": instagram_data_path,
                                 "index": "rs_instagram_threads",
                                 "type": "posts",
                                 "network": "Instagram"})
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df = pd.DataFrame(threads_comments)
 
-# In[ ]:
+#%% In[ ]:
 ig_comments_df.fillna(value="", inplace=True)
 
-# In[ ]:
-ig_comments_df.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+ig_comments_df.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 ig_comments_df = ig_comments_df[~ig_comments_df['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(ig_comments_df)
@@ -1,44 +1,58 @@
 import pandas as pd
 import datetime
 
+import os
+from pathlib import Path
+
 from utils.documents_to_database import documents_to_database
 
-# In[ ]:
-linkedin_data_path = "data/LinkedIn/shares/Shares.csv"
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+linkedin_data_path = os.path.join(project_root, 'import_data', 'data', 'LinkedIn', 'shares', 'Shares.csv')
+
 raw_shares = pd.read_csv(linkedin_data_path)
 
-# In[ ]:
+#%% In[ ]:
 raw_shares['index'] = "rs_linkedin_shares"
 raw_shares['type'] = "posts"
 raw_shares['network'] = "LinkedIn"
 raw_shares['chemin'] = linkedin_data_path
 
-# In[ ]:
-raw_shares["datepublication"] = raw_shares["Date"].apply(
-    lambda x: str(datetime.datetime.fromisoformat(x).isoformat()))
+#%% In[ ]:
+raw_shares["creation_timestamp"] = raw_shares["Date"].apply(
+    lambda x: int(datetime.datetime.fromisoformat(x).timestamp())
+)
 del raw_shares["Date"]
 
-# In[ ]:
+#%% In[ ]:
 raw_shares.rename(columns={"ShareLink": "uri", "ShareCommentary": "texte"}, inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 raw_shares["texte"] = raw_shares["texte"].apply(lambda x: str(x))
 
-# In[ ]:
+#%% In[ ]:
 del raw_shares["SharedUrl"]
 del raw_shares["MediaUrl"]
 del raw_shares["Visibility"]
 
-# In[ ]:
+#%% In[ ]:
 raw_shares.fillna(value="", inplace=True)
 
-# In[ ]:
-raw_shares.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+raw_shares.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 raw_shares = raw_shares[~raw_shares['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(raw_shares)
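Note the pattern in this hunk, repeated across the importers: the ISO-8601 `datepublication` string is dropped in favour of an integer epoch `creation_timestamp`, because the Typesense collection declares `creation_timestamp` as its `int64` `default_sorting_field`. The conversion itself is one line; a small sketch with a hypothetical sample value:

```python
import datetime

iso_date = "2024-05-01 10:30:00"  # hypothetical value in the CSV's date format
creation_timestamp = int(datetime.datetime.fromisoformat(iso_date).timestamp())
print(creation_timestamp)  # epoch seconds, ready for the int64 schema field
```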
@@ -1,19 +0,0 @@
-linkedin_data_path <- "import_data/data/LinkedIn/comments/Comments.csv"
-library("readr")
-library("magrittr")
-library("dplyr")
-
-# Read CSV file
-ddd <- readr::read_delim(linkedin_data_path,
-                         escape_backslash = TRUE,
-                         trim_ws = TRUE,
-                         skip_empty_rows = FALSE,
-                         delim = ",")
-
-# Remove carriage returns
-ddd %>%
-  mutate(MessageFix = Message %>% stringr::str_replace_all(pattern = "[\r\n\t]+", replacement = " ")) %>%
-  select(-Message) -> ddd2
-
-# Save the cleaned data to a new CSV file
-ddd2 %>% write.csv("import_data/data/LinkedIn/comments/Comments-FIX.csv", row.names = FALSE)
@@ -1,41 +1,58 @@
 import pandas as pd
 import datetime
 
+import os
+from pathlib import Path
+
 from utils.documents_to_database import documents_to_database
 
-# In[ ]:
-linkedin_data_path = "data/LinkedIn/comments/Comments-FIX.csv"
-
-# In[ ]:
-raw_comments_csv = pd.read_csv(linkedin_data_path, encoding='utf-8')
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+linkedin_data_path = os.path.join(project_root, 'import_data', 'data', 'LinkedIn', 'comments', 'Comments.csv')
+
+#%% In[ ]:
+raw_comments_csv = pd.read_csv(linkedin_data_path,
+                               escapechar='\\',
+                               skipinitialspace=True)
+raw_comments_csv['MessageFix'] = raw_comments_csv['Message'].str.replace(r'[\r\n\t]+', ' ', regex=True)
+raw_comments_csv = raw_comments_csv.drop(columns=['Message'])
 raw_comments = raw_comments_csv[(raw_comments_csv['MessageFix'] != "")].drop_duplicates()
 
-# In[ ]:
+#%% In[ ]:
 raw_comments['index'] = "rs_linkedin_comments"
 raw_comments['type'] = "comments"
 raw_comments['network'] = "LinkedIn"
 raw_comments['chemin'] = linkedin_data_path
 
-# In[ ]:
-raw_comments["datepublication"] = raw_comments["Date"].apply(
-    lambda x: str(datetime.datetime.fromisoformat(str(x)).isoformat()))
+#%% In[ ]:
+raw_comments["creation_timestamp"] = raw_comments["Date"].apply(
+    lambda x: int(datetime.datetime.fromisoformat(x).timestamp())
+)
 del raw_comments["Date"]
 
-# In[ ]:
+#%% In[ ]:
 raw_comments.rename(columns={"Link": "uri", "MessageFix": "texte"}, inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 raw_comments["chemin"] = linkedin_data_path
 
-# In[ ]:
+#%% In[ ]:
 raw_comments.fillna(value="", inplace=True)
 
-# In[ ]:
-raw_comments.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)
+#%% In[ ]:
+raw_comments.drop_duplicates(subset=['texte', 'creation_timestamp'], inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 # Filter empty texte
 raw_comments = raw_comments[~raw_comments['texte'].str.strip('\n').str.strip().eq('')]
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(raw_comments)
@@ -5,29 +5,42 @@ import xmltodict
 import pandas as pd
 import markdownify
 
+import os
+from pathlib import Path
+
 from utils.documents_to_database import documents_to_database
 
-# In[ ]:
-wordpress_xml_path = "data/Wordpress/jevalideca/wordpress.xml"
+#%% In[ ]:
+# Get the current file's directory
+try:
+    # This will work when running as a script
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    # This will work in interactive environments
+    script_dir = Path().absolute()
+
+project_root = script_dir
+wordpress_xml_path = os.path.join(project_root, 'import_data', 'data', 'Wordpress', 'jevalideca', 'wordpress.xml')
+
 with open(wordpress_xml_path, "r") as xml_file:
     wordpress_xml = xml_file.read()
 
-# In[ ]:
+#%% In[ ]:
 wordpress_dict = xmltodict.parse(wordpress_xml)
 
-# In[ ]:
+#%% In[ ]:
 items_df = pd.DataFrame(wordpress_dict['rss']['channel']['item'])
 
-# In[ ]:
+#%% In[ ]:
 items_df_filter = items_df[
     (items_df['wp:post_type'].isin(['page', 'post'])) & (items_df['wp:status'] == 'publish')].copy()
 
-# In[ ]:
-items_df_filter['datepublication'] = items_df_filter['wp:post_date'].apply(
-    lambda x: str(datetime.datetime.fromisoformat(x).isoformat()))
+#%% In[ ]:
+items_df_filter['creation_timestamp'] = items_df_filter['wp:post_date'].apply(
+    lambda x: int(datetime.datetime.fromisoformat(x).timestamp()))
 
-# In[ ]:
+#%% In[ ]:
 def wp_to_markdown(x):
     try:
         md_text = re.sub(r'\n+', ' ', markdownify.markdownify(x, heading_style='ATX')).strip()
@@ -38,25 +51,25 @@ def wp_to_markdown(x):
     return md_text
 
-# In[ ]:
+#%% In[ ]:
 items_df_filter['texte'] = items_df_filter['content:encoded'].apply(lambda x: wp_to_markdown(x))
 
-# In[ ]:
+#%% In[ ]:
 items_df_filter.rename(columns={"link": "uri", "wp:post_type": "type"}, inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 items_df_filter['index'] = "rs_wordpress_jevalideca"
 items_df_filter['network'] = "Wordpress"
 items_df_filter['chemin'] = wordpress_xml_path
 
-# In[ ]:
+#%% In[ ]:
 items_df_filter.fillna(value="", inplace=True)
 
-# In[ ]:
+#%% In[ ]:
 documents_to_database(items_df_filter[['title',
                                        'uri',
                                        'type',
-                                       'datepublication',
+                                       'creation_timestamp',
                                        'texte',
                                        'index',
                                        'network',
@@ -4,3 +4,4 @@ requests==2.31.0
 xmltodict==0.13.0
 python_dotenv==1.0.1
 pyarrow==17.0.0
+typesense==0.21.0
import_data/run_all_imports.sh (new file, 17 lines)
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Navigate to the directory containing the scripts
+cd "$(dirname "$0")" || exit
+
+# Find and execute all Python scripts matching the pattern
+for script in [0-9][0-9]_importation_*.py
+do
+    if [ -f "$script" ]; then
+        echo "Running $script..."
+        python3 "$script"
+        echo "Finished $script"
+        echo "--------------------"
+    fi
+done
+
+echo "All importation scripts have been executed."
@@ -1,20 +1,11 @@
-import pandas as pd
-import requests
+import tqdm
 
-from utils.opensearch import opensearch_client
+from .typesense_client import client
 
-def documents_to_database(documents_list, os_client=opensearch_client):
-    # Check if opensearch is available
-    if not os_client.ping():
-        raise requests.exceptions.ConnectionError("Opensearch is not reachable")
-    # Check if the specified index exists
-    if not os_client.indices.exists(index=documents_list['index'].iloc[0]):
-        raise requests.exceptions.HTTPError(f"Index '{documents_list['index'].iloc[0]}' does not exist")
-    # Insert each document into opensearch index(es)
-    for document in documents_list.to_dict(orient='records'):
-        index_name = document.pop('index', None)
-        if not index_name:
-            raise ValueError("Document must have an 'index' field")
-        os_client.index(index=index_name,
-                        body=document)
+def documents_to_database(documents_list, os_client=client):
+    try:
+        for document in tqdm.tqdm(documents_list.to_dict(orient='records')):
+            os_client.collections['social_media_posts'].documents.create(document)
+        print(f"Successfully inserted {len(documents_list)} documents.")
+    except Exception as e:
+        print(f"Error inserting documents: {str(e)}")
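After this rewrite, `documents_to_database` no longer routes documents to per-network indexes; every importer's DataFrame lands in the single `social_media_posts` collection, one `documents.create` call per row, with a tqdm progress bar. A hedged usage sketch with a hypothetical one-row frame shaped like the collection schema (Typesense assigns `id` and computes the auto-embedded `embedding` field itself):

```python
import pandas as pd
from utils.documents_to_database import documents_to_database

df = pd.DataFrame([{
    "network": "Facebook",                         # faceted in the schema
    "type": "posts",
    "index": "rs_facebook_posts",                  # now just a plain field
    "chemin": "data/Facebook/posts/example.json",  # hypothetical path
    "texte": "Hello world",
    "creation_timestamp": 1714500000,
}])
documents_to_database(df)
```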
@@ -1,22 +0,0 @@
-import os
-import dotenv
-
-# Load environment variables from .env file
-dotenv.load_dotenv()
-
-# Connect to OpenSearch using the provided credentials and hostname/port.
-from opensearchpy import OpenSearch
-
-host = 'localhost'
-port = 9200
-auth = ('admin', os.getenv("OPENSEARCH_INITIAL_ADMIN_PASSWORD"))  # For testing only. Don't store credentials in code.
-
-# Create the client with SSL/TLS enabled, but hostname verification disabled.
-opensearch_client = OpenSearch(
-    hosts=[{'host': host, 'port': port}],
-    http_compress=True,  # enables gzip compression for request bodies
-    http_auth=auth,
-    use_ssl=True,
-    verify_certs=False,
-    ssl_assert_hostname=False,
-    ssl_show_warn=False
-)
@@ -11,4 +11,7 @@ reseau_social_data = [{"nom": "LinkedIn",
                       {"nom": "Facebook",
                        "repertoires": ["comments_and_reactions", "posts"]},
                       {"nom": "FacebookBusiness",
-                       "repertoires": ["posts"]}]
+                       "repertoires": ["posts"]},
+                      {"nom": "Podcast",
+                       "repertoires": ["shownotes", "audio"]}
+                      ]
import_data/utils/typesense_client.py (new file, 15 lines)
@@ -0,0 +1,15 @@
+import typesense
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+client = typesense.Client({
+    'nodes': [{
+        'host': 'localhost',
+        'port': '8108',
+        'protocol': 'http'
+    }],
+    'api_key': os.getenv('TYPESENSE_API_KEY'),
+    'connection_timeout_seconds': 2
+})
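This shared client is the single connection point that the collection-creation script, the importers (via `documents_to_database`), and the delete utility all import. A quick hedged check that the configuration works end to end:

```python
from utils.typesense_client import client

# After 00_creer_reseauxsociaux.py has run, 'social_media_posts' should
# appear here along with its document count.
for collection in client.collections.retrieve():
    print(collection['name'], collection['num_documents'])
```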