Version initiale pour importer les données

This commit is contained in:
François Pelletier 2024-07-18 20:04:51 -04:00
parent 3d337d064b
commit aff201f6cf
22 changed files with 694 additions and 0 deletions

View file

View file

@ -0,0 +1,2 @@
# Base URL of the HTTP API used to build the lookup endpoints.
API_URL = "http://localhost:8000"
WORDPRESS_NAMES = "jevalideca" # Separate blog names with a comma

View file

@ -0,0 +1,7 @@
import re
# One well-formed UTF-8 byte sequence whose bytes were each mis-decoded as a
# Latin-1 character: a lead "byte" followed by exactly as many continuation
# "bytes" as that lead announces (1, 2 or 3). Matching the exact length keeps
# a trailing, legitimate character such as "»" out of the match.
_MOJIBAKE_SEQUENCE = re.compile(
    r'[\xc2-\xdf][\x80-\xbf]'
    r'|[\xe0-\xef][\x80-\xbf]{2}'
    r'|[\xf0-\xf4][\x80-\xbf]{3}'
)


def _redecode_match(match):
    """Return the matched text re-decoded as UTF-8, or unchanged if invalid."""
    fragment = match.group(0)
    try:
        return fragment.encode('latin1').decode('utf8')
    except UnicodeDecodeError:
        # Overlong forms, surrogates, etc.: this was not mojibake, keep as is.
        return fragment


def convert_encoding_meta(text):
    """Repair UTF-8 text that was wrongly decoded as Latin-1 (mojibake).

    Every run of characters that looks like the Latin-1 rendering of a single
    UTF-8 encoded character (for example ``"Ã©"``) is replaced by the character
    it was meant to be (``"é"``). Text that does not form a valid UTF-8
    sequence is left untouched instead of raising ``UnicodeDecodeError``.

    Args:
        text: The string to repair.

    Returns:
        The string with the recognised mojibake sequences replaced.
    """
    return _MOJIBAKE_SEQUENCE.sub(_redecode_match, text)

View file

@ -0,0 +1,20 @@
import pandas as pd
import requests
from utils.opensearch import opensearch_client
def documents_to_database(documents_list, os_client=opensearch_client):
    """Insert every row of a DataFrame as a document into OpenSearch.

    All rows are validated before anything is written, so a bad row or a
    missing index does not leave a partial insert behind.

    Args:
        documents_list: pandas DataFrame with one row per document. The
            ``index`` column names the target index of each row; all other
            columns form the document body.
        os_client: OpenSearch client used for the calls. Defaults to the
            shared ``opensearch_client``.

    Raises:
        requests.exceptions.ConnectionError: If OpenSearch does not answer
            the ping.
        ValueError: If a document has no usable ``index`` value.
        requests.exceptions.HTTPError: If one of the target indexes does not
            exist.
    """
    # Check if opensearch is available
    if not os_client.ping():
        raise requests.exceptions.ConnectionError("Opensearch is not reachable")

    # Nothing to insert. (DataFrame truthiness is ambiguous, hence len().)
    if len(documents_list) == 0:
        return

    # Split every record into (target index, body) and validate the index.
    prepared = []
    for document in documents_list.to_dict(orient='records'):
        index_name = document.pop('index', None)
        # A missing value in a DataFrame is usually NaN, which is truthy,
        # so it has to be tested explicitly.
        if pd.isna(index_name) or not index_name:
            raise ValueError("Document must have an 'index' field")
        prepared.append((index_name, document))

    # Check that every distinct target index exists, not only the first one.
    for index_name in dict.fromkeys(name for name, _ in prepared):
        if not os_client.indices.exists(index=index_name):
            raise requests.exceptions.HTTPError(f"Index '{index_name}' does not exist")

    # Insert each document into opensearch index(es)
    for index_name, document in prepared:
        os_client.index(index=index_name,
                        body=document)

View file

@ -0,0 +1,16 @@
import pandas as pd
import requests
import utils.config as config
API_URL = config.API_URL


def _get_id_by_nom(nom, endpoint, timeout):
    """Fetch the records served at ``endpoint`` and return the id named ``nom``."""
    response = requests.get(endpoint, timeout=timeout)
    # Fail on HTTP errors here rather than on an unparsable error payload.
    response.raise_for_status()
    records = pd.DataFrame(response.json())
    matches = records.loc[records["nom"] == nom, "id"]
    if matches.empty:
        # Same exception type as before, with an actionable message.
        raise IndexError(f"No entry named '{nom}' found at {endpoint}")
    # tolist() yields plain Python scalars, as the previous list(...) did.
    return matches.tolist()[0]


def get_idreseausocial(nom, endpoint=f"{API_URL}/reseauxsociaux/", timeout=10):
    """Return the id of the social network called ``nom``.

    Args:
        nom: Name of the social network to look up.
        endpoint: URL returning the social network records.
        timeout: Seconds to wait for the API before giving up.

    Raises:
        requests.exceptions.RequestException: On network or HTTP errors.
        IndexError: If no record has the given name.
    """
    return _get_id_by_nom(nom, endpoint, timeout)


def get_idtypedocument(nom, endpoint=f"{API_URL}/typedocuments/", timeout=10):
    """Return the id of the document type called ``nom``.

    Args:
        nom: Name of the document type to look up.
        endpoint: URL returning the document type records.
        timeout: Seconds to wait for the API before giving up.

    Raises:
        requests.exceptions.RequestException: On network or HTTP errors.
        IndexError: If no record has the given name.
    """
    return _get_id_by_nom(nom, endpoint, timeout)

View file

@ -0,0 +1,22 @@
# Builds the module-level `opensearch_client` that other modules import.
import os
import dotenv
# Load environment variables from the .env file
dotenv.load_dotenv()
# Connect to OpenSearch using the provided credentials and hostname/port.
from opensearchpy import OpenSearch
host = 'localhost'
port = 9200
# The admin password is read from the environment, so load_dotenv() must run first.
# os.getenv returns None when OPENSEARCH_INITIAL_ADMIN_PASSWORD is not set.
auth = ('admin', os.getenv("OPENSEARCH_INITIAL_ADMIN_PASSWORD")) # For testing only. Don't store credentials in code.
# Create the client with SSL/TLS enabled, but certificate and hostname verification disabled.
opensearch_client = OpenSearch(
hosts=[{'host': host, 'port': port}],
http_compress=True, # enables gzip compression for request bodies
http_auth=auth,
use_ssl=True,
verify_certs=False,
ssl_assert_hostname=False,
ssl_show_warn=False
)

View file

@ -0,0 +1,14 @@
import utils.config as config
# Blog names come from a comma-separated setting. Surrounding whitespace and
# empty entries (e.g. from a trailing comma or an empty setting) are dropped
# so that every entry is a usable name.
wordpress_names = [
    name.strip()
    for name in config.WORDPRESS_NAMES.split(",")
    if name.strip()
]

# One entry per social network: its name ("nom") and its "repertoires".
# NOTE(review): "repertoires" presumably lists the export sub-directories to
# import for each network - confirm against the importer.
reseau_social_data = [
    {"nom": "LinkedIn",
     "repertoires": ["comments", "shares"]},
    {"nom": "Wordpress",
     "repertoires": wordpress_names},
    {"nom": "Instagram",
     "repertoires": ["comments", "content", "threads"]},
    {"nom": "Facebook",
     "repertoires": ["comments_and_reactions", "posts"]},
    {"nom": "FacebookBusiness",
     "repertoires": ["posts"]},
]