Améliorations au moteur de recherche

This commit is contained in:
François Pelletier 2024-11-30 22:03:49 -05:00
parent 15eb318212
commit fc6bd9f255
21 changed files with 509 additions and 223 deletions

View file

@@ -0,0 +1,106 @@
import typesense
from dotenv import load_dotenv
import os
from datetime import datetime
import pandas as pd
## %% Initial configuration
# Load the environment variables before any of them is read below.
load_dotenv()

# Build the Typesense client from the TYPESENSE_* environment variables,
# defaulting to a single node at localhost:8108 over http when they are unset.
client = typesense.Client(
    dict(
        nodes=[
            dict(
                host=os.environ.get('TYPESENSE_HOST', 'localhost'),
                port=os.environ.get('TYPESENSE_PORT', '8108'),
                protocol=os.environ.get('TYPESENSE_PROTOCOL', 'http'),
            ),
        ],
        # No default here: the value is None when TYPESENSE_API_KEY is unset.
        api_key=os.environ.get('TYPESENSE_API_KEY'),
        connection_timeout_seconds=2,
    )
)
## %% Index statistics
def get_index_stats():
    """Print the total number of indexed documents and the count per collection.

    Retrieves every collection through the module-level ``client`` and reads
    the ``name`` and ``num_documents`` entries of each one.
    """
    all_collections = client.collections.retrieve()

    # Add up the per-collection counts before printing anything.
    grand_total = 0
    for entry in all_collections:
        grand_total += entry['num_documents']
    print(f"Nombre total de documents indexés : {grand_total}")

    print("\nStatistiques par collection :")
    for entry in all_collections:
        label = entry['name']
        doc_count = entry['num_documents']
        print(f" {label}: {doc_count} documents")
## %% Distribution by social network
def get_network_distribution():
    """Print the number of posts for each social network.

    Runs a wildcard search on the ``social_media_posts`` collection, faceted
    on ``network``, and prints every facet value with its count.
    """
    # per_page is 0 because the hits are never read, only the facet counts.
    facet_query = dict(q='*', query_by='texte', facet_by='network', per_page=0)
    posts = client.collections['social_media_posts'].documents
    response = posts.search(facet_query)

    # A single facet field is requested; its counts are read from entry 0.
    network_counts = {}
    for bucket in response['facet_counts'][0]['counts']:
        network_counts[bucket['value']] = bucket['count']

    print("\nDistribution par réseau social :")
    for network in network_counts:
        print(f" {network}: {network_counts[network]}")
## %% Temporal distribution
def get_temporal_distribution():
    """Print the number of posts per calendar month, in chronological order.

    Pages through the ``social_media_posts`` collection, 250 documents at a
    time, sorted by ascending ``creation_timestamp``. Each timestamp is
    converted with ``datetime.fromtimestamp`` (naive, local time) and the
    resulting dates are grouped by month.

    When the collection holds no documents only the header line is printed;
    previously the empty date column had no datetime dtype and the ``.dt``
    accessor raised ``AttributeError``.
    """
    base_parameters = {
        'q': '*',
        'query_by': 'texte',
        'per_page': 250,  # Maximum allowed by Typesense
        'sort_by': 'creation_timestamp:asc',
        # Only the timestamp is read below, so do not transfer the other fields.
        'include_fields': 'creation_timestamp',
    }
    documents = client.collections['social_media_posts'].documents

    all_dates = []
    page = 1
    while True:
        # Build a fresh parameter dict per request instead of mutating one.
        result = documents.search({**base_parameters, 'page': page})
        hits = result['hits']
        if not hits:
            # An empty page means every document has been read.
            break
        all_dates.extend(
            datetime.fromtimestamp(hit['document']['creation_timestamp'])
            for hit in hits
        )
        page += 1

    if all_dates:
        df = pd.DataFrame({'date': all_dates})
        df['month'] = df['date'].dt.to_period('M')
        monthly_counts = df['month'].value_counts().sort_index()
        monthly_rows = list(monthly_counts.items())
    else:
        # Skip pandas entirely: an empty 'date' column is not datetime-typed.
        monthly_rows = []

    print("\nDistribution temporelle (par mois) :")
    for month, count in monthly_rows:
        print(f" {month}: {count}")
## %% Sample of documents
def get_document_sample(sample_size=5):
    """Print selected fields of the most recently created documents.

    Searches the ``social_media_posts`` collection sorted by descending
    ``creation_timestamp`` and prints id, network, type, date, a text excerpt,
    URI and language for each hit. Missing optional fields show as ``N/A``.

    Args:
        sample_size: Number of documents requested; sent as ``per_page``.
    """
    query = dict(
        q='*',
        query_by='texte',
        per_page=sample_size,
        sort_by='creation_timestamp:desc',
    )
    response = client.collections['social_media_posts'].documents.search(query)

    print(f"\nÉchantillon de {sample_size} documents récents :")
    for match in response['hits']:
        record = match['document']

        # Each value is looked up right before its own print so that fields
        # are read and emitted in the same order, one at a time.
        identifier = record.get('id', 'N/A')
        print(f"\n ID: {identifier}")
        network = record.get('network', 'N/A')
        print(f" Réseau: {network}")
        kind = record.get('type', 'N/A')
        print(f" Type: {kind}")
        created = datetime.fromtimestamp(record['creation_timestamp'])
        print(f" Date: {created:%Y-%m-%d %H:%M:%S}")
        excerpt = record.get('texte', 'N/A')[:100]  # first 100 characters only
        print(f" Texte: {excerpt}...")
        uri = record.get('uri', 'N/A')
        print(f" URL: {uri}")
        language = record.get('langue', 'N/A')
        print(f" Langue: {language}")
## %% Main entry point of the script
if __name__ == "__main__":
    # Run each report in turn: index totals, per-network counts, per-month
    # counts, then a sample of recent documents.
    for report in (
        get_index_stats,
        get_network_distribution,
        get_temporal_distribution,
        get_document_sample,
    ):
        report()