Améliorations au moteur de recherche
This commit is contained in:
parent
15eb318212
commit
fc6bd9f255
21 changed files with 509 additions and 223 deletions
106
typesense_stats/typesense_stats.py
Normal file
106
typesense_stats/typesense_stats.py
Normal file
|
@ -0,0 +1,106 @@
|
|||
import typesense
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
from datetime import datetime
|
||||
import pandas as pd
|
||||
|
||||
## %% Initial configuration
# Load environment variables (python-dotenv) before any of them are read.
load_dotenv()

# Connection settings for the single Typesense node. Each value falls back to
# a local default when its environment variable is unset.
_typesense_node = {
    'host': os.getenv('TYPESENSE_HOST', 'localhost'),
    'port': os.getenv('TYPESENSE_PORT', '8108'),
    'protocol': os.getenv('TYPESENSE_PROTOCOL', 'http'),
}

# Module-level Typesense client used by the reporting functions in this file.
# TYPESENSE_API_KEY has no fallback, so it is None when the variable is unset.
client = typesense.Client({
    'nodes': [_typesense_node],
    'api_key': os.getenv('TYPESENSE_API_KEY'),
    'connection_timeout_seconds': 2,
})
|
||||
|
||||
## %% Index statistics
def get_index_stats():
    """Print the total indexed document count and a per-collection breakdown.

    Retrieves all collections through the module-level ``client`` and reads
    the ``name`` and ``num_documents`` entries of each one. Output goes to
    stdout; nothing is returned.
    """
    all_collections = client.collections.retrieve()

    # Accumulate the grand total across every collection.
    document_total = 0
    for entry in all_collections:
        document_total += entry['num_documents']

    print(f"Nombre total de documents indexés : {document_total}")
    print("\nStatistiques par collection :")
    for entry in all_collections:
        collection_name = entry['name']
        collection_size = entry['num_documents']
        print(f" {collection_name}: {collection_size} documents")
|
||||
|
||||
## %% Distribution by social network
def get_network_distribution():
    """Print the number of documents per social network.

    Runs a wildcard search on the ``social_media_posts`` collection, faceted
    on the ``network`` field and requesting zero hits per page, then prints
    each facet value with its count. Output goes to stdout; nothing is
    returned.
    """
    facet_query = dict(
        q='*',
        query_by='texte',
        facet_by='network',
        per_page=0,
    )
    posts = client.collections['social_media_posts']
    response = posts.documents.search(facet_query)

    # Only one facet field is requested, so the first facet entry is read.
    network_facet = response['facet_counts'][0]
    counts_by_network = {}
    for bucket in network_facet['counts']:
        network_name = bucket['value']
        counts_by_network[network_name] = bucket['count']

    print("\nDistribution par réseau social :")
    for network_name, total in counts_by_network.items():
        print(f" {network_name}: {total}")
|
||||
|
||||
## %% Temporal distribution
def get_temporal_distribution():
    """Print the number of documents created in each calendar month.

    Pages through the ``social_media_posts`` collection (250 hits per page,
    sorted by ascending ``creation_timestamp``) until a page comes back with
    no hits. Each timestamp is converted with ``datetime.fromtimestamp`` and
    the resulting dates are grouped by month and printed in chronological
    order. Output goes to stdout; nothing is returned.

    When the collection yields no hits at all, a short notice is printed
    instead of the monthly counts.
    """
    search_parameters = {
        'q': '*',
        'query_by': 'texte',
        # Only the timestamp is read below, so do not transfer the rest of
        # each document on every page.
        'include_fields': 'creation_timestamp',
        'per_page': 250,  # Maximum allowed by Typesense
        'sort_by': 'creation_timestamp:asc',
        'page': 1,
    }
    posts = client.collections['social_media_posts']

    all_dates = []
    while True:
        result = posts.documents.search(search_parameters)
        hits = result['hits']

        # An empty page means every document has been read.
        if not hits:
            break

        all_dates.extend(
            datetime.fromtimestamp(hit['document']['creation_timestamp'])
            for hit in hits
        )
        search_parameters['page'] += 1

    print("\nDistribution temporelle (par mois) :")

    if not all_dates:
        # A column built from an empty list has no datetime dtype, so the
        # `.dt` accessor used below would raise AttributeError.
        print(" Aucun document trouvé.")
        return

    df = pd.DataFrame({'date': all_dates})
    df['month'] = df['date'].dt.to_period('M')
    monthly_counts = df['month'].value_counts().sort_index()

    for month, count in monthly_counts.items():
        print(f" {month}: {count}")
|
||||
|
||||
## %% Document sample
def get_document_sample(sample_size=5):
    """Print a few fields of the most recently created documents.

    Args:
        sample_size: Number of hits requested from Typesense (``per_page``).

    Runs a wildcard search on ``social_media_posts`` sorted by descending
    ``creation_timestamp`` and prints, for each hit, its id, network, type,
    creation date, the first 100 characters of its text, its URI and its
    language. Missing optional fields are shown as ``N/A``. Output goes to
    stdout; nothing is returned.
    """
    date_format = '%Y-%m-%d %H:%M:%S'
    latest_query = dict(
        q='*',
        query_by='texte',
        per_page=sample_size,
        sort_by='creation_timestamp:desc',
    )
    posts = client.collections['social_media_posts']
    response = posts.documents.search(latest_query)

    print(f"\nÉchantillon de {sample_size} documents récents :")
    for hit in response['hits']:
        post = hit['document']

        print(f"\n ID: {post.get('id', 'N/A')}")
        print(f" Réseau: {post.get('network', 'N/A')}")
        print(f" Type: {post.get('type', 'N/A')}")

        created_at = datetime.fromtimestamp(post['creation_timestamp'])
        print(f" Date: {created_at.strftime(date_format)}")

        # Show only the first 100 characters of the text.
        excerpt = post.get('texte', 'N/A')[:100]
        print(f" Texte: {excerpt}...")

        print(f" URL: {post.get('uri', 'N/A')}")
        print(f" Langue: {post.get('langue', 'N/A')}")
|
||||
|
||||
## %% Main entry point of the script
if __name__ == "__main__":
    # Run each report in turn: index totals, per-network counts, monthly
    # distribution, then a sample of recent documents.
    reports = (
        get_index_stats,
        get_network_distribution,
        get_temporal_distribution,
        get_document_sample,
    )
    for run_report in reports:
        run_report()
|
Loading…
Add table
Add a link
Reference in a new issue