Améliorations au moteur de recherche

2024-11-30 22:03:49 -05:00 · 2024-11-30 22:03:49 -05:00 · fc6bd9f255
commit fc6bd9f255
parent 15eb318212
21 changed files with 509 additions and 223 deletions
--- a/typesense_stats/typesense_stats.py
+++ b/typesense_stats/typesense_stats.py
@ -0,0 +1,106 @@
+import typesense
+from dotenv import load_dotenv
+import os
+from datetime import datetime
+import pandas as pd
+
+## %% Configuration initiale
+# Charger les variables d'environnement et initialiser le client Typesense
+load_dotenv()
+
+# Initialiser le client Typesense
+client = typesense.Client({
+    'nodes': [{
+        'host': os.getenv('TYPESENSE_HOST', 'localhost'),
+        'port': os.getenv('TYPESENSE_PORT', '8108'),
+        'protocol': os.getenv('TYPESENSE_PROTOCOL', 'http')
+    }],
+    'api_key': os.getenv('TYPESENSE_API_KEY'),
+    'connection_timeout_seconds': 2
+})
+
+## %% Fonction pour obtenir les statistiques d'index
+def get_index_stats():
+    collections = client.collections.retrieve()
+    total_documents = sum(collection['num_documents'] for collection in collections)
+    
+    print(f"Nombre total de documents indexés : {total_documents}")
+    print("\nStatistiques par collection :")
+    for collection in collections:
+        print(f"  {collection['name']}: {collection['num_documents']} documents")
+
+## %% Fonction pour obtenir la distribution par réseau social
+def get_network_distribution():
+    search_parameters = {
+        'q': '*',
+        'query_by': 'texte',
+        'facet_by': 'network',
+        'per_page': 0
+    }
+    
+    result = client.collections['social_media_posts'].documents.search(search_parameters)
+    
+    network_counts = {facet['value']: facet['count'] for facet in result['facet_counts'][0]['counts']}
+    
+    print("\nDistribution par réseau social :")
+    for network, count in network_counts.items():
+        print(f"  {network}: {count}")
+
+## %% Fonction pour obtenir la distribution temporelle
+def get_temporal_distribution():
+    search_parameters = {
+        'q': '*',
+        'query_by': 'texte',
+        'per_page': 250,  # Maximum autorisé par Typesense
+        'sort_by': 'creation_timestamp:asc',
+        'page': 1
+    }
+    
+    all_dates = []
+    while True:
+        result = client.collections['social_media_posts'].documents.search(search_parameters)
+        
+        if not result['hits']:
+            break
+        
+        dates = [datetime.fromtimestamp(hit['document']['creation_timestamp']) for hit in result['hits']]
+        all_dates.extend(dates)
+        
+        search_parameters['page'] += 1
+    
+    df = pd.DataFrame({'date': all_dates})
+    df['month'] = df['date'].dt.to_period('M')
+    monthly_counts = df['month'].value_counts().sort_index()
+    
+    print("\nDistribution temporelle (par mois) :")
+    for month, count in monthly_counts.items():
+        print(f"  {month}: {count}")
+
+## %% Fonction pour obtenir un échantillon de documents
+def get_document_sample(sample_size=5):
+    search_parameters = {
+        'q': '*',
+        'query_by': 'texte',
+        'per_page': sample_size,
+        'sort_by': 'creation_timestamp:desc'
+    }
+
+    result = client.collections['social_media_posts'].documents.search(search_parameters)
+
+    print(f"\nÉchantillon de {sample_size} documents récents :")
+    for hit in result['hits']:
+        doc = hit['document']
+        print(f"\n  ID: {doc.get('id', 'N/A')}")
+        print(f"  Réseau: {doc.get('network', 'N/A')}")
+        print(f"  Type: {doc.get('type', 'N/A')}")
+        print(f"  Date: {datetime.fromtimestamp(doc['creation_timestamp']).strftime('%Y-%m-%d %H:%M:%S')}")
+        print(f"  Texte: {doc.get('texte', 'N/A')[:100]}...")  # Afficher les 100 premiers caractères du texte
+        print(f"  URL: {doc.get('uri', 'N/A')}")
+        print(f"  Langue: {doc.get('langue', 'N/A')}")
+
+## %% Point d'entrée principal du script
+if __name__ == "__main__":
+    get_index_stats()
+    get_network_distribution()
+    get_temporal_distribution()
+    get_document_sample()