Ajout d'une application de recherche
This commit is contained in:
parent
f62eaf5ac8
commit
5535db30c2
2 changed files with 151 additions and 2 deletions
149
search_app_ui/streamlit_app.py
Normal file
149
search_app_ui/streamlit_app.py
Normal file
|
@ -0,0 +1,149 @@
|
|||
import streamlit as st
|
||||
import typesense
|
||||
from datetime import datetime, time
|
||||
import pandas as pd
|
||||
import plotly.express as px
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
|
||||
# Inject CSS defining a scrollable, bordered container for search results.
# NOTE(review): the `.scrollable-results` class is never attached to any
# element below (results are shown via st.text_area, which scrolls on its
# own) — confirm whether this style block is still needed.
st.markdown("""
<style>
.scrollable-results {
height: 400px;
overflow-y: scroll;
border: 1px solid #ccc;
padding: 10px;
border-radius: 5px;
}
</style>
""", unsafe_allow_html=True)
|
||||
|
||||
# Load environment variables (notably TYPESENSE_API_KEY) from a .env file.
load_dotenv()

# Typesense client pointed at a local single-node server; the API key is
# read from the environment so it never appears in source control.
_typesense_config = {
    'nodes': [{
        'host': 'localhost',
        'port': '8108',
        'protocol': 'http',
    }],
    'api_key': os.getenv('TYPESENSE_API_KEY'),
    'connection_timeout_seconds': 2,
}
client = typesense.Client(_typesense_config)
|
||||
|
||||
def rechercher_documents(cette_requete, ces_filtres=None, facette_par=None):
    """Search the 'social_media_posts' Typesense collection, fetching all pages.

    Args:
        cette_requete: Full-text query string, matched against the 'texte' field.
        ces_filtres: Optional Typesense ``filter_by`` expression.
        facette_par: Optional field name to facet on (``facet_by``).

    Returns:
        The last Typesense response dict, with its ``'hits'`` entry replaced
        by the accumulated hits from every fetched page. ``'found'`` and
        ``'facet_counts'`` come from the server response.
    """
    parametres_recherche = {
        'q': cette_requete,
        'query_by': 'texte',
        'sort_by': 'creation_timestamp:desc',
        'per_page': 100,
        'page': 1,
    }

    if ces_filtres:
        parametres_recherche['filter_by'] = ces_filtres

    if facette_par:
        parametres_recherche['facet_by'] = facette_par

    all_results = []
    while True:
        results = client.collections['social_media_posts'].documents.search(parametres_recherche)
        page_hits = results['hits']
        all_results.extend(page_hits)
        # Stop once every reported match is collected, or when the server
        # returns an empty page. The empty-page guard fixes a potential
        # infinite loop: if 'found' overstates the retrievable hits (e.g.
        # server-side paging caps), the original loop never terminated.
        if not page_hits or len(all_results) >= results['found']:
            break
        parametres_recherche['page'] += 1

    results['hits'] = all_results
    return results
|
||||
|
||||
# --- Streamlit user interface ---
st.title('Recherche de Contenu sur les Réseaux Sociaux')

# Free-text search query.
requete = st.text_input('Entrez votre requête de recherche')

# Date-range filter, rendered side by side.
col1, col2 = st.columns(2)
date_debut = col1.date_input('Date de début')
date_fin = col2.date_input('Date de fin')

# Social-network filter (multi-select over the indexed networks).
reseaux = ['Facebook', 'Instagram', 'Threads', 'LinkedIn', 'WordPress']
reseaux_selectionnes = st.multiselect('Sélectionnez les réseaux sociaux', reseaux)
|
||||
|
||||
if st.button('Rechercher'):
    # Build the Typesense filter expression from the UI selections.
    # Widen the date range to cover the full start and end days.
    debut_datetime = datetime.combine(date_debut, time.min)
    fin_datetime = datetime.combine(date_fin, time.max)
    # NOTE(review): timestamp() uses the local timezone on naive datetimes —
    # confirm indexed creation_timestamp values follow the same convention.
    filtre_date = f"creation_timestamp:[{int(debut_datetime.timestamp())}..{int(fin_datetime.timestamp())}]"
    # Bug fix: Typesense multi-value filters are comma-separated, e.g.
    # network:[Facebook,Instagram]. The original joined with spaces, which
    # produced one bogus value whenever 2+ networks were selected.
    filtre_reseau = f"network:[{','.join(reseaux_selectionnes)}]" if reseaux_selectionnes else None

    # Combine the non-empty filters with the Typesense AND operator.
    filtres = ' && '.join(filter(None, [filtre_date, filtre_reseau]))

    # Run the search (all pages) with per-network facet counts.
    tous_resultats = rechercher_documents(requete, ces_filtres=filtres, facette_par='network')
    nombre_total_resultats = tous_resultats['found']

    # Total hit count.
    st.subheader(f"Trouvé {nombre_total_resultats} résultats")

    # Facet pie chart: distribution of hits per network.
    # Robustness fix: also require a non-empty facet list, since
    # tous_resultats['facet_counts'][0] raises IndexError on an empty list.
    if tous_resultats.get('facet_counts'):
        facettes_reseau = {facette['value']: facette['count'] for facette in tous_resultats['facet_counts'][0]['counts']}
        st.subheader("Résultats par Réseau")
        fig = px.pie(values=list(facettes_reseau.values()), names=list(facettes_reseau.keys()), title="Distribution par Réseau")
        st.plotly_chart(fig)

    # Monthly time distribution per network.
    if nombre_total_resultats > 0:
        st.subheader("Résultats au fil du temps par réseau (agrégation mensuelle)")

        df_temporel = pd.DataFrame({
            'date': [datetime.fromtimestamp(hit['document']['creation_timestamp']) for hit in tous_resultats['hits']],
            'network': [hit['document']['network'] for hit in tous_resultats['hits']]
        })

        # Aggregate hit counts by calendar month and network.
        df_temporel['mois'] = df_temporel['date'].dt.to_period('M')
        df_temporel = df_temporel.groupby(['mois', 'network']).size().reset_index(name='count')
        df_temporel['mois'] = df_temporel['mois'].dt.to_timestamp()

        # Line chart of monthly counts per network.
        fig = px.line(df_temporel, x='mois', y='count', color='network',
                      title="Distribution temporelle par réseau (agrégation mensuelle)")
        fig.update_layout(xaxis_title="Mois", yaxis_title="Nombre de posts")
        fig.update_xaxes(tickformat="%B %Y")
        st.plotly_chart(fig)

        # Same data as a stacked bar chart.
        fig_bar = px.bar(df_temporel, x='mois', y='count', color='network',
                         title="Distribution temporelle par réseau (barres empilées, agrégation mensuelle)")
        fig_bar.update_layout(xaxis_title="Mois", yaxis_title="Nombre de posts")
        fig_bar.update_xaxes(tickformat="%B %Y")
        st.plotly_chart(fig_bar)

        # Monthly summary table: one column per network plus a row total.
        st.subheader("Tableau récapitulatif mensuel")
        df_pivot = df_temporel.pivot(index='mois', columns='network', values='count').fillna(0)
        df_pivot['Total'] = df_pivot.sum(axis=1)
        df_pivot = df_pivot.reset_index()
        df_pivot['mois'] = df_pivot['mois'].dt.strftime('%B %Y')
        st.dataframe(df_pivot)

    # Render every hit into one Markdown-ish string for the results pane.
    all_results_text = ""
    for hit in tous_resultats['hits']:
        horodatage = hit['document']['creation_timestamp']
        all_results_text += f"**{hit['document']['network']}** - {datetime.fromtimestamp(horodatage).strftime('%Y-%m-%d %H:%M:%S')}\n\n"

        # Keep paragraph breaks; drop blank lines.
        paragraphes = hit['document']['texte'].split('\n')
        for paragraphe in paragraphes:
            if paragraphe.strip():
                all_results_text += f"{paragraphe}\n\n"

        all_results_text += "---\n\n"

    # Scrollable results pane.
    st.text_area("Résultats de la recherche", all_results_text, height=400)
|
||||
|
|
@ -18,7 +18,7 @@ except NameError:
|
|||
project_root = script_dir
|
||||
instagram_data_path = os.path.join(project_root, 'import_data', 'data', 'Instagram', 'threads', 'threads_and_replies.json')
|
||||
|
||||
with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
|
||||
with open(instagram_data_path, "r") as posts:
|
||||
post_comments_1 = json.loads(convert_encoding_meta(posts.read()))
|
||||
|
||||
#%% In[ ]:
|
||||
|
@ -30,7 +30,7 @@ for post in post_comments_1['text_post_app_text_posts']:
|
|||
"chemin": instagram_data_path,
|
||||
"index": "rs_instagram_threads",
|
||||
"type": "posts",
|
||||
"network": "Instagram"})
|
||||
"network": "Threads"})
|
||||
|
||||
#%% In[ ]:
|
||||
ig_comments_df = pd.DataFrame(threads_comments)
|
||||
|
|
Loading…
Reference in a new issue