"""Prepare the LinkedIn shares export and hand it to ``documents_to_database``.

The script reads the ``Shares.csv`` export, tags every row with constant
metadata, normalises the publication date and the text column, removes
duplicates and rows without any text, then passes the resulting DataFrame
to ``documents_to_database``.
"""
import datetime

import pandas as pd

from utils.documents_to_database import documents_to_database

# Path of the CSV export; also stored on every row in the "chemin" column.
linkedin_data_path = "data/LinkedIn/shares/Shares.csv"
raw_shares = pd.read_csv(linkedin_data_path)

# Constant metadata columns attached to every row.
raw_shares["index"] = "rs_linkedin_shares"
raw_shares["type"] = "posts"
raw_shares["network"] = "LinkedIn"
raw_shares["chemin"] = linkedin_data_path

# Re-emit the "Date" column as ISO-8601 strings under "datepublication".
raw_shares["datepublication"] = raw_shares["Date"].apply(
    lambda value: datetime.datetime.fromisoformat(value).isoformat()
)

raw_shares = raw_shares.rename(
    columns={"ShareLink": "uri", "ShareCommentary": "texte"}
)

# Force "texte" to str. Missing values must become "" *before* stringifying:
# str(NaN) yields the literal text "nan", which would slip past both the
# fillna below and the empty-text filter at the end of the script.
raw_shares["texte"] = raw_shares["texte"].apply(
    lambda value: "" if pd.isna(value) else str(value)
)

# Columns that are not forwarded; a missing column raises KeyError, exactly
# as the original ``del`` statements did.
raw_shares = raw_shares.drop(
    columns=["Date", "SharedUrl", "MediaUrl", "Visibility"]
)

# Replace the remaining missing values (e.g. an absent "uri") with "".
raw_shares = raw_shares.fillna(value="")

# The same text published at the same timestamp is treated as one share.
raw_shares = raw_shares.drop_duplicates(subset=["texte", "datepublication"])

# Drop rows whose text is empty or whitespace-only (str.strip() with no
# argument already removes newlines along with other whitespace).
raw_shares = raw_shares[~raw_shares["texte"].str.strip().eq("")]

documents_to_database(raw_shares)