# libere-tes-chaine-de-mots/import_data/32_importation_linkedin_comments.py
#
# Reads the LinkedIn comments export (Comments.csv), reshapes it with pandas
# and passes the resulting rows to documents_to_database.

import pandas as pd
import codecs
import datetime
from utils.documents_to_database import documents_to_database
# In[ ]:
linkedin_data_path = "import_data/data/LinkedIn/comments/Comments.csv"


# In[ ]:
def _starts_with_timestamp(line):
    """Return True when the first comma-separated field of ``line`` parses as
    a ``YYYY-MM-DD HH:MM:SS`` timestamp, i.e. the line opens a new comment."""
    try:
        datetime.datetime.strptime(line.split(",")[0], '%Y-%m-%d %H:%M:%S')
    except ValueError:
        return False
    return True


def _finalize_comment(fields):
    """Unescape backslash-escaped quotes in the message (third field) of
    ``fields`` and return the same list."""
    fields[2] = fields[2].replace('\\"', '"').replace("\\'", "'")
    return fields


# Column names used by the steps below; replaced by the file's own header
# line when one is found, so a missing header no longer ends in a NameError.
headers = ["Date", "Link", "Message"]
raw_comments_list = []
current_comment = None  # fields of the comment being assembled, if any

# A message can span several physical lines, so the file is read line by
# line: a line starting with a timestamp opens a new comment, any other line
# continues the message of the comment currently open.
# "utf-8-sig" decodes UTF-8 with or without a BOM, so a BOM cannot hide the
# header line and decoding no longer depends on the machine's locale.
with open(linkedin_data_path, 'r', encoding='utf-8-sig') as f:
    for my_line in f:
        if my_line.startswith("Date,Link,Message"):
            headers = my_line.strip().split(",")
        elif _starts_with_timestamp(my_line):
            # The previous comment is complete once the next one starts.
            if current_comment is not None:
                raw_comments_list.append(_finalize_comment(current_comment))
            current_comment = my_line.strip().split(",", maxsplit=2)
            # Pad short records so the message slot always exists for
            # continuation lines and for the unescaping step.
            current_comment += [""] * (3 - len(current_comment))
        elif current_comment is not None:
            current_comment[2] = current_comment[2] + " " + my_line.strip()
        # Lines seen before the first comment belong to no record: ignored.

# The loop only flushes a comment when the following one starts, so the last
# comment of the file has to be flushed explicitly.
if current_comment is not None:
    raw_comments_list.append(_finalize_comment(current_comment))

# In[ ]:
raw_comments_csv = pd.DataFrame(raw_comments_list, columns=headers)
# Only real comments are collected above (no empty placeholder record), so
# no leading row needs to be sliced off here.
raw_comments = (
    raw_comments_csv[raw_comments_csv['Message'] != ""]
    .drop_duplicates()
    .reset_index(drop=True)
)

# In[ ]:
# Constant metadata attached to every comment row.
raw_comments['index'] = "rs_linkedin_comments"
raw_comments['type'] = "comments"
raw_comments['network'] = "LinkedIn"
raw_comments['chemin'] = linkedin_data_path
# In[ ]:
def _as_iso_timestamp(raw_date):
    """Return ``raw_date`` re-serialised through ``datetime.isoformat()``."""
    return str(datetime.datetime.fromisoformat(str(raw_date)).isoformat())


# In[ ]:
# Publication date in ISO 8601 form; it replaces the raw "Date" column.
raw_comments["datepublication"] = raw_comments["Date"].map(_as_iso_timestamp)

# In[ ]:
# Drop the raw date, switch to the target column names, stamp the source
# path, blank out missing values and collapse comments that share both text
# and publication date.
raw_comments = (
    raw_comments
    .drop(columns="Date")
    .rename(columns={"Link": "uri", "Message": "texte"})
    .assign(chemin=linkedin_data_path)
    .fillna(value="")
    .drop_duplicates(subset=['texte', 'datepublication'])
)

# In[ ]:
# Keep only rows whose text is non-empty once surrounding whitespace is removed.
has_text = raw_comments['texte'].str.strip('\n').str.strip().ne('')
raw_comments = raw_comments[has_text]

# In[ ]:
documents_to_database(raw_comments)