import pandas as pd
import codecs
import datetime

from utils.documents_to_database import documents_to_database

# In[ ]:
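# Location of the Comments.csv file inside the LinkedIn data export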
linkedin_data_path = "import_data/data/LinkedIn/comments/Comments.csv"

# In[ ]:

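# Parse the export by hand: the Message field can contain commas and line
# breaks, so each row is split on the first two commas only and continuation
# lines are appended until the next line that starts with an ISO 8601 date.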
raw_comments_list = []
with open(linkedin_data_path, 'r') as f:
    current_comment = []
    for my_line in f.readlines():
        if my_line.startswith("Date,Link,Message"):
            headers = my_line.strip().split(",")
        else:
            # Check if the line starts with an ISO 8601 date
            try:
                datetime.datetime.strptime(str(my_line).split(",")[0], '%Y-%m-%d %H:%M:%S')
                date_test = True
            except ValueError:
                date_test = False
            if date_test:
                # A new comment starts: store the previous one after unescaping quotes
                if len(current_comment) == 3:
                    current_comment[2] = (str(current_comment[2])
                                          .replace('\\"', '"')
                                          .replace("\\'", r"'"))
                    raw_comments_list.append(current_comment)
                current_comment = my_line.strip().split(",", maxsplit=2)
            else:
                # Continuation of the previous comment's Message field
                current_comment[2] = current_comment[2] + " " + my_line.strip()
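    # Store the last buffered comment once the file ends; without this the
    # final row would be dropped.
    if len(current_comment) == 3:
        current_comment[2] = (str(current_comment[2])
                              .replace('\\"', '"')
                              .replace("\\'", r"'"))
        raw_comments_list.append(current_comment)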

# In[ ]:

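# Assemble the parsed rows into a DataFrame and drop empty or duplicated messages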
raw_comments_csv = pd.DataFrame(raw_comments_list, columns=headers)
raw_comments = raw_comments_csv[(raw_comments_csv['Message'] != "")].drop_duplicates()

# In[ ]:

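# Add constant metadata columns: target index, document type, network and source path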
raw_comments['index'] = "rs_linkedin_comments"
raw_comments['type'] = "comments"
raw_comments['network'] = "LinkedIn"
raw_comments['chemin'] = linkedin_data_path

# In[ ]:

# Remove the empty header row
raw_comments = raw_comments[1:].reset_index(drop=True)

# In[ ]:

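# Normalize the publication date to an ISO 8601 string and drop the original Date column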
raw_comments["datepublication"] = raw_comments["Date"].apply(
    lambda x: str(datetime.datetime.fromisoformat(str(x)).isoformat()))
del raw_comments["Date"]

# In[ ]:

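# Rename columns to the destination schema (Link -> uri, Message -> texte)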
raw_comments.rename(columns={"Link": "uri", "Message": "texte"}, inplace=True)

# In[ ]:

raw_comments["chemin"] = linkedin_data_path

# In[ ]:

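# Replace missing values with empty strings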
raw_comments.fillna(value="", inplace=True)

# In[ ]:

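# Drop duplicate comments sharing the same text and publication date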
raw_comments.drop_duplicates(subset=['texte', 'datepublication'], inplace=True)

# In[ ]:

# Filter out rows with an empty texte
raw_comments = raw_comments[~raw_comments['texte'].str.strip('\n').str.strip().eq('')]

# In[ ]:

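# Persist the cleaned comments to the database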
documents_to_database(raw_comments)