Fixed import of LinkedIn comments
This commit is contained in:
parent
aff201f6cf
commit
5cb589bd18
4 changed files with 25 additions and 39 deletions
@@ -3,12 +3,11 @@ import datetime
 import pandas as pd
 import json

-from utils.get_ids import get_idtypedocument, get_idreseausocial
 from utils.documents_to_database import documents_to_database
 from utils.convert_encoding_meta import convert_encoding_meta

 # In[ ]:

-instagram_data_path = 'import_data/data/Instagram/content/posts_1.json'
+instagram_data_path = 'data/Instagram/content/posts_1.json'
 with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
     posts_json = json.loads(convert_encoding_meta(posts.read()))
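The unchanged context above reads the Instagram export with encoding="raw-unicode-escape" and runs it through convert_encoding_meta before json.loads. Instagram/Facebook JSON exports typically escape the UTF-8 bytes of accented characters as separate \u00XX sequences, which yields mojibake if loaded naively. The snippet below is only a generic illustration of that class of problem, with a hypothetical sample string; it is not the repo's convert_encoding_meta implementation.

import json

# Hypothetical sample: "Déjà vu" as an Instagram export writes it, with the
# UTF-8 bytes of each accented character escaped as separate \u00XX code points.
sample = '{"title": "D\\u00c3\\u00a9j\\u00c3\\u00a0 vu"}'

title = json.loads(sample)["title"]              # "DÃ©jÃ\xa0 vu" (mojibake)
fixed = title.encode("latin-1").decode("utf-8")  # "Déjà vu"
print(fixed)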
import_data/32_importation_linkedin_comments.R (new file, 19 lines)
@@ -0,0 +1,19 @@
+linkedin_data_path <- "import_data/data/LinkedIn/comments/Comments.csv"
+library("readr")
+library("magrittr")
+library("dplyr")
+
+# Read CSV file
+ddd <- readr::read_delim(linkedin_data_path,
+                         escape_backslash = TRUE,
+                         trim_ws = TRUE,
+                         skip_empty_rows = FALSE,
+                         delim = ",")
+
+# Remove carriage returns
+ddd %>%
+  mutate(MessageFix = Message %>% stringr::str_replace_all(pattern = "[\r\n\t]+", replacement = " ")) %>%
+  select(-Message) -> ddd2
+
+# Save the cleaned data to a new CSV file
+ddd2 %>% write.csv("import_data/data/LinkedIn/comments/Comments-FIX.csv", row.names = FALSE)
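The new R script pre-cleans the LinkedIn export so that every comment sits on a single CSV line before the Python importer reads it. Below is a minimal sanity-check sketch for the resulting file, assuming the columns written above (Date, Link, MessageFix); the check itself is not part of this commit.

import pandas as pd

fix_path = "import_data/data/LinkedIn/comments/Comments-FIX.csv"
comments = pd.read_csv(fix_path, encoding="utf-8")

# After the R cleaning step, no carriage returns, newlines or tabs
# should remain inside the message text.
assert not comments["MessageFix"].astype(str).str.contains(r"[\r\n\t]").any()
print(len(comments), "comments parsed cleanly")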
@@ -1,40 +1,14 @@
 import pandas as pd
-import codecs
 import datetime

 from utils.documents_to_database import documents_to_database

 # In[ ]:
-linkedin_data_path = "import_data/data/LinkedIn/comments/Comments.csv"
+linkedin_data_path = "data/LinkedIn/comments/Comments-FIX.csv"

 # In[ ]:
-raw_comments_list = []
-with (open(linkedin_data_path, 'r') as f):
-    current_comment = []
-    for my_line in f.readlines():
-        if my_line.startswith("Date,Link,Message"):
-            headers = my_line.strip().split(",")
-        else:
-            # Check if line starts with a ISO 8601 date
-            try:
-                datetime.datetime.strptime(str(my_line).split(",")[0], '%Y-%m-%d %H:%M:%S')
-                date_test = True
-            except ValueError:
-                date_test = False
-            if date_test:
-                if len(current_comment) == 3:
-                    current_comment[2] = (str(current_comment[2])
-                                          .replace('\\"', '"')
-                                          .replace("\\'", r"'"))
-                    raw_comments_list.append(current_comment)
-                current_comment = my_line.strip().split(",", maxsplit=2)
-                pass
-            else:
-                current_comment[2] = current_comment[2] + " " + my_line.strip()
-
-# In[ ]:
-raw_comments_csv = pd.DataFrame(raw_comments_list, columns=headers)
-raw_comments = raw_comments_csv[(raw_comments_csv['Message'] != "")].drop_duplicates()
+raw_comments_csv = pd.read_csv(linkedin_data_path, encoding='utf-8')
+raw_comments = raw_comments_csv[(raw_comments_csv['MessageFix'] != "")].drop_duplicates()

 # In[ ]:
 raw_comments['index'] = "rs_linkedin_comments"

@@ -42,17 +16,13 @@ raw_comments['type'] = "comments"
 raw_comments['network'] = "LinkedIn"
 raw_comments['chemin'] = linkedin_data_path

-# In[ ]:
-# Remove empty header
-raw_comments = raw_comments[1:].reset_index(drop=True)

 # In[ ]:
 raw_comments["datepublication"] = raw_comments["Date"].apply(
     lambda x: str(datetime.datetime.fromisoformat(str(x)).isoformat()))
 del raw_comments["Date"]

 # In[ ]:
-raw_comments.rename(columns={"Link": "uri", "Message": "texte"}, inplace=True)
+raw_comments.rename(columns={"Link": "uri", "MessageFix": "texte"}, inplace=True)

 # In[ ]:
 raw_comments["chemin"] = linkedin_data_path
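With the hand-rolled CSV parser removed, the importer now loads the pre-cleaned file, drops empty messages, converts Date to an ISO 8601 timestamp and renames Link/MessageFix to uri/texte. A small self-contained sketch of that flow on hypothetical sample rows (column names taken from the diff, values invented):

import datetime
import pandas as pd

# Hypothetical rows mirroring Comments-FIX.csv.
raw_comments = pd.DataFrame({
    "Date": ["2023-06-01 08:15:00", "2023-06-02 12:30:00"],
    "Link": ["https://www.linkedin.com/feed/update/1", "https://www.linkedin.com/feed/update/2"],
    "MessageFix": ["Premier commentaire", "Deuxième commentaire"],
})

raw_comments["datepublication"] = raw_comments["Date"].apply(
    lambda x: str(datetime.datetime.fromisoformat(str(x)).isoformat()))
del raw_comments["Date"]
raw_comments.rename(columns={"Link": "uri", "MessageFix": "texte"}, inplace=True)

# The real importer also sets index/type/network/chemin before writing to
# the database; this just shows the reshaped frame.
print(raw_comments)  # datepublication is now e.g. "2023-06-01T08:15:00"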
@@ -5,12 +5,10 @@ import xmltodict
 import pandas as pd
 import markdownify

-import utils.config
-from utils.get_ids import get_idreseausocial, get_idtypedocument
 from utils.documents_to_database import documents_to_database

 # In[ ]:

-wordpress_xml_path = "import_data/data/Wordpress/jevalideca/wordpress.xml"
+wordpress_xml_path = "data/Wordpress/jevalideca/wordpress.xml"
 with open(wordpress_xml_path, "r") as xml_file:
     wordpress_xml = xml_file.read()