importation corrigée des commentaires LinkedIn

This commit is contained in:
François Pelletier 2024-07-18 21:54:15 -04:00
parent aff201f6cf
commit 5cb589bd18
4 changed files with 25 additions and 39 deletions

View file

@@ -3,12 +3,11 @@ import datetime
import pandas as pd
import json
from utils.get_ids import get_idtypedocument, get_idreseausocial
from utils.documents_to_database import documents_to_database
from utils.convert_encoding_meta import convert_encoding_meta
# In[ ]:
instagram_data_path = 'import_data/data/Instagram/content/posts_1.json'
instagram_data_path = 'data/Instagram/content/posts_1.json'
with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts:
posts_json = json.loads(convert_encoding_meta(posts.read()))

View file

@@ -0,0 +1,19 @@
# Pre-clean the raw LinkedIn comments export so it parses as regular CSV.
# Comment bodies in the export contain embedded CR/LF/TAB characters that
# break line-oriented CSV readers; this script collapses each run of them
# into a single space and writes the result to Comments-FIX.csv, where the
# cleaned text lives in a new "MessageFix" column (the original "Message"
# column is dropped).
# NOTE(review): stringr is used via `::` below without a library() call —
# it must be installed even though it is never attached.
linkedin_data_path <- "import_data/data/LinkedIn/comments/Comments.csv"
library("readr")
library("magrittr")
library("dplyr")
# Read CSV file: honour backslash escapes, trim surrounding whitespace,
# and keep empty rows so record boundaries match the original export.
ddd <- readr::read_delim(linkedin_data_path,
escape_backslash = TRUE,
trim_ws = TRUE,
skip_empty_rows = FALSE,
delim = ",")
# Remove carriage returns: replace any run of \r, \n or \t inside a
# message with one space, store it as MessageFix, then drop Message.
ddd %>%
mutate(MessageFix = Message %>% stringr::str_replace_all(pattern = "[\r\n\t]+", replacement = " ")) %>%
select(-Message) -> ddd2
# Save the cleaned data to a new CSV file (base write.csv; strings are
# quoted by default, so the collapsed messages survive round-tripping).
ddd2 %>% write.csv("import_data/data/LinkedIn/comments/Comments-FIX.csv", row.names = FALSE)

View file

@@ -1,40 +1,14 @@
import pandas as pd
import codecs
import datetime
from utils.documents_to_database import documents_to_database
# In[ ]:
linkedin_data_path = "import_data/data/LinkedIn/comments/Comments.csv"
linkedin_data_path = "data/LinkedIn/comments/Comments-FIX.csv"
# In[ ]:
raw_comments_list = []
with (open(linkedin_data_path, 'r') as f):
current_comment = []
for my_line in f.readlines():
if my_line.startswith("Date,Link,Message"):
headers = my_line.strip().split(",")
else:
# Check if line starts with a ISO 8601 date
try:
datetime.datetime.strptime(str(my_line).split(",")[0], '%Y-%m-%d %H:%M:%S')
date_test = True
except ValueError:
date_test = False
if date_test:
if len(current_comment) == 3:
current_comment[2] = (str(current_comment[2])
.replace('\\"', '"')
.replace("\\'", r"'"))
raw_comments_list.append(current_comment)
current_comment = my_line.strip().split(",", maxsplit=2)
pass
else:
current_comment[2] = current_comment[2] + " " + my_line.strip()
# In[ ]:
raw_comments_csv = pd.DataFrame(raw_comments_list, columns=headers)
raw_comments = raw_comments_csv[(raw_comments_csv['Message'] != "")].drop_duplicates()
raw_comments_csv = pd.read_csv(linkedin_data_path, encoding='utf-8')
raw_comments = raw_comments_csv[(raw_comments_csv['MessageFix'] != "")].drop_duplicates()
# In[ ]:
raw_comments['index'] = "rs_linkedin_comments"
@@ -42,17 +16,13 @@ raw_comments['type'] = "comments"
raw_comments['network'] = "LinkedIn"
raw_comments['chemin'] = linkedin_data_path
# In[ ]:
# Remove empty header
raw_comments = raw_comments[1:].reset_index(drop=True)
# In[ ]:
raw_comments["datepublication"] = raw_comments["Date"].apply(
lambda x: str(datetime.datetime.fromisoformat(str(x)).isoformat()))
del raw_comments["Date"]
# In[ ]:
raw_comments.rename(columns={"Link": "uri", "Message": "texte"}, inplace=True)
raw_comments.rename(columns={"Link": "uri", "MessageFix": "texte"}, inplace=True)
# In[ ]:
raw_comments["chemin"] = linkedin_data_path

View file

@@ -5,12 +5,10 @@ import xmltodict
import pandas as pd
import markdownify
import utils.config
from utils.get_ids import get_idreseausocial, get_idtypedocument
from utils.documents_to_database import documents_to_database
# In[ ]:
wordpress_xml_path = "import_data/data/Wordpress/jevalideca/wordpress.xml"
wordpress_xml_path = "data/Wordpress/jevalideca/wordpress.xml"
with open(wordpress_xml_path, "r") as xml_file:
wordpress_xml = xml_file.read()