diff --git a/import_data/21_importation_instagram_publications.py b/import_data/21_importation_instagram_publications.py index ce11c0d..7902b6a 100644 --- a/import_data/21_importation_instagram_publications.py +++ b/import_data/21_importation_instagram_publications.py @@ -3,12 +3,11 @@ import datetime import pandas as pd import json -from utils.get_ids import get_idtypedocument, get_idreseausocial from utils.documents_to_database import documents_to_database from utils.convert_encoding_meta import convert_encoding_meta # In[ ]: -instagram_data_path = 'import_data/data/Instagram/content/posts_1.json' +instagram_data_path = 'data/Instagram/content/posts_1.json' with open(instagram_data_path, "r", encoding="raw-unicode-escape") as posts: posts_json = json.loads(convert_encoding_meta(posts.read())) diff --git a/import_data/32_importation_linkedin_comments.R b/import_data/32_importation_linkedin_comments.R new file mode 100644 index 0000000..eb6c855 --- /dev/null +++ b/import_data/32_importation_linkedin_comments.R @@ -0,0 +1,19 @@ +linkedin_data_path <- "import_data/data/LinkedIn/comments/Comments.csv" +library("readr") +library("magrittr") +library("dplyr") + +# Read CSV file +ddd <- readr::read_delim(linkedin_data_path, + escape_backslash = TRUE, + trim_ws = TRUE, + skip_empty_rows = FALSE, + delim = ",") + +# Remove carriage returns +ddd %>% + mutate(MessageFix = Message %>% stringr::str_replace_all(pattern = "[\r\n\t]+", replacement = " ")) %>% + select(-Message) -> ddd2 + +# Save the cleaned data to a new CSV file +ddd2 %>% write.csv("import_data/data/LinkedIn/comments/Comments-FIX.csv", row.names = FALSE) diff --git a/import_data/32_importation_linkedin_comments.py b/import_data/32_importation_linkedin_comments.py index 783bc72..8e91cb2 100644 --- a/import_data/32_importation_linkedin_comments.py +++ b/import_data/32_importation_linkedin_comments.py @@ -1,40 +1,14 @@ import pandas as pd -import codecs import datetime from utils.documents_to_database import documents_to_database # In[ ]: -linkedin_data_path = "import_data/data/LinkedIn/comments/Comments.csv" +linkedin_data_path = "data/LinkedIn/comments/Comments-FIX.csv" # In[ ]: -raw_comments_list = [] -with (open(linkedin_data_path, 'r') as f): - current_comment = [] - for my_line in f.readlines(): - if my_line.startswith("Date,Link,Message"): - headers = my_line.strip().split(",") - else: - # Check if line starts with a ISO 8601 date - try: - datetime.datetime.strptime(str(my_line).split(",")[0], '%Y-%m-%d %H:%M:%S') - date_test = True - except ValueError: - date_test = False - if date_test: - if len(current_comment) == 3: - current_comment[2] = (str(current_comment[2]) - .replace('\\"', '"') - .replace("\\'", r"'")) - raw_comments_list.append(current_comment) - current_comment = my_line.strip().split(",", maxsplit=2) - pass - else: - current_comment[2] = current_comment[2] + " " + my_line.strip() - -# In[ ]: -raw_comments_csv = pd.DataFrame(raw_comments_list, columns=headers) -raw_comments = raw_comments_csv[(raw_comments_csv['Message'] != "")].drop_duplicates() +raw_comments_csv = pd.read_csv(linkedin_data_path, encoding='utf-8') +raw_comments = raw_comments_csv[(raw_comments_csv['MessageFix'] != "")].drop_duplicates() # In[ ]: raw_comments['index'] = "rs_linkedin_comments" @@ -42,17 +16,13 @@ raw_comments['type'] = "comments" raw_comments['network'] = "LinkedIn" raw_comments['chemin'] = linkedin_data_path -# In[ ]: -# Remove empty header -raw_comments = raw_comments[1:].reset_index(drop=True) - # In[ ]: raw_comments["datepublication"] = raw_comments["Date"].apply( lambda x: str(datetime.datetime.fromisoformat(str(x)).isoformat())) del raw_comments["Date"] # In[ ]: -raw_comments.rename(columns={"Link": "uri", "Message": "texte"}, inplace=True) +raw_comments.rename(columns={"Link": "uri", "MessageFix": "texte"}, inplace=True) # In[ ]: raw_comments["chemin"] = linkedin_data_path diff --git a/import_data/41_importation_wordpress.py b/import_data/41_importation_wordpress.py index 1d68845..1307bb9 100644 --- a/import_data/41_importation_wordpress.py +++ b/import_data/41_importation_wordpress.py @@ -5,12 +5,10 @@ import xmltodict import pandas as pd import markdownify -import utils.config -from utils.get_ids import get_idreseausocial, get_idtypedocument from utils.documents_to_database import documents_to_database # In[ ]: -wordpress_xml_path = "import_data/data/Wordpress/jevalideca/wordpress.xml" +wordpress_xml_path = "data/Wordpress/jevalideca/wordpress.xml" with open(wordpress_xml_path, "r") as xml_file: wordpress_xml = xml_file.read()