---
title: "Sentometrics Data Preparation"
author: "François Pelletier"
date: "06/10/2019"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
library("jsonlite")
library("tidyverse")
library("RSQLite")
library("DBI")
library("lubridate")
```

```{r}
blog_exemple <- jsonlite::read_json("google_news_blogs/blogs/blogs_0000001.json")
```

- Identifier

```{r}
blog_exemple$uuid
```

- Date

```{r}
blog_exemple$published
```

- Content

```{r}
blog_exemple$text
```

- Features
  - Persons

```{r}
blog_exemple$entities$persons %>% sapply(FUN = function(x) x$name)
```

  - Organizations

```{r}
blog_exemple$entities$organizations %>% sapply(FUN = function(x) x$name)
```

  - Locations

```{r}
blog_exemple$entities$locations %>% sapply(FUN = function(x) x$name)
```

## Core dataframes

```{r}
extract_names <- function(list_entities){
  # Extract the "name" field of each entity; return NA when the list is empty
  name_entities <- list_entities %>% sapply(FUN = function(x) x$name)
  if (length(name_entities) > 0) return(name_entities) else return(NA)
}

generer_core_df <- function(json_contents){
  # One row per document, with missing fields replaced by default values
  tibble(uuid = json_contents$uuid %>% coalesce(""),
         site = json_contents$thread$site %>% coalesce(""),
         site_type = json_contents$thread$site_type %>% coalesce(""),
         country = json_contents$thread$country %>% coalesce(""),
         published = lubridate::as_datetime(json_contents$thread$published) %>% coalesce(ISOdate(1900, 1, 1)),
         title_full = json_contents$thread$title_full %>% coalesce(""),
         text = json_contents$text %>% coalesce(""))
}

generer_entities_df <- function(json_contents){
  # One row per (document, entity type, entity name); rows without entities are dropped
  this_df <- bind_rows(tibble(uuid = json_contents$uuid,
                              entity_type = "persons",
                              entity = json_contents$entities$persons %>% extract_names),
                       tibble(uuid = json_contents$uuid,
                              entity_type = "organizations",
                              entity = json_contents$entities$organizations %>% extract_names),
                       tibble(uuid = json_contents$uuid,
                              entity_type = "locations",
                              entity = json_contents$entities$locations %>% extract_names))
  na.omit(this_df)
}
```

```{r}
core_df <- generer_core_df(blog_exemple)
core_df %>% glimpse
```

```{r}
entities_df <- generer_entities_df(blog_exemple)
entities_df %>% glimpse
```

## Creating the database schemas

```{r}
if (file.exists("google_news.sqlite")) file.remove("google_news.sqlite")
con <- dbConnect(drv = RSQLite::SQLite(), dbname = "google_news.sqlite")
dbCreateTable(con, "core", core_df)
dbCreateTable(con, "entities", entities_df)
```

## Importing the data

```{r}
file_blogs <- list.files(path = "google_news_blogs/blogs", pattern = "*.json", full.names = TRUE)
file_news <- list.files(path = "google_news_blogs/news", pattern = "*.json", full.names = TRUE)
```

```{r}
traiter_json <- function(file_path){
  # Read one JSON document and append its rows to both tables
  json_contents <- jsonlite::read_json(file_path)
  core_df <- generer_core_df(json_contents)
  entities_df <- generer_entities_df(json_contents)
  dbAppendTable(con, "core", core_df)
  dbAppendTable(con, "entities", entities_df)
}
```

## Processing the files

```{r eval=FALSE}
i <- 0 # iterator
for (file_blog in file_blogs){
  if (!(i %% 1000)){
    print(paste0(i, ": Processing ", file_blog))
  }
  traiter_json(file_blog)
  i <- i + 1
}

ii <- 0 # iterator
for (file_article in file_news){
  if (!(ii %% 1000)){
    print(paste0(ii, ": Processing ", file_article))
  }
  traiter_json(file_article)
  ii <- ii + 1
}
```
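Once both loops have finished, a quick sanity check on the resulting tables and a clean disconnect round out the import. This is a minimal sketch using standard DBI calls; the row counts it returns depend entirely on the local copy of the corpus.

```{r eval=FALSE}
# Row counts per table, as a quick check that the import wrote something
dbGetQuery(con, "SELECT COUNT(*) AS n_core FROM core")
dbGetQuery(con, "SELECT COUNT(*) AS n_entities FROM entities")

# Release the SQLite connection once the import is done
dbDisconnect(con)
```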