Data preparation

François Pelletier 2019-10-07 00:20:34 -04:00
parent 5feb3e6c30
commit 01b0f0bd92
2 changed files with 163 additions and 0 deletions

.gitignore vendored (new file, 4 lines)

@@ -0,0 +1,4 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata

Preparation_BD.Rmd (new file, 159 lines)

@@ -0,0 +1,159 @@
---
title: "Presentation Sentometrics"
author: "François Pelletier"
date: "06/10/2019"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
library("jsonlite")
library("tidyverse")
library("RSQLite")
library("DBI")
library("lubridate")
```
```{r}
blog_exemple <- jsonlite::read_json("google_news_blogs/blogs/blogs_0000001.json")
```
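A quick look at the top-level structure of the parsed document (a minimal sketch using base R's `str`; the exact keys depend on the export format of the archive) shows which fields are available before drilling into individual ones:
```{r}
# Show only the first level of the parsed JSON list
str(blog_exemple, max.level = 1)
```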
- Identifier
```{r}
blog_exemple$uuid
```
- Date
```{r}
blog_exemple$published
```
- Content
```{r}
blog_exemple$text
```
- Features
- Persons
```{r}
blog_exemple$entities$persons %>% sapply(FUN = function(x) x$name)
```
- Organizations
```{r}
blog_exemple$entities$organizations %>% sapply(FUN = function(x) x$name)
```
- Locations
```{r}
blog_exemple$entities$locations %>% sapply(FUN = function(x) x$name)
```
## Dataframes
Two helper functions build the dataframes: a core table with one row per document, and an entities table with one row per extracted named entity.
```{r}
# Extract the "name" field from a list of entities; return NA when the list is empty
extract_names <- function(list_entities){
  name_entities <- list_entities %>% sapply(FUN = function(x) x$name)
  if (length(name_entities) > 0) {
    return(name_entities)
  } else {
    return(NA)
  }
}

# Build the core dataframe: one row per document, with missing fields replaced by defaults
generer_core_df <- function(json_contents){
  tibble(uuid = json_contents$uuid %>% coalesce(""),
         site = json_contents$thread$site %>% coalesce(""),
         site_type = json_contents$thread$site_type %>% coalesce(""),
         country = json_contents$thread$country %>% coalesce(""),
         published = lubridate::as_datetime(json_contents$thread$published) %>% coalesce(ISOdate(1900,1,1)),
         title_full = json_contents$thread$title_full %>% coalesce(""),
         text = json_contents$text %>% coalesce(""))
}

# Build the entities dataframe: one row per (uuid, entity_type, entity), dropping empty entries
generer_entities_df <- function(json_contents){
  this_df <- bind_rows(
    tibble(uuid = json_contents$uuid,
           entity_type = "persons",
           entity = json_contents$entities$persons %>% extract_names),
    tibble(uuid = json_contents$uuid,
           entity_type = "organizations",
           entity = json_contents$entities$organizations %>% extract_names),
    tibble(uuid = json_contents$uuid,
           entity_type = "locations",
           entity = json_contents$entities$locations %>% extract_names))
  na.omit(this_df)
}
```
```{r}
core_df <- generer_core_df(blog_exemple)
core_df %>% glimpse
```
```{r}
entities_df <- generer_entities_df(blog_exemple)
entities_df %>% glimpse
```
## Creating the database schemas
```{r}
# Start from a clean database file
if (file.exists("google_news.sqlite")) {
  file.remove("google_news.sqlite")
}
con <- dbConnect(drv = RSQLite::SQLite(), dbname = "google_news.sqlite")
# Create empty tables matching the structure of the example dataframes
dbCreateTable(con, "core", core_df)
dbCreateTable(con, "entities", entities_df)
```
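As a quick sanity check (a minimal sketch using standard DBI helpers), we can confirm that both tables exist and have the expected columns:
```{r}
dbListTables(con)
dbListFields(con, "core")
dbListFields(con, "entities")
```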
## Importing the data
```{r}
file_blogs <- list.files(path = "google_news_blogs/blogs", pattern = "\\.json$", full.names = TRUE)
file_news <- list.files(path = "google_news_blogs/news", pattern = "\\.json$", full.names = TRUE)
```
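Before launching the import, a quick check (a minimal sketch, assuming the archive is laid out in the two folders above) confirms that the JSON files were found:
```{r}
length(file_blogs)
length(file_news)
head(file_blogs, 3)
```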
```{r}
# Read one JSON file, build both dataframes and append them to the database
traiter_json <- function(file_path){
  json_contents <- jsonlite::read_json(file_path)
  core_df <- generer_core_df(json_contents)
  entities_df <- generer_entities_df(json_contents)
  dbAppendTable(con, "core", core_df)
  dbAppendTable(con, "entities", entities_df)
}
i <- 0 # iterator
for (file_blog in file_blogs){
  # Print progress every 1000 files
  if (!(i %% 1000)) {
    print(paste0(i, ": Processing ", file_blog))
  }
  traiter_json(file_blog)
  i <- i + 1
}
```
```{r}
ii <- 0 # iterator
for (file_article in file_news){
  # Print progress every 1000 files
  if (!(ii %% 1000)) {
    print(paste0(ii, ": Processing ", file_article))
  }
  traiter_json(file_article)
  ii <- ii + 1
}
```
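To wrap up, a short verification sketch (assuming the import completed without errors): count the rows loaded into each table, then release the connection:
```{r}
dbGetQuery(con, "SELECT COUNT(*) AS n_core FROM core")
dbGetQuery(con, "SELECT COUNT(*) AS n_entities FROM entities")
dbDisconnect(con)
```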