presentation-sentometrics/Preparation_BD.Rmd

---
title: "Préparation des données Sentometrics"
author: "François Pelletier"
date: "06/10/2019"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
library("jsonlite")
library("tidyverse")
library("RSQLite")
library("DBI")
library("lubridate")
```


```{r}
# Parse one sample blog file; the chunks below explore the fields of the
# resulting nested list.
blog_exemple <- jsonlite::read_json("google_news_blogs/blogs/blogs_0000001.json")
```

- Identifiant

```{r}
# Identifier of the sample document (later stored in the `uuid` column).
blog_exemple$uuid
```

- Date

```{r}
# Top-level `published` field of the sample document.
# NOTE(review): generer_core_df() reads `thread$published` instead - confirm
# the two fields are meant to be interchangeable.
blog_exemple$published
```

- Contenu

```{r}
# Text content of the sample document (later stored in the `text` column).
blog_exemple$text
```

- Features
  - Persons
```{r}
# vapply() always yields a character vector, even when the list is empty
# (sapply() would return an empty list in that case).
vapply(blog_exemple$entities$persons, function(x) x$name, character(1))
```
  - Organizations
```{r}
# vapply() always yields a character vector, even when the list is empty
# (sapply() would return an empty list in that case).
vapply(blog_exemple$entities$organizations, function(x) x$name, character(1))
```
  - Locations
```{r}
# vapply() always yields a character vector, even when the list is empty
# (sapply() would return an empty list in that case).
vapply(blog_exemple$entities$locations, function(x) x$name, character(1))
```  

## Dataframes

Fonctions de construction des tables `core` et `entities`

```{r}
# Extract the `name` element of every entity in a list.
#
# list_entities: list of entities, each itself a list expected to carry a
#   `name` element; may be empty or NULL.
# Returns a character vector with one name per entity (NA_character_ for an
#   entity without `name`), or a single NA_character_ when there is no entity.
extract_names <- function(list_entities) {
  # vapply() pins the result type to character; sapply() would return a list
  # for empty input or when an entity has no `name`.
  name_entities <- vapply(
    list_entities,
    FUN = function(x) {
      nom <- x[["name"]]
      if (is.null(nom)) NA_character_ else as.character(nom)
    },
    FUN.VALUE = character(1)
  )
  if (length(name_entities) == 0L) {
    # Typed NA so the column built from it is character, not logical.
    return(NA_character_)
  }
  name_entities
}

# Build the one-row `core` table for a single parsed JSON document.
#
# json_contents: nested list for one document (as produced by
#   jsonlite::read_json()); the fields read are `uuid`, `text` and, under
#   `thread`, `site`, `site_type`, `country`, `published` and `title_full`.
# Returns a tibble with columns uuid, site, site_type, country, published,
#   title_full and text. A missing text field becomes "" and a missing date
#   becomes ISOdate(1900, 1, 1).
generer_core_df <- function(json_contents) {
  # An absent field is zero-length (NULL), which coalesce() does not replace
  # the way it replaces NA. Convert it to a typed NA first so every column
  # has exactly one value.
  texte_ou_na <- function(valeur) {
    if (length(valeur) == 0L) NA_character_ else valeur
  }

  thread <- json_contents$thread
  published_brut <- thread$published
  if (length(published_brut) == 0L) {
    published_brut <- NA_real_
  }

  tibble(
    uuid = coalesce(texte_ou_na(json_contents$uuid), ""),
    site = coalesce(texte_ou_na(thread$site), ""),
    site_type = coalesce(texte_ou_na(thread$site_type), ""),
    country = coalesce(texte_ou_na(thread$country), ""),
    # ISOdate() defaults to 12:00 GMT, so the sentinel is 1900-01-01 12:00.
    published = coalesce(
      lubridate::as_datetime(published_brut),
      ISOdate(1900, 1, 1)
    ),
    title_full = coalesce(texte_ou_na(thread$title_full), ""),
    text = coalesce(texte_ou_na(json_contents$text), "")
  )
}

# Build the long-format `entities` table for a single parsed JSON document.
#
# json_contents: nested list for one document; reads `uuid` and the
#   `persons`, `organizations` and `locations` lists under `entities`.
# Returns a tibble with columns uuid, entity_type and entity, one row per
#   extracted name. Rows containing NA (for instance an entity type with no
#   entry, for which extract_names() yields NA) are removed.
generer_entities_df <- function(json_contents) {
  types_entites <- c("persons", "organizations", "locations")

  # One small tibble per entity type, all sharing the document's uuid.
  par_type <- lapply(types_entites, function(type_entite) {
    tibble(
      uuid = json_contents$uuid,
      entity_type = type_entite,
      entity = extract_names(json_contents$entities[[type_entite]])
    )
  })

  # End on the expression itself (not an assignment) so the result is
  # returned visibly.
  na.omit(bind_rows(par_type))
}
```

```{r}
# Build the `core` table for the sample document and inspect its columns.
core_df <- generer_core_df(blog_exemple)
glimpse(core_df)
```

```{r}
# Build the `entities` table for the sample document and inspect its columns.
entities_df <- generer_entities_df(blog_exemple)
glimpse(entities_df)
```


## Création des schémas de la base de données

```{r}
# Start from a fresh database file: remove the one left by a previous run.
if (file.exists("google_news.sqlite")) {
  file.remove("google_news.sqlite")
}

# Open the connection and create the two tables, using the example data
# frames as the column definitions.
con <- dbConnect(drv = RSQLite::SQLite(), dbname = "google_news.sqlite")
dbCreateTable(conn = con, name = "core", fields = core_df)
dbCreateTable(conn = con, name = "entities", fields = entities_df)
```

## Importation des données

```{r}
# `pattern` is a regular expression, not a shell glob: anchor on the ".json"
# suffix so that only files ending in .json are listed.
file_blogs <- list.files(
  path = "google_news_blogs/blogs",
  pattern = "\\.json$",
  full.names = TRUE
)
file_news <- list.files(
  path = "google_news_blogs/news",
  pattern = "\\.json$",
  full.names = TRUE
)
```

```{r}
# Read one JSON file and append its rows to the `core` and `entities` tables.
#
# file_path: path of the JSON file to import.
# connexion: DBI connection to write to; defaults to the global `con`, which
#   is looked up when the function is called.
# Returns the value of the last dbAppendTable() call (the `entities` append).
traiter_json <- function(file_path, connexion = con) {
  json_contents <- jsonlite::read_json(file_path)
  core_df <- generer_core_df(json_contents)
  entities_df <- generer_entities_df(json_contents)

  # Both appends run in one transaction, so a failure on `entities` rolls
  # back the `core` row instead of leaving a half-imported document.
  DBI::dbWithTransaction(connexion, {
    dbAppendTable(connexion, "core", core_df)
    dbAppendTable(connexion, "entities", entities_df)
  })
}
```


## Traitement des fichiers

```{r eval=FALSE}
# Import every blog file, reporting progress on the first file and then once
# every 1000 files. The counter is an integer so that paste0() prints it in
# full (a double such as 100000 would be rendered as "1e+05").
i <- 0L # counter of blog files already processed
for (file_blog in file_blogs) {
  if (i %% 1000L == 0L) {
    print(paste0(i, ": Traitement de ", file_blog))
  }
  traiter_json(file_blog)
  i <- i + 1L
}

# Same import, with the same progress reporting, for the news files.
ii <- 0L # counter of news files already processed
for (file_article in file_news) {
  if (ii %% 1000L == 0L) {
    print(paste0(ii, ": Traitement de ", file_article))
  }
  traiter_json(file_article)
  ii <- ii + 1L
}

```
préparation des données 2019-10-07 00:20:34 -04:00			`---`
transformation des données et présentation 2019-10-07 17:50:48 -04:00			`title: "Préparation des données Sentometrics"`
préparation des données 2019-10-07 00:20:34 -04:00			`author: "François Pelletier"`
			`date: "06/10/2019"`
			`output: html_document`
			`---`

			```{r setup, include=FALSE}
			`knitr::opts_chunk$set(echo = TRUE)`
			```

			```{r}
			`library("jsonlite")`
			`library("tidyverse")`
			`library("RSQLite")`
			`library("DBI")`
			`library("lubridate")`
			```


			```{r}
			`blog_exemple <- jsonlite::read_json("google_news_blogs/blogs/blogs_0000001.json")`
			```

			`- Identifiant`

			```{r}
			`blog_exemple$uuid`
			```

			`- Date`

			```{r}
			`blog_exemple$published`
			```

			`- Contenu`

			```{r}
			`blog_exemple$text`
			```

			`- Features`
			`- Persons`
			```{r}
			`blog_exemple$entities$persons %>% sapply(FUN = function(x) x$name)`
			```
			`- Organizations`
			```{r}
			`blog_exemple$entities$organizations %>% sapply(FUN = function(x) x$name)`
			```
			`- Locations`
			```{r}
			`blog_exemple$entities$locations %>% sapply(FUN = function(x) x$name)`
			```

			`## Dataframes`

			`Core`

			```{r}
			`extract_names <- function(list_entities){`
			`name_entities <- list_entities %>% sapply(FUN = function(x) x$name)`
			`if (length(name_entities) > 0)`
			`return(name_entities)`
			`else`
			`return(NA)`
			`}`

			`generer_core_df <- function(json_contents){`
			`tibble(uuid = json_contents$uuid %>% coalesce(""),`
			`site = json_contents$thread$site %>% coalesce(""),`
			`site_type = json_contents$thread$site_type %>% coalesce(""),`
			`country = json_contents$thread$country %>% coalesce(""),`
			`published = lubridate::as_datetime(json_contents$thread$published) %>% coalesce(ISOdate(1900,1,1)),`
			`title_full = json_contents$thread$title_full %>% coalesce(""),`
			`text = json_contents$text %>% coalesce(""))`
			`}`

			`generer_entities_df <- function(json_contents){`
			`this_df <- bind_rows(tibble(uuid = json_contents$uuid,`
			`entity_type="persons",`
			`entity=json_contents$entities$persons %>%`
			`extract_names) ,`
			`tibble(uuid = json_contents$uuid,`
			`entity_type="organizations",`
			`entity=json_contents$entities$organizations %>%`
			`extract_names),`
			`tibble(uuid = json_contents$uuid,`
			`entity_type="locations",`
			`entity=json_contents$entities$locations %>%`
			`extract_names))`
			`this_df <- na.omit(this_df)`
			`}`
			```

			```{r}
			`core_df <- generer_core_df(blog_exemple)`
			`core_df %>% glimpse`
			```

			```{r}
			`entities_df <- generer_entities_df(blog_exemple)`
			`entities_df %>% glimpse`
			```



			`## Création des schémas de la base de données`

			```{r}
			`if(file.exists("google_news.sqlite"))`
			`file.remove("google_news.sqlite")`
			`con = dbConnect(drv = RSQLite::SQLite(), dbname="google_news.sqlite")`
			`dbCreateTable(con,"core",core_df)`
			`dbCreateTable(con,"entities",entities_df)`
			```

			`## Importation des données`

			```{r}
			`file_blogs <- list.files(path = "google_news_blogs/blogs",pattern = "*.json",full.names = TRUE)`
			`file_news <- list.files(path = "google_news_blogs/news",pattern = "*.json",full.names = TRUE)`
			```

			```{r}
			`traiter_json <- function(file_path){`

			`json_contents <- jsonlite::read_json(file_path)`
			`core_df <- generer_core_df(json_contents)`
			`entities_df <- generer_entities_df(json_contents)`
			`dbAppendTable(con,"core",core_df)`
			`dbAppendTable(con,"entities",entities_df)`
			`}`
transformation des données et présentation 2019-10-07 17:50:48 -04:00			```

préparation des données 2019-10-07 00:20:34 -04:00
transformation des données et présentation 2019-10-07 17:50:48 -04:00			`# Traitement des fichiers`

			```{r eval=FALSE}
préparation des données 2019-10-07 00:20:34 -04:00			`i <- 0 # itérateur`
			`for (file_blog in file_blogs){`
			`if(!(i %% 1000)){`
			`print(paste0(i,": Traitement de ",file_blog))`
			`}`
			`traiter_json(file_blog)`
			`i <- i+1`
			`}`

			`ii <- 0 # itérateur`
			`for (file_article in file_news){`
			`if(!(ii %% 1000)){`
			`print(paste0(ii,": Traitement de ",file_article))`
			`}`
			`traiter_json(file_article)`
			`ii <- ii+1`
			`}`
transformation des données et présentation 2019-10-07 17:50:48 -04:00
préparation des données 2019-10-07 00:20:34 -04:00			```