data preparation
This commit is contained in:
parent 5feb3e6c30
commit 01b0f0bd92
2 changed files with 163 additions and 0 deletions
.gitignore (vendored, normal file) +4
@@ -0,0 +1,4 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata

Preparation_BD.Rmd (normal file) +159
@@ -0,0 +1,159 @@
---
title: "Presentation Sentometrics"
author: "François Pelletier"
date: "06/10/2019"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
library("jsonlite")
library("tidyverse")
library("RSQLite")
library("DBI")
library("lubridate")
```

```{r}
blog_exemple <- jsonlite::read_json("google_news_blogs/blogs/blogs_0000001.json")
```
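
A quick look at the top-level keys of the parsed list helps situate the fields examined below (a minimal check; the exact keys depend on the export):

```{r}
# Top-level fields available in one parsed JSON document
names(blog_exemple)
```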

- Identifier

```{r}
blog_exemple$uuid
```

- Date

```{r}
blog_exemple$published
```
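
The `published` field is a character string; it is converted with `lubridate::as_datetime()` later when building the core dataframe. As a quick sanity check (assuming the field uses an ISO 8601-like format):

```{r}
# Parse the raw timestamp string into a POSIXct datetime
lubridate::as_datetime(blog_exemple$published)
```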

- Content

```{r}
blog_exemple$text
```

- Features
- Persons

```{r}
blog_exemple$entities$persons %>% sapply(FUN = function(x) x$name)
```

- Organizations

```{r}
blog_exemple$entities$organizations %>% sapply(FUN = function(x) x$name)
```

- Locations

```{r}
blog_exemple$entities$locations %>% sapply(FUN = function(x) x$name)
```

## Dataframes

Core

```{r}
# Return the entity names from a list of entity objects, or NA when the list is empty
extract_names <- function(list_entities){
  name_entities <- list_entities %>% sapply(FUN = function(x) x$name)
  if (length(name_entities) > 0)
    return(name_entities)
  else
    return(NA)
}

# Build a one-row tibble with the core fields of a parsed JSON document,
# replacing missing values with empty strings (or a default date)
generer_core_df <- function(json_contents){
  tibble(uuid = json_contents$uuid %>% coalesce(""),
         site = json_contents$thread$site %>% coalesce(""),
         site_type = json_contents$thread$site_type %>% coalesce(""),
         country = json_contents$thread$country %>% coalesce(""),
         published = lubridate::as_datetime(json_contents$thread$published) %>% coalesce(ISOdate(1900, 1, 1)),
         title_full = json_contents$thread$title_full %>% coalesce(""),
         text = json_contents$text %>% coalesce(""))
}

# Build a long tibble of (uuid, entity_type, entity) rows for persons,
# organizations and locations, dropping entity types with no entries
generer_entities_df <- function(json_contents){
  this_df <- bind_rows(tibble(uuid = json_contents$uuid,
                              entity_type = "persons",
                              entity = json_contents$entities$persons %>% extract_names),
                       tibble(uuid = json_contents$uuid,
                              entity_type = "organizations",
                              entity = json_contents$entities$organizations %>% extract_names),
                       tibble(uuid = json_contents$uuid,
                              entity_type = "locations",
                              entity = json_contents$entities$locations %>% extract_names))
  na.omit(this_df)
}
```

```{r}
core_df <- generer_core_df(blog_exemple)
core_df %>% glimpse
```

```{r}
entities_df <- generer_entities_df(blog_exemple)
entities_df %>% glimpse
```

## Creating the database schemas

```{r}
# Start from a fresh database file
if(file.exists("google_news.sqlite"))
  file.remove("google_news.sqlite")
con <- dbConnect(drv = RSQLite::SQLite(), dbname = "google_news.sqlite")
# Create empty tables whose columns match the example dataframes
dbCreateTable(con, "core", core_df)
dbCreateTable(con, "entities", entities_df)
```
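
As a quick check that both tables exist with the expected columns (a minimal sketch using standard DBI helpers):

```{r}
dbListTables(con)
dbListFields(con, "core")
dbListFields(con, "entities")
```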

## Importing the data

```{r}
file_blogs <- list.files(path = "google_news_blogs/blogs", pattern = "*.json", full.names = TRUE)
file_news <- list.files(path = "google_news_blogs/news", pattern = "*.json", full.names = TRUE)
```

```{r}
# Read one JSON file, build the core and entities dataframes and append them to the database
traiter_json <- function(file_path){
  json_contents <- jsonlite::read_json(file_path)
  core_df <- generer_core_df(json_contents)
  entities_df <- generer_entities_df(json_contents)
  dbAppendTable(con, "core", core_df)
  dbAppendTable(con, "entities", entities_df)
}

i <- 0 # iterator
for (file_blog in file_blogs){
  if(!(i %% 1000)){
    print(paste0(i, ": Processing ", file_blog))
  }
  traiter_json(file_blog)
  i <- i + 1
}
```

```{r}
ii <- 0 # iterator
for (file_article in file_news){
  if(!(ii %% 1000)){
    print(paste0(ii, ": Processing ", file_article))
  }
  traiter_json(file_article)
  ii <- ii + 1
}
```
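
Once both loops have run, a row count per table confirms the import before closing the connection (a minimal check; the counts depend on the corpus):

```{r}
dbGetQuery(con, "SELECT COUNT(*) AS n_core FROM core")
dbGetQuery(con, "SELECT entity_type, COUNT(*) AS n FROM entities GROUP BY entity_type")
dbDisconnect(con)
```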