presentation-sentometrics/Transformation_BD.Rmd

112 lines
2.9 KiB
Text
Raw Normal View History

2019-10-08 04:24:15 +00:00
---
title: "Formatage des données"
author: "François Pelletier"
date: "07/10/2019"
output: html_document
---
```{r}
library("tidyverse")
library("RSQLite")
library("DBI")
library("sentometrics")
```
```{r}
# Open a connection to the local SQLite file "google_news.sqlite"; the chunks
# below read their tables through `con`.
con <- dbConnect(RSQLite::SQLite(), dbname = "google_news.sqlite")
```
# Aperçu
```{r}
# Fetch the first 10 rows of the "core" table and display their column
# names, types and leading values.
tbl(con, "core") %>%
  head(10) %>%
  collect() %>%
  glimpse()
```
# Top 10 de modalités
```{r}
# Ten most frequent values of `site` in the "core" table, most frequent first.
# count(sort = TRUE) does the grouping, counting and descending sort in one
# step and returns an ungrouped result, so the tibble saved below carries no
# leftover grouping on `site`.
top_10_sites <- tbl(con, "core") %>%
  count(site, sort = TRUE) %>%
  head(10) %>%
  collect()

# Persist the ranking so it can be reloaded without querying the database,
# then print it in the rendered document.
saveRDS(top_10_sites, "top_10_sites.RDS")
top_10_sites
```
```{r}
# Ten most frequent values of `country` in the "core" table, most frequent
# first. Empty strings are recoded to "XX" before counting so that rows
# without a country code form one explicit level.
top_10_country <- tbl(con, "core") %>%
  select(country) %>%
  mutate(country = ifelse(country == "", "XX", country)) %>%
  count(country, sort = TRUE) %>%
  head(10) %>%
  collect()

# Persist the ranking so it can be reloaded without querying the database,
# then print it in the rendered document.
saveRDS(top_10_country, "top_10_country.RDS")
top_10_country
```
# Entities features
## Compteurs
```{r}
# Number of rows in the "entities" table for every (uuid, entity_type) pair,
# pulled into memory as column `n`.
entities_count <- tbl(con, "entities") %>%
  group_by(uuid, entity_type) %>%
  count() %>%
  collect()
```
```{r}
# Reshape the counts to wide form: one row per uuid and one column per entity
# type, named "entity_<type>", holding the summed `n` for that pair.
entities_count_t <- entities_count %>%
  reshape2::dcast(
    uuid ~ paste0("entity_", entity_type),
    fun.aggregate = sum,
    value.var = "n"
  )

# Quick look at the first 10 reshaped rows.
entities_count_t %>%
  head(10) %>%
  glimpse()
```
## Aperçu
```{r}
# The 100 most frequent (entity_type, entity) pairs in the "entities" table,
# most frequent first.
tbl(con, "entities") %>%
  group_by(entity_type, entity) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(100) %>%
  collect()
```
# Core features
```{r}
# Build the corpus passed to sento_corpus(): one row per record of the "core"
# table with `id`, `date`, `texts` and 0/1 indicator features for the top-10
# sites, the "blogs" site type and the top-10 countries, followed by the
# per-type entity counts joined on the document id.
core_features_corpus <- tbl(con, "core") %>%
  collect() %>%
  # top_10_country was computed with "" recoded to "XX"; apply the same
  # recoding here, otherwise the indicator for the "XX" level never matches.
  mutate(country = ifelse(country == "", "XX", country)) %>%
  transmute(
    id = uuid,
    date = lubridate::as_datetime(published),
    # Full title and body text, separated by a newline.
    texts = paste(title_full, text, sep = "\n"),
    # Site: one indicator per rank in top_10_sites
    site_01 = ifelse(site == top_10_sites$site[1], 1, 0),
    site_02 = ifelse(site == top_10_sites$site[2], 1, 0),
    site_03 = ifelse(site == top_10_sites$site[3], 1, 0),
    site_04 = ifelse(site == top_10_sites$site[4], 1, 0),
    site_05 = ifelse(site == top_10_sites$site[5], 1, 0),
    site_06 = ifelse(site == top_10_sites$site[6], 1, 0),
    site_07 = ifelse(site == top_10_sites$site[7], 1, 0),
    site_08 = ifelse(site == top_10_sites$site[8], 1, 0),
    site_09 = ifelse(site == top_10_sites$site[9], 1, 0),
    site_10 = ifelse(site == top_10_sites$site[10], 1, 0),
    # Site type
    is_blog = ifelse(site_type == "blogs", 1, 0),
    # Country: one indicator per rank in top_10_country
    country_01 = ifelse(country == top_10_country$country[1], 1, 0),
    country_02 = ifelse(country == top_10_country$country[2], 1, 0),
    country_03 = ifelse(country == top_10_country$country[3], 1, 0),
    country_04 = ifelse(country == top_10_country$country[4], 1, 0),
    country_05 = ifelse(country == top_10_country$country[5], 1, 0),
    country_06 = ifelse(country == top_10_country$country[6], 1, 0),
    country_07 = ifelse(country == top_10_country$country[7], 1, 0),
    country_08 = ifelse(country == top_10_country$country[8], 1, 0),
    country_09 = ifelse(country == top_10_country$country[9], 1, 0),
    country_10 = ifelse(country == top_10_country$country[10], 1, 0)
  ) %>%
  # NOTE(review): ids with no row in entities_count_t get NA in every
  # entity_* column after this left join; confirm that is acceptable input
  # for sento_corpus().
  left_join(entities_count_t, by = c("id" = "uuid")) %>%
  sento_corpus()
```
```{r}
# Persist the corpus object for later use.
saveRDS(core_features_corpus, file = "core_features_corpus.RDS")

# Every query result has been collected into memory by this point, so release
# the SQLite connection opened at the top of the document.
dbDisconnect(con)
```