111 lines
2.9 KiB
Text
111 lines
2.9 KiB
Text
---
|
|
title: "Formatage des données"
|
|
author: "François Pelletier"
|
|
date: "07/10/2019"
|
|
output: html_document
|
|
---
|
|
|
|
```{r}
|
|
library("tidyverse")
|
|
library("RSQLite")
|
|
library("DBI")
|
|
library("sentometrics")
|
|
```
|
|
|
|
|
|
```{r}
|
|
# Open a connection to the local SQLite file; later chunks read its `core`
# and `entities` tables through `con`.
# The path is relative. NOTE(review): it presumably resolves against the
# directory the document is knitted from -- confirm.
db_path <- "google_news.sqlite"

# RSQLite creates an empty database when the file does not exist, which would
# only surface later as a "no such table" error. Fail early instead.
if (!file.exists(db_path)) {
  stop("SQLite database not found: ", db_path, call. = FALSE)
}

con <- dbConnect(drv = RSQLite::SQLite(), dbname = db_path)
|
|
```
|
|
|
|
# Aperçu
|
|
|
|
```{r}
|
|
# Quick look at the `core` table: fetch only the first 10 rows and print
# their column names and types.
tbl(con, "core") %>%
  head(10) %>%
  collect() %>%
  glimpse()
|
|
```
|
|
|
|
# Top 10 de modalités
|
|
```{r}
|
|
# Ten most frequent values of `site` in the `core` table. The pipeline is
# built on the lazy database table, so only the final rows are pulled into R
# by collect().
top_10_sites <- tbl(con, "core") %>%
  select(site) %>%
  group_by(site) %>%
  count() %>%
  # Drop the grouping so the collected (and saved) tibble is a plain table
  # rather than one still grouped by `site`.
  ungroup() %>%
  # `site` is a secondary sort key: without it, sites tied on `n` may come
  # back in an unspecified order and the top-10 cut may vary between runs.
  arrange(desc(n), site) %>%
  head(10) %>%
  collect()

# Persist the ranking; the core-features chunk below also reads
# `top_10_sites` from the session.
saveRDS(top_10_sites, file = "top_10_sites.RDS")
top_10_sites
|
|
```
|
|
|
|
```{r}
|
|
# Ten most frequent values of `country` in the `core` table, with the empty
# string relabelled "XX" so that it shows up as its own category.
top_10_country <- tbl(con, "core") %>%
  select(country) %>%
  mutate(country = ifelse(country == "", "XX", country)) %>%
  group_by(country) %>%
  count() %>%
  # Drop the grouping so the collected (and saved) tibble is a plain table
  # rather than one still grouped by `country`.
  ungroup() %>%
  # `country` is a secondary sort key: without it, countries tied on `n` may
  # come back in an unspecified order and the top-10 cut may vary between runs.
  arrange(desc(n), country) %>%
  head(10) %>%
  collect()

# Persist the ranking; the core-features chunk below also reads
# `top_10_country` from the session.
saveRDS(top_10_country, file = "top_10_country.RDS")
top_10_country
|
|
```
|
|
|
|
# Entity features
|
|
|
|
## Compteurs
|
|
|
|
```{r}
|
|
# Number of rows of the `entities` table for each (uuid, entity_type) pair.
# The count is expressed on the lazy table and then collected into R.
entities_count <- tbl(con, "entities") %>%
  group_by(uuid, entity_type) %>%
  count() %>%
  collect()
|
|
```
|
|
|
|
```{r}
|
|
# Reshape the long counts to one row per uuid and one column per entity type.
# Column names are the entity type prefixed with "entity_"; cells hold the
# sum of `n` for that uuid/type.
entities_count_t <- reshape2::dcast(
  data = entities_count,
  formula = uuid ~ paste0("entity_", entity_type),
  fun.aggregate = sum,
  value.var = "n"
)

# Show the structure of the first 10 rows of the wide table.
entities_count_t %>%
  head(10) %>%
  glimpse()
|
|
```
|
|
|
|
## Aperçu
|
|
|
|
```{r}
|
|
# The 100 most frequent (entity_type, entity) pairs in the `entities` table,
# sorted by decreasing count.
tbl(con, "entities") %>%
  group_by(entity_type, entity) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(100) %>%
  collect()
|
|
```
|
|
|
|
# Core features
|
|
|
|
```{r}
|
|
# Build one row of features per document from the `core` table and hand the
# result to sentometrics::sento_corpus().
#
# Columns produced:
#   id, date, texts        -- uuid, parsed `published`, title + "\n" + text
#   site_01 .. site_10     -- 1/0 indicator for each row of `top_10_sites`
#   is_blog                -- 1/0 indicator for site_type == "blogs"
#   country_01 .. _10      -- 1/0 indicator for each row of `top_10_country`
#   entity_*               -- per-type entity counts joined from
#                             `entities_count_t`
#
# NOTE(review): the id/date/texts names are presumably what sento_corpus()
# expects -- confirm against the sentometrics documentation.
# NOTE(review): if a top-10 table has fewer than 10 rows, or `site`/`country`
# is NA, the `==` comparison yields NA and so does the indicator.
# NOTE(review): documents with no row in `entities_count_t` get NA (not 0) in
# the entity_* columns after the left join -- confirm this is intended.
core_features_corpus <- tbl(con, "core") %>%
  collect() %>%
  # `top_10_country` was computed with empty country strings relabelled "XX".
  # Apply the same relabelling here; otherwise an "XX" entry in the top 10
  # could never match the raw column and its indicator would be all zeros.
  mutate(country = ifelse(country == "", "XX", country)) %>%
  transmute(
    id = uuid,
    date = lubridate::as_datetime(published),
    texts = paste(title_full, text, sep = "\n"),
    # Site
    site_01 = ifelse(site == top_10_sites$site[1], 1, 0),
    site_02 = ifelse(site == top_10_sites$site[2], 1, 0),
    site_03 = ifelse(site == top_10_sites$site[3], 1, 0),
    site_04 = ifelse(site == top_10_sites$site[4], 1, 0),
    site_05 = ifelse(site == top_10_sites$site[5], 1, 0),
    site_06 = ifelse(site == top_10_sites$site[6], 1, 0),
    site_07 = ifelse(site == top_10_sites$site[7], 1, 0),
    site_08 = ifelse(site == top_10_sites$site[8], 1, 0),
    site_09 = ifelse(site == top_10_sites$site[9], 1, 0),
    site_10 = ifelse(site == top_10_sites$site[10], 1, 0),
    # Site type
    is_blog = ifelse(site_type == "blogs", 1, 0),
    # Country
    country_01 = ifelse(country == top_10_country$country[1], 1, 0),
    country_02 = ifelse(country == top_10_country$country[2], 1, 0),
    country_03 = ifelse(country == top_10_country$country[3], 1, 0),
    country_04 = ifelse(country == top_10_country$country[4], 1, 0),
    country_05 = ifelse(country == top_10_country$country[5], 1, 0),
    country_06 = ifelse(country == top_10_country$country[6], 1, 0),
    country_07 = ifelse(country == top_10_country$country[7], 1, 0),
    country_08 = ifelse(country == top_10_country$country[8], 1, 0),
    country_09 = ifelse(country == top_10_country$country[9], 1, 0),
    country_10 = ifelse(country == top_10_country$country[10], 1, 0)
  ) %>%
  # Attach the wide entity counts; `id` here corresponds to `uuid` there.
  left_join(entities_count_t, by = c("id" = "uuid")) %>%
  sento_corpus()
|
|
```
|
|
|
|
```{r}
|
|
# Persist the corpus built in the previous chunk as an RDS file.
saveRDS(core_features_corpus, file = "core_features_corpus.RDS")

# This is the last chunk of the document, so release the SQLite connection
# opened at the top instead of leaving it open until the session ends.
dbDisconnect(con)
|
|
```
|
|
|
|
|
|
|