---
title: "Formatage des données"
author: "François Pelletier"
date: "07/10/2019"
output: html_document
---

```{r}
library("tidyverse")
library("RSQLite")
library("DBI")
library("sentometrics")
```

```{r}
# Open the SQLite database holding the `core` and `entities` tables.
# The connection is closed in the last chunk of this document.
con <- dbConnect(drv = RSQLite::SQLite(), dbname = "google_news.sqlite")
```

# Aperçu

```{r}
tbl(con, "core") %>%
  head(10) %>%
  collect() %>%
  glimpse()
```

# Top 10 de modalités

```{r}
# Ten most frequent values of `site`, counted in the database.
# The result is saved because later steps (and possibly other scripts)
# need the same ranking.
top_10_sites <- tbl(con, "core") %>%
  select(site) %>%
  count(site) %>%
  arrange(desc(n)) %>%
  head(10) %>%
  collect()
saveRDS(top_10_sites, "top_10_sites.RDS")
top_10_sites
```

```{r}
# Ten most frequent values of `country`. Empty country codes are recoded
# to "XX" before counting; the same recoding is applied to the documents
# in the "Core features" section so that both sides stay comparable.
top_10_country <- tbl(con, "core") %>%
  select(country) %>%
  mutate(country = ifelse(country == "", "XX", country)) %>%
  count(country) %>%
  arrange(desc(n)) %>%
  head(10) %>%
  collect()
saveRDS(top_10_country, "top_10_country.RDS")
top_10_country
```

# Entities features

## Compteurs

```{r}
# Number of rows of the `entities` table per document (`uuid`) and
# entity type.
entities_count <- tbl(con, "entities") %>%
  count(uuid, entity_type) %>%
  collect()
```

```{r}
# One row per document, one `entity_<type>` column per entity type.
# With `fun.aggregate = sum`, a type that is absent for a document gets 0.
entities_count_t <- entities_count %>%
  reshape2::dcast(
    uuid ~ paste0("entity_", entity_type),
    fun.aggregate = sum,
    value.var = "n"
  )
entities_count_t %>%
  head(10) %>%
  glimpse()
```

## Aperçu

```{r}
tbl(con, "entities") %>%
  count(entity_type, entity) %>%
  arrange(desc(n)) %>%
  head(100) %>%
  collect()
```

# Core features

```{r}
# 1/0 indicator of `x == level`.
# A missing value in `x`, or a missing `level` (fewer top modalities than
# requested), yields 0 instead of NA.
is_level <- function(x, level) {
  as.numeric(coalesce(x == level, FALSE))
}

# Tibble with one indicator column per entry of `levels[1:n]`, named
# `<prefix>_01`, `<prefix>_02`, ... in rank order.
top_level_dummies <- function(x, levels, prefix, n = 10L) {
  ranks <- seq_len(n)
  dummies <- lapply(ranks, function(k) is_level(x, levels[k]))
  names(dummies) <- sprintf("%s_%02d", prefix, ranks)
  as_tibble(dummies)
}
```

```{r}
# Apply the same empty-country recoding as for `top_10_country`; without it
# the dummy for "XX" could never match a document.
core_raw <- tbl(con, "core") %>%
  mutate(country = ifelse(country == "", "XX", country)) %>%
  collect()

# Entity-count columns, used to fill in documents that have no entity at all.
entity_cols <- setdiff(names(entities_count_t), "uuid")
zero_counts <- setNames(as.list(rep(0, length(entity_cols))), entity_cols)

core_features_corpus <- bind_cols(
  # Document identifier, publication date and full text (title + body)
  transmute(
    core_raw,
    id = uuid,
    date = lubridate::as_datetime(published),
    texts = paste(title_full, text, sep = "\n")
  ),
  # Site
  top_level_dummies(core_raw$site, top_10_sites$site, "site"),
  # Site type
  tibble(is_blog = is_level(core_raw$site_type, "blogs")),
  # Country
  top_level_dummies(core_raw$country, top_10_country$country, "country")
) %>%
  left_join(entities_count_t, by = c("id" = "uuid")) %>%
  # Documents without any row in `entities` come out of the join with NA
  # counts; set them to 0, consistent with the 0 that `dcast()` gives to
  # entity types absent from a document.
  replace_na(zero_counts) %>%
  sento_corpus()
```

```{r}
saveRDS(core_features_corpus, file = "core_features_corpus.RDS")
```

```{r}
# Release the database connection opened at the top of the document.
dbDisconnect(con)
```