From 120a075b616a63f6f08b57e07eca373b91401ca6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Pelletier?= Date: Tue, 8 Oct 2019 00:24:15 -0400 Subject: [PATCH] mise a jour Rmd --- Analyse_BD.Rmd | 103 ++++++++++++++++++++++++++++++++++++++++++ Transformation_BD.Rmd | 8 ++-- 2 files changed, 108 insertions(+), 3 deletions(-) create mode 100644 Analyse_BD.Rmd diff --git a/Analyse_BD.Rmd b/Analyse_BD.Rmd new file mode 100644 index 0000000..d65877f --- /dev/null +++ b/Analyse_BD.Rmd @@ -0,0 +1,103 @@ +--- +title: "Analyse BD" +author: "François Pelletier" +date: "07/10/2019" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +```{r} +library("sentometrics") +library("tidyverse") +library("plotly") +``` + +```{r} +core_features_corpus.RDS <- readRDS("core_features_corpus.RDS") +top_10_country <- readRDS("top_10_country.RDS") +top_10_sites <- readRDS("top_10_sites.RDS") +corpusSample <- quanteda::corpus_sample(core_features_corpus.RDS, size = 200) +``` + +# Définition des lexiques + +```{r} +data("list_valence_shifters", package = "sentometrics") +data("list_lexicons", package = "sentometrics") + +lexIn <- list_lexicons[c("FEEL_en_tr")] +valIn <- list_valence_shifters[["en"]] + +l1 <- sento_lexicons(lexIn,valIn) +``` + +# Calcul des sentiments + +```{r} +c_sentiments_sample <- compute_sentiment(x = corpusSample, + lexicons = l1, + how = "counts", + nCore = 8) +c_sentiments_sample +``` + +```{r} +c_control_compute <- ctr_agg(howWithin = "proportional", + howDocs = "equal_weight", + howTime = "equal_weight", + lag = 7, + by = "day") + +c_sentiments <- sento_measures(sento_corpus = core_features_corpus.RDS, + lexicons = l1, + ctr = c_control_compute) +``` + +```{r} +c_measures <- as.data.table(c_sentiments) +``` + +```{r} +c_measures_g <- measures_global(c_sentiments) +``` + +# Sentiment par site + +```{r} +c_measures_melt <- c_measures %>% + select(date,starts_with("FEEL_en_tr--site")) %>% + `colnames<-`(c("date",top_10_sites$site)) %>% + melt(id="date",variable.name = "site") +plot_site <- ggplot(data=c_measures_melt, + aes(x=date, y=value, colour=site))+ + geom_line() +ggplotly(plot_site) +``` + +# Sentiment par pays + +```{r} +c_measures_melt <- c_measures %>% + select(date,starts_with("FEEL_en_tr--country")) %>% + `colnames<-`(c("date",top_10_country$country)) %>% + melt(id="date",variable.name = "country") +plot_country <- ggplot(data=c_measures_melt, + aes(x=date, y=value, colour=country))+ + geom_line() +ggplotly(plot_country) +``` + +# Sentiment par compteur d'entités + +```{r} +c_measures_melt <- c_measures %>% + select(date,starts_with("FEEL_en_tr--entity")) %>% + melt(id="date",variable.name = "entity") +plot_entity <- ggplot(data=c_measures_melt, + aes(x=date, y=value, colour=entity))+ + geom_line() +ggplotly(plot_entity) +``` \ No newline at end of file diff --git a/Transformation_BD.Rmd b/Transformation_BD.Rmd index f635d52..34e3448 100644 --- a/Transformation_BD.Rmd +++ b/Transformation_BD.Rmd @@ -1,4 +1,4 @@ ---- + --- title: "Formatage des données" author: "François Pelletier" date: "07/10/2019" @@ -32,6 +32,7 @@ top_10_sites <- tbl(con,"core") %>% arrange(desc(n)) %>% head(10) %>% collect() +saveRDS(top_10_sites,"top_10_sites.RDS") top_10_sites ``` @@ -44,6 +45,7 @@ top_10_country <- tbl(con,"core") %>% arrange(desc(n)) %>% head(10) %>% collect() +saveRDS(top_10_country,"top_10_country.RDS") top_10_country ``` @@ -56,7 +58,7 @@ entities_count <- tbl(con,"entities") %>% group_by(uuid,entity_type) %>% count % ``` ```{r} -entities_count_t <- entities_count %>% reshape2::dcast(uuid~entity_type,fun.aggregate = sum, value.var = "n") +entities_count_t <- entities_count %>% reshape2::dcast(uuid~paste0("entity_",entity_type),fun.aggregate = sum, value.var = "n") entities_count_t %>% head(10) %>% glimpse() ``` @@ -73,7 +75,7 @@ core_features_corpus <- tbl(con,"core") %>% collect() %>% transmute( id=uuid, date=lubridate::as_datetime(published), - texts=paste(title_full,text,sep = "\n") %>% substr(start = 1,stop = 1000), + texts=paste(title_full,text,sep = "\n"), # Site site_01 = ifelse(site==top_10_sites$site[1],1,0), site_02 = ifelse(site==top_10_sites$site[2],1,0),