From 120a075b616a63f6f08b57e07eca373b91401ca6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Pelletier?= <francois@francoispelletier.org>
Date: Tue, 8 Oct 2019 00:24:15 -0400
Subject: [PATCH] mise a jour Rmd

---
 Analyse_BD.Rmd        | 103 ++++++++++++++++++++++++++++++++++++++++++
 Transformation_BD.Rmd |   8 ++--
 2 files changed, 108 insertions(+), 3 deletions(-)
 create mode 100644 Analyse_BD.Rmd

diff --git a/Analyse_BD.Rmd b/Analyse_BD.Rmd
new file mode 100644
index 0000000..d65877f
--- /dev/null
+++ b/Analyse_BD.Rmd
@@ -0,0 +1,103 @@
+---
+title: "Analyse BD"
+author: "François Pelletier"
+date: "07/10/2019"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE) # sets the knitr chunk option `echo` to TRUE for the chunks that follow
+```
+
+```{r}
+library("sentometrics")
+library("tidyverse")
+library("plotly")
+```
+
+```{r}
+top_10_sites <- readRDS(file = "top_10_sites.RDS") # provides the site labels used in the site plot
+top_10_country <- readRDS(file = "top_10_country.RDS") # provides the country labels used in the country plot
+core_features_corpus.RDS <- readRDS(file = "core_features_corpus.RDS") # corpus object passed to sento_measures() below
+corpusSample <- quanteda::corpus_sample(x = core_features_corpus.RDS, size = 200) # size-200 sample fed to compute_sentiment() below
+```
+
+# Définition des lexiques
+
+```{r}
+data("list_lexicons", package = "sentometrics")
+data("list_valence_shifters", package = "sentometrics")
+
+lexIn <- list_lexicons["FEEL_en_tr"] # single-bracket subset: keeps only the FEEL_en_tr entry
+valIn <- list_valence_shifters[["en"]] # the "en" entry of the valence shifter list
+
+l1 <- sento_lexicons(lexIn, valIn) # lexicon object reused by compute_sentiment() and sento_measures()
+```
+
+# Calcul des sentiments
+
+```{r}
+c_sentiments_sample <- compute_sentiment(
+  x = corpusSample, lexicons = l1,
+  how = "counts", nCore = 8
+) # sentiment computed on the sample only, not on the full corpus
+c_sentiments_sample
+```
+
+```{r}
+c_control_compute <- ctr_agg(
+  howWithin = "proportional", howDocs = "equal_weight",
+  howTime = "equal_weight", by = "day", lag = 7
+) # aggregation control object handed to sento_measures() as `ctr`
+
+c_sentiments <- sento_measures(
+  sento_corpus = core_features_corpus.RDS,
+  lexicons = l1, ctr = c_control_compute
+) # measures built from the full corpus with the lexicons in l1
+```
+
+```{r}
+c_measures <- data.table::as.data.table(c_sentiments) # namespace-qualified: data.table is not attached by this document's library() calls
+```
+
+```{r}
+c_measures_g <- measures_global(c_sentiments) # NOTE(review): c_measures_g is not referenced again in this document
+```
+
+# Sentiment par site
+
+```{r}
+c_measures_melt <- c_measures %>% # reshape the per-site measure columns to long form for plotting
+  select(date, starts_with("FEEL_en_tr--site")) %>%
+  `colnames<-`(c("date", top_10_sites$site)) %>% # relabel by position; assumes column order matches top_10_sites$site - TODO confirm
+  data.table::melt(id.vars = "date", variable.name = "site") # qualified call, full argument name instead of partial `id`
+plot_site <- ggplot(data = c_measures_melt,
+                    aes(x = date, y = value, colour = site)) +
+  geom_line()
+ggplotly(plot_site)
+```
+
+# Sentiment par pays
+
+```{r}
+c_measures_melt <- c_measures %>% # reshape the per-country measure columns to long form for plotting
+  select(date, starts_with("FEEL_en_tr--country")) %>%
+  `colnames<-`(c("date", top_10_country$country)) %>% # relabel by position; assumes column order matches top_10_country$country - TODO confirm
+  data.table::melt(id.vars = "date", variable.name = "country") # qualified call, full argument name instead of partial `id`
+plot_country <- ggplot(data = c_measures_melt,
+                       aes(x = date, y = value, colour = country)) +
+  geom_line()
+ggplotly(plot_country)
+```
+
+# Sentiment par compteur d'entités
+
+```{r}
+c_measures_melt <- c_measures %>% # reshape the entity measure columns to long form; original column names are kept
+  select(date, starts_with("FEEL_en_tr--entity")) %>%
+  data.table::melt(id.vars = "date", variable.name = "entity") # qualified call, full argument name instead of partial `id`
+plot_entity <- ggplot(data = c_measures_melt,
+                      aes(x = date, y = value, colour = entity)) +
+  geom_line()
+ggplotly(plot_entity)
+```
\ No newline at end of file
diff --git a/Transformation_BD.Rmd b/Transformation_BD.Rmd
index f635d52..34e3448 100644
--- a/Transformation_BD.Rmd
+++ b/Transformation_BD.Rmd
@@ -1,4 +1,4 @@
----
+---
 title: "Formatage des données"
 author: "François Pelletier"
 date: "07/10/2019"
@@ -32,6 +32,7 @@ top_10_sites <- tbl(con,"core") %>%
   arrange(desc(n)) %>% 
   head(10) %>% 
   collect()
+saveRDS(top_10_sites,"top_10_sites.RDS")
 top_10_sites
 ```
 
@@ -44,6 +45,7 @@ top_10_country <- tbl(con,"core") %>%
   arrange(desc(n)) %>% 
   head(10) %>% 
   collect()
+saveRDS(top_10_country,"top_10_country.RDS")
 top_10_country
 ```
 
@@ -56,7 +58,7 @@ entities_count <- tbl(con,"entities") %>% group_by(uuid,entity_type) %>% count %
 ```
 
 ```{r}
-entities_count_t <- entities_count %>% reshape2::dcast(uuid~entity_type,fun.aggregate = sum, value.var = "n")
+entities_count_t <- entities_count %>% reshape2::dcast(uuid~paste0("entity_",entity_type),fun.aggregate = sum, value.var = "n")
 entities_count_t %>% head(10) %>% glimpse()
 ```
 
@@ -73,7 +75,7 @@ core_features_corpus <- tbl(con,"core") %>% collect() %>%
   transmute(
     id=uuid,
     date=lubridate::as_datetime(published),
-    texts=paste(title_full,text,sep = "\n") %>% substr(start = 1,stop = 1000),
+    texts=paste(title_full,text,sep = "\n"),
     # Site
     site_01 = ifelse(site==top_10_sites$site[1],1,0),
     site_02 = ifelse(site==top_10_sites$site[2],1,0),