transformation des données et présentation
This commit is contained in:
parent
01b0f0bd92
commit
104d649bb9
4 changed files with 194 additions and 6 deletions
|
@ -1,5 +1,5 @@
|
|||
---
|
||||
title: "Presentation Sentometrics"
|
||||
title: "Préparation des données Sentometrics"
|
||||
author: "François Pelletier"
|
||||
date: "06/10/2019"
|
||||
output: html_document
|
||||
|
@ -124,7 +124,6 @@ file_news <- list.files(path = "google_news_blogs/news",pattern = "*.json",full.
|
|||
```
|
||||
|
||||
```{r}
|
||||
|
||||
traiter_json <- function(file_path){
|
||||
|
||||
json_contents <- jsonlite::read_json(file_path)
|
||||
|
@ -133,7 +132,12 @@ traiter_json <- function(file_path){
|
|||
dbAppendTable(con,"core",core_df)
|
||||
dbAppendTable(con,"entities",entities_df)
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
# Traitement des fichiers
|
||||
|
||||
```{r eval=FALSE}
|
||||
i <- 0 # itérateur
|
||||
for (file_blog in file_blogs){
|
||||
if(!(i %% 1000)){
|
||||
|
@ -143,10 +147,6 @@ for (file_blog in file_blogs){
|
|||
i <- i+1
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
|
||||
```{r}
|
||||
ii <- 0 # itérateur
|
||||
for (file_article in file_news){
|
||||
if(!(ii %% 1000)){
|
||||
|
@ -155,5 +155,6 @@ for (file_article in file_news){
|
|||
traiter_json(file_article)
|
||||
ii <- ii+1
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
|
|
79
Presentation.Rmd
Normal file
79
Presentation.Rmd
Normal file
|
@ -0,0 +1,79 @@
|
|||
---
|
||||
title: "Sentometrics - Présentation CAAMD"
|
||||
author: "François Pelletier"
|
||||
date: "07/10/2019"
|
||||
output:
|
||||
beamer_presentation: default
|
||||
slidy_presentation: default
|
||||
---
|
||||
|
||||
```{r setup, include=FALSE}
|
||||
# Set the default chunk option echo = FALSE for the chunks that follow,
# so their R source is not echoed in the rendered output.
knitr::opts_chunk$set(echo = FALSE)
|
||||
```
|
||||
|
||||
## Sentometrics
|
||||
|
||||
Présentation basée sur un atelier présenté par Keven Bluteau à **R à Québec 2019**.
|
||||
|
||||
D'où vient le nom ?
|
||||
|
||||
- Mélange d'analyse de sentiments et d'économétrie
|
||||
- Type d'analyse de plus en plus fréquent en finance, en marketing et en politique.
|
||||
|
||||
Quelle forme prend le produit ?
|
||||
|
||||
- Package R
|
||||
- Services conseils ($)
|
||||
|
||||
## Article de référence
|
||||
|
||||
[The R Package sentometrics to Compute, Aggregate and Predict with Textual Sentiment](https://ssrn.com/abstract=3067734)
|
||||
|
||||
## Pourquoi ?
|
||||
|
||||
Les données qualitatives sont de plus en plus utilisées pour raffiner les analyses prédictives, car elles donnent une rétroaction sur le passé et le futur, contrairement aux données numériques qui donnent toujours une image passée ou présente d'une réalité.
|
||||
|
||||
## Historique des packages R
|
||||
|
||||
- [tm](https://cran.r-project.org/web/packages/tm/index.html) (2008)
|
||||
- [openNLP](https://cran.r-project.org/web/packages/openNLP/index.html) (2016)
|
||||
- [quanteda](https://cran.r-project.org/web/packages/quanteda/index.html) (2018)
|
||||
- [tidytext](https://cran.r-project.org/web/packages/tidytext/index.html) (2016)
|
||||
|
||||
## Les bases
|
||||
|
||||
Sentometrics est construit sur `quanteda` et `data.table`. Les modèles sont estimés avec `glmnet` et `caret`.
|
||||
|
||||
## Les fonctionnalités
|
||||
|
||||
![](images/sentometrics_functionality.png)
|
||||
|
||||
## Calcul des sentiments
|
||||
|
||||
|
||||
- Unigrammes: somme pondérée des scores pour tous les mots apparaissant dans un lexique
|
||||
- Bigrammes avec décalage de polarité (valence shifting)
|
||||
- Va intégrer l'impact de mots négatifs par exemple (good vs. not good)
|
||||
- Groupements avec décalage de polarité (valence shifting)
|
||||
- Fenêtre mobile avant et après le mot
|
||||
|
||||
## Agrégation des sentiments
|
||||
|
||||
Les sentiments sont agrégés en deux phases :
|
||||
|
||||
- Pour tous les documents durant une période donnée
|
||||
- Pour plusieurs périodes consécutives
|
||||
|
||||
## Création des métriques
|
||||
|
||||
- Agrégation à l'intérieur du document (howWithin)
|
||||
- Agrégation à l'intérieur d'un intervalle de temps (howDocs)
|
||||
- Agrégation au fil du temps (howTime)
|
||||
|
||||
## Modélisation
|
||||
|
||||
- Régression avec Elastic Net
|
||||
- Configuration des hyperparamètres avec `ctr_model()`
|
||||
- Entraînement avec `sento_model()`
|
||||
|
||||
|
108
Transformation_BD.Rmd
Normal file
108
Transformation_BD.Rmd
Normal file
|
@ -0,0 +1,108 @@
|
|||
---
|
||||
title: "Formatage des données"
|
||||
author: "François Pelletier"
|
||||
date: "07/10/2019"
|
||||
output: html_document
|
||||
---
|
||||
|
||||
```{r}
|
||||
library("tidyverse")
|
||||
library("RSQLite")
|
||||
library("DBI")
|
||||
library("sentometrics")
|
||||
```
|
||||
|
||||
|
||||
```{r}
|
||||
# Open the SQLite database file "google_news.sqlite"; `con` is the connection
# used by the chunks below to read the "core" and "entities" tables.
con <- dbConnect(drv = RSQLite::SQLite(), dbname = "google_news.sqlite")
|
||||
```
|
||||
|
||||
# Aperçu
|
||||
|
||||
```{r}
|
||||
# Preview: take the first 10 rows of the "core" table, bring them into R
# and display their column structure.
tbl(con, "core") %>%
  head(10) %>%
  collect() %>%
  glimpse()
|
||||
```
|
||||
|
||||
# Top 10 des modalités
|
||||
```{r}
|
||||
# Ten most frequent values of `site` in the "core" table, most frequent first.
# count(site) groups and tallies in one step and returns an ungrouped result,
# so the collected tibble does not carry a leftover grouping.
top_10_sites <- tbl(con, "core") %>%
  count(site) %>%
  arrange(desc(n)) %>%
  head(10) %>%
  collect()
# Print the ranking in the rendered document.
top_10_sites
|
||||
```
|
||||
|
||||
```{r}
|
||||
# Ten most frequent values of `country` in the "core" table, most frequent
# first. Empty country codes are relabelled "XX" before counting so that they
# show up as an explicit category in the ranking.
# count(country) returns an ungrouped result, so no grouping is left behind.
top_10_country <- tbl(con, "core") %>%
  mutate(country = ifelse(country == "", "XX", country)) %>%
  count(country) %>%
  arrange(desc(n)) %>%
  head(10) %>%
  collect()
# Print the ranking in the rendered document.
top_10_country
|
||||
```
|
||||
|
||||
# Entities features
|
||||
|
||||
## Compteurs
|
||||
|
||||
```{r}
|
||||
# Number of rows of the "entities" table per (uuid, entity_type) pair.
# count() with explicit columns returns an ungrouped result, so the collected
# tibble carries no leftover grouping into the reshape step.
entities_count <- tbl(con, "entities") %>%
  count(uuid, entity_type) %>%
  collect()
|
||||
```
|
||||
|
||||
```{r}
|
||||
# Reshape the counts to wide format: one row per uuid, one column per
# entity_type, each cell holding the sum of `n` for that pair.
entities_count_t <- reshape2::dcast(
  data = entities_count,
  formula = uuid ~ entity_type,
  fun.aggregate = sum,
  value.var = "n"
)
|
||||
```
|
||||
|
||||
## Aperçu
|
||||
|
||||
```{r}
|
||||
# Preview: the 100 most frequent (entity_type, entity) pairs in the
# "entities" table, most frequent first.
tbl(con, "entities") %>%
  group_by(entity_type, entity) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(100) %>%
  collect()
|
||||
```
|
||||
|
||||
# Core features
|
||||
|
||||
```{r}
|
||||
# Build the sentometrics corpus from the "core" table: id, date and texts
# columns first, followed by 0/1 indicator features (rank in the top-10 sites,
# blog flag, rank in the top-10 countries) and the per-uuid entity counts.
core_features_corpus <- tbl(con, "core") %>%
  collect() %>%
  # top_10_country was computed with empty country codes relabelled "XX";
  # apply the same relabelling here, otherwise an "XX" entry in the ranking
  # could never match the raw column and its indicator would be all zeros.
  mutate(country = ifelse(country == "", "XX", country)) %>%
  transmute(
    id = uuid,
    date = lubridate::as_datetime(published),
    # Title and body joined by a newline, truncated to the first 1000 characters
    texts = paste(title_full, text, sep = "\n") %>% substr(start = 1, stop = 1000),
    # Site: one indicator per rank in top_10_sites
    site_01 = ifelse(site == top_10_sites$site[1], 1, 0),
    site_02 = ifelse(site == top_10_sites$site[2], 1, 0),
    site_03 = ifelse(site == top_10_sites$site[3], 1, 0),
    site_04 = ifelse(site == top_10_sites$site[4], 1, 0),
    site_05 = ifelse(site == top_10_sites$site[5], 1, 0),
    site_06 = ifelse(site == top_10_sites$site[6], 1, 0),
    site_07 = ifelse(site == top_10_sites$site[7], 1, 0),
    site_08 = ifelse(site == top_10_sites$site[8], 1, 0),
    site_09 = ifelse(site == top_10_sites$site[9], 1, 0),
    site_10 = ifelse(site == top_10_sites$site[10], 1, 0),
    # Site type
    is_blog = ifelse(site_type == "blogs", 1, 0),
    # Country: one indicator per rank in top_10_country
    country_01 = ifelse(country == top_10_country$country[1], 1, 0),
    country_02 = ifelse(country == top_10_country$country[2], 1, 0),
    country_03 = ifelse(country == top_10_country$country[3], 1, 0),
    country_04 = ifelse(country == top_10_country$country[4], 1, 0),
    country_05 = ifelse(country == top_10_country$country[5], 1, 0),
    country_06 = ifelse(country == top_10_country$country[6], 1, 0),
    country_07 = ifelse(country == top_10_country$country[7], 1, 0),
    country_08 = ifelse(country == top_10_country$country[8], 1, 0),
    country_09 = ifelse(country == top_10_country$country[9], 1, 0),
    country_10 = ifelse(country == top_10_country$country[10], 1, 0)
  ) %>%
  # Rows without a match in entities_count_t get NA in the entity columns.
  # NOTE(review): confirm sento_corpus() handles NA feature values as intended.
  left_join(entities_count_t, by = c("id" = "uuid")) %>%
  sento_corpus()
|
||||
```
|
||||
|
||||
```{r}
|
||||
# Persist the corpus object to "core_features_corpus.RDS".
saveRDS(core_features_corpus, file = "core_features_corpus.RDS")
# All database reads are done at this point: close the SQLite connection
# opened at the top of the document instead of leaving it open.
dbDisconnect(con)
|
||||
```
|
||||
|
||||
|
||||
|
BIN
images/sentometrics_functionality.png
Normal file
BIN
images/sentometrics_functionality.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 111 KiB |
Loading…
Reference in a new issue