From da58ef73c540a8895ffb56a1b522e384360b28da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Pelletier?= <francois@francoispelletier.org>
Date: Sat, 17 Oct 2015 23:49:34 -0400
Subject: [PATCH] =?UTF-8?q?d=C3=A9but=20analyse?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Analyse.Rmd    | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 Extraction.Rmd |  1 +
 2 files changed, 47 insertions(+)
 create mode 100644 Analyse.Rmd

diff --git a/Analyse.Rmd b/Analyse.Rmd
new file mode 100644
index 0000000..a392d86
--- /dev/null
+++ b/Analyse.Rmd
@@ -0,0 +1,46 @@
+---
+title: "Analyse des données"
+author: "François Pelletier"
+date: "17 octobre 2015"
+output: html_document
+---
+
+## Chargement des données
+```{r}
+load("donnees_clean.RData")  # restores rawdata2 (saved by Extraction.Rmd)
+```
+
+## Aperçu quantitatif des données
+
+```{r}
+library(psych)  # NOTE(review): no psych function is called in the chunks visible here
+attach(rawdata2)  # makes the columns of rawdata2 reachable by bare name (e.g. nom_organisme)
+```
+
+## Nom de l'organisme
+
+On remarque, en calculant la fréquence des noms des organismes, que certains d'entre eux présentent des variations dans leur nom. Nous allons corriger ceci en regroupant les noms similaires à l'aide d'un modèle thématique de type Latent Dirichlet Allocation (LDA).
+```{r}
+# Count each raw name; the column is taken from rawdata2, not from attach().
+(freq_nom_organisme <- as.data.frame(table(nom_organisme = rawdata2$nom_organisme)))
+library(tm)
+library(RTextTools)
+library(topicmodels)
+unique_nom_organisme <- unique(rawdata2$nom_organisme)
+# One document per distinct name: French stemming, digits removed, raw term counts.
+organisme_matrix <- create_matrix(as.vector(unique_nom_organisme),
+                        language = "french",
+                        removeNumbers = TRUE,
+                        stemWords = TRUE,
+                        weighting = weightTf)
+# NOTE(review): LDA() is expected to reject all-zero rows -- confirm none remain.
+# Fixed seed so topic assignments are reproducible across knits.
+lda <- LDA(organisme_matrix, k = 154, method = "VEM",
+           control = list(alpha = 0.75, seed = 20151017L))
+nom_topic <- data.frame(nom_organisme = unique_nom_organisme, TOPIC = as.integer(topics(lda)))
+nom_topic_merged <- merge(nom_topic, freq_nom_organisme, by = "nom_organisme")
+nom_topic_sorted <- nom_topic_merged[order(nom_topic_merged$TOPIC, -nom_topic_merged$Freq), ]
+```
+
+
+
diff --git a/Extraction.Rmd b/Extraction.Rmd
index 759d289..aa3e29b 100644
--- a/Extraction.Rmd
+++ b/Extraction.Rmd
@@ -138,6 +138,7 @@ Jointure de tous les fichiers de données dans une même table
 rawdata2 <- dplyr::bind_rows(rawdata)
 
 write.csv(rawdata2, "donnees_clean.csv",quote = TRUE,row.names = FALSE, na = "")
+save(rawdata2,file = "donnees_clean.RData")  # binary copy that Analyse.Rmd reloads with load()
 ```