From 834f38b15157cd42c1e154bb16fc5bf37c3adba1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Pelletier?= <francois@francoispelletier.org>
Date: Thu, 31 Oct 2019 01:32:02 -0400
Subject: [PATCH] ajout fonction compteurs features

---
 sentiment_analysis.py           |  46 +++++++++--
 sentiment_analysis_functions.py | 138 +++++++++++++++++++++++---------
 2 files changed, 141 insertions(+), 43 deletions(-)

diff --git a/sentiment_analysis.py b/sentiment_analysis.py
index 19b6e0d..36e99dd 100644
--- a/sentiment_analysis.py
+++ b/sentiment_analysis.py
@@ -7,10 +7,12 @@ import sentiment_analysis_functions as sfun
 # nltk.download('punkt')
 # nltk.download('wordnet')
 # nltk.download('stopwords')
+# nltk.download('averaged_perceptron_tagger')
+# nltk.download('universal_tagset')
 
-from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.naive_bayes import MultinomialNB
-from sklearn.linear_model import LogisticRegression
+# from sklearn.feature_extraction.text import CountVectorizer
+# from sklearn.naive_bayes import MultinomialNB
+# from sklearn.linear_model import LogisticRegression
 
 train_pos_reviews_fn = "./data/train-positive-t1.txt"
 train_neg_reviews_fn = "./data/train-negative-t1.txt"
@@ -27,6 +29,7 @@ def load_reviews(filename):
         reviews_list = json.load(fp)
     return reviews_list
 
+
 if __name__ == '__main__':
     train_positive_reviews = load_reviews(train_pos_reviews_fn)
     train_negative_reviews = load_reviews(train_neg_reviews_fn)
@@ -34,13 +37,44 @@ if __name__ == '__main__':
     test_negative_reviews = load_reviews(test_neg_reviews_fn)
     print("Nb of training reviews - positive:", len(train_positive_reviews), "negative:", len(train_negative_reviews))
     print("Nb of test reviews - positive:", len(test_positive_reviews), "negative:", len(test_negative_reviews))
-    
+
     x0 = sfun.tokenize(train_positive_reviews)
     x1 = sfun.norm_stemming(x0)
     x2 = sfun.norm_lemmatize(x0)
-    
+
     x11 = sfun.select_freq(x1)
     x21 = sfun.select_freq(x2)
 
     x12 = sfun.select_rem_stopwords(x1)
-    x22 = sfun.select_rem_stopwords(x2)
\ No newline at end of file
+    x22 = sfun.select_rem_stopwords(x2)
+
+    x13 = sfun.select_open_class(x1)  # open-class selection on the stemmed tokens (x1)
+    x23 = sfun.select_open_class(x2)  # open-class selection on the lemmatized tokens (x2)
+    # Naming: x<1=stemmed|2=lemmatized><1=freq|2=stopwords|3=open class>; 'c' = one flat token list per review.
+    x11c = sfun.corpus_documents(x11)
+    x21c = sfun.corpus_documents(x21)
+    x12c = sfun.corpus_documents(x12)
+    x22c = sfun.corpus_documents(x22)
+    x13c = sfun.corpus_documents(x13)
+    x23c = sfun.corpus_documents(x23)
+    # Third digit 1: term counts per document (CountVectorizer).
+    x111 = sfun.value_count(x11c)
+    x211 = sfun.value_count(x21c)
+    x121 = sfun.value_count(x12c)
+    x221 = sfun.value_count(x22c)
+    x131 = sfun.value_count(x13c)
+    x231 = sfun.value_count(x23c)
+    # Third digit 2: term presence per document (CountVectorizer with binary=True).
+    x112 = sfun.value_occurence(x11c)
+    x212 = sfun.value_occurence(x21c)
+    x122 = sfun.value_occurence(x12c)
+    x222 = sfun.value_occurence(x22c)
+    x132 = sfun.value_occurence(x13c)
+    x232 = sfun.value_occurence(x23c)
+    # Third digit 3: TF-IDF weights per document (TfidfVectorizer).
+    x113 = sfun.value_tfidf(x11c)
+    x213 = sfun.value_tfidf(x21c)
+    x123 = sfun.value_tfidf(x12c)
+    x223 = sfun.value_tfidf(x22c)
+    x133 = sfun.value_tfidf(x13c)
+    x233 = sfun.value_tfidf(x23c)
\ No newline at end of file
diff --git a/sentiment_analysis_functions.py b/sentiment_analysis_functions.py
index 979e85b..4e926f4 100644
--- a/sentiment_analysis_functions.py
+++ b/sentiment_analysis_functions.py
@@ -9,15 +9,17 @@ Created on Sun Oct 27 17:16:54 2019
 import nltk
 import re
 from collections import defaultdict
+from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
+
+
+# Normalisation
 
-## Normalisation
-    
 def tokenize(reviews):
     tokenizer = nltk.tokenize.ToktokTokenizer()
     tokenized_reviews = []
     for review in reviews:
         # Plusieurs fin de phrases étaient représentées par deux espaces ou plus.
-        review = re.sub(r"\s{2,}",". ",review)
+        review = re.sub(r"\s{2,}", ". ", review)
         review = str.lower(review)
         tokenized_sentences = []
         sentences = nltk.sent_tokenize(review)
@@ -31,6 +33,7 @@ def tokenize(reviews):
         tokenized_reviews.append(tokenized_sentences)
     return tokenized_reviews
 
+
 def norm_stemming(tokenized_reviews):
     porter = nltk.PorterStemmer()
     stemmed_reviews = []
@@ -42,6 +45,7 @@ def norm_stemming(tokenized_reviews):
         stemmed_reviews.append(stemmed_review)
     return stemmed_reviews
 
+
 def norm_lemmatize(tokenized_reviews):
     wnl = nltk.WordNetLemmatizer()
     normalized_reviews = []
@@ -54,8 +58,9 @@ def norm_lemmatize(tokenized_reviews):
         normalized_reviews.append(normalized_review)
     return normalized_reviews
 
-## Feature selection
-    
+
+# Feature selection
+
 def dict_frequency(norm_reviews):
     tokens_frequency = defaultdict(int)
     for review in norm_reviews:
@@ -64,54 +69,113 @@ def dict_frequency(norm_reviews):
                 tokens_frequency[token] += 1
     return tokens_frequency
 
+
 def select_freq(norm_reviews):
     tokens_frequency = dict_frequency(norm_reviews)
     norm_reviews_freq3 = [
-     [
-      [token for token in sentence if tokens_frequency[token] > 3] 
-      for sentence in review
-      ] 
-     for review in norm_reviews
-     ]
+        [
+            [token for token in sentence if tokens_frequency[token] > 3]  # keep tokens whose dict_frequency count exceeds 3
+            for sentence in review
+        ]
+        for review in norm_reviews
+    ]  # review/sentence nesting is preserved; a fully filtered sentence stays as an empty list
     return norm_reviews_freq3
-    
+
+
 def select_rem_stopwords(norm_reviews):
     sws = set(nltk.corpus.stopwords.words('english'))
     norm_reviews_stoprem = [
-     [
-      [token for token in sentence if token not in sws] 
-      for sentence in review
-      ] 
-     for review in norm_reviews
-     ]
+        [
+            [token for token in sentence if token not in sws]  # drop tokens found in NLTK's English stopword list
+            for sentence in review
+        ]
+        for review in norm_reviews
+    ]  # review/sentence nesting is preserved; a fully filtered sentence stays as an empty list
     return norm_reviews_stoprem
-    
-def select_open_class(reviews):
-    return 0
-    
-## Attribute value
-    
+
+
+def pos_tag_reviews(norm_reviews):  # POS-tag every review; each review is passed to NLTK as a list of sentences
+    tagged_reviews = []
+    for review in norm_reviews:
+        tagged_review = nltk.pos_tag_sents(review, tagset='universal')  # tags come from the universal tagset
+        tagged_reviews.append(tagged_review)
+    return tagged_reviews  # one tagged entry per input review, in input order
+
+
+def validate_open_class(token_tuple):  # token_tuple is read as (token, tag): [0] is the token, [1] its POS tag
+    # The tagger does not tag the token 'i' correctly, so it is rejected even when its tag is open-class.
+    if token_tuple[1] in ['NOUN','ADJ','ADV','VERB'] and token_tuple[0] != 'i':
+        return True
+    else:
+        return False
+
+
+def select_open_class(norm_reviews):  # keep only the tokens that validate_open_class accepts
+    tagged_reviews = pos_tag_reviews(norm_reviews)  # tokens become (token, tag) pairs
+    select_oc_token = [
+        [
+            [token_tuple[0] for token_tuple in sentence if validate_open_class(token_tuple)]  # keep the token, drop its tag
+            for sentence in review
+        ]
+        for review in tagged_reviews
+    ]
+    return select_oc_token  # same review/sentence nesting as the input
+
+
+# Attribute value
+
+def dummy_function(x):  # identity function; passed as tokenizer/preprocessor to the vectorizers in value_* below
+    return x
+
+def corpus_documents(corpus):  # flatten each document from a list of sentences into one flat token list
+    corpus_documents = []  # NOTE(review): this local shadows the function's own name inside the body
+    for document in corpus:
+        doc_sentences = []
+        for sentence in document:
+            doc_sentences += sentence  # append this sentence's tokens; sentence boundaries are lost
+        corpus_documents.append(doc_sentences)
+    return corpus_documents  # one token list per input document, in input order
+
 def value_count(reviews):
-    return 0
-    
+    cvr = CountVectorizer(analyzer='word',  # word-level features
+                          tokenizer=dummy_function,  # documents are already token lists: pass them through as-is
+                          preprocessor=dummy_function)  # likewise bypass sklearn's string preprocessing
+    feat_value_count = cvr.fit_transform(reviews)  # fit the vocabulary on `reviews` and count terms per document
+    return feat_value_count  # NOTE(review): the fitted vectorizer is local and is not returned
+
+
 def value_occurence(reviews):
-    return 0
-    
+    hvr = CountVectorizer(analyzer='word',  # NOTE(review): despite the name `hvr`, this is a CountVectorizer
+                            tokenizer=dummy_function,  # documents are already token lists: pass them through as-is
+                            preprocessor=dummy_function,
+                            binary=True)  # record presence of a term rather than how many times it occurs
+    feat_value_occurence = hvr.fit_transform(reviews)  # fit the vocabulary on `reviews` and build the matrix
+    return feat_value_occurence  # NOTE(review): the fitted vectorizer is local and is not returned
+
+
 def value_tfidf(reviews):
-    return 0
-    
+    tdvr = TfidfVectorizer(analyzer='word',  # word-level features
+                          tokenizer=dummy_function,  # documents are already token lists: pass them through as-is
+                          preprocessor=dummy_function)  # likewise bypass sklearn's string preprocessing
+    feat_value_tfidf = tdvr.fit_transform(reviews)  # fit vocabulary and IDF on `reviews`, then weight each document
+    return feat_value_tfidf  # NOTE(review): the fitted vectorizer is local and is not returned
+
+
+# Other attributes
+
 def attribute_polarity_count(reviews):
     return 0
-    
+
+
 def attribute_length(reviews):
     return 0
-    
-## Training
-    
+
+
+# Training
+
 def train_naive_model(reviews):
     return 0
-    
+
+
 def train_regression_model(reviews):
     return 0