Feature creation completed
This commit is contained in:
parent 834f38b151
commit b9c99e321e
2 changed files with 151 additions and 60 deletions
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import json
import sentiment_analysis_functions as sfun
from scipy.sparse import csr_matrix, hstack

# installation
# import nltk
@@ -9,8 +10,9 @@ import sentiment_analysis_functions as sfun
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('universal_tagset')
# nltk.download('sentiwordnet')


# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression

@@ -37,44 +39,124 @@ if __name__ == '__main__':
    test_negative_reviews = load_reviews(test_neg_reviews_fn)
    print("Nb of training reviews - positive:", len(train_positive_reviews), "negative:", len(train_negative_reviews))
    print("Nb of test reviews - positive:", len(test_positive_reviews), "negative:", len(test_negative_reviews))

    x0 = sfun.tokenize(train_positive_reviews)
    x1 = sfun.norm_stemming(x0)
    x2 = sfun.norm_lemmatize(x0)

    x11 = sfun.select_freq(x1)
    x21 = sfun.select_freq(x2)

    x12 = sfun.select_rem_stopwords(x1)
    x22 = sfun.select_rem_stopwords(x2)

    x13 = sfun.select_open_class(x1)
    x23 = sfun.select_open_class(x2)

    x11c = sfun.corpus_documents(x11)
    x21c = sfun.corpus_documents(x21)
    x12c = sfun.corpus_documents(x12)
    x22c = sfun.corpus_documents(x22)
    x13c = sfun.corpus_documents(x13)
    x23c = sfun.corpus_documents(x23)
    # Train and test datasets

    x111 = sfun.value_count(x11c)
    x211 = sfun.value_count(x21c)
    x121 = sfun.value_count(x12c)
    x221 = sfun.value_count(x22c)
    x131 = sfun.value_count(x13c)
    x231 = sfun.value_count(x23c)
    train_dataset = train_positive_reviews+train_negative_reviews
    train_dataset_response = [1]*len(train_positive_reviews)+[0]*len(train_negative_reviews)

    x112 = sfun.value_occurence(x11c)
    x212 = sfun.value_occurence(x21c)
    x122 = sfun.value_occurence(x12c)
    x222 = sfun.value_occurence(x22c)
    x132 = sfun.value_occurence(x13c)
    x232 = sfun.value_occurence(x23c)
    test_dataset = test_positive_reviews+test_negative_reviews
    test_dataset_response = [1]*len(test_positive_reviews)+[0]*len(test_negative_reviews)

    # Tokenisation

    x113 = sfun.value_tfidf(x11c)
    x213 = sfun.value_tfidf(x21c)
    x123 = sfun.value_tfidf(x12c)
    x223 = sfun.value_tfidf(x22c)
    x133 = sfun.value_tfidf(x13c)
    x233 = sfun.value_tfidf(x23c)
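    # Note: value_count / value_occurence / value_tfidf build raw-count, binary-occurrence
    # and tf-idf document-term matrices respectively (see sentiment_analysis_functions).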
    # Tokenize train
    train_tokens = sfun.tokenize(train_dataset)
    norm_train_tokens = []
    norm_train_tokens.append(sfun.norm_stemming(train_tokens))
    norm_train_tokens.append(sfun.norm_lemmatize(train_tokens))

    # Tokenize test
    test_tokens = sfun.tokenize(test_dataset)
    norm_test_tokens = []
    norm_test_tokens.append(sfun.norm_stemming(test_tokens))
    norm_test_tokens.append(sfun.norm_lemmatize(test_tokens))
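    # norm_train_tokens / norm_test_tokens: index 0 holds the stemmed tokens, index 1 the lemmatized tokens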


    # Normalize and select tokens
    norm_select_train_tokens = []
    norm_select_test_tokens = []
    norm_select_train_tokens_split = []
    norm_select_test_tokens_split = []
    for norm_method in range(0,2):
        # train tokens
        select_train_tokens = []
        select_train_tokens_split = []
        nn1 = sfun.select_freq(norm_train_tokens[norm_method])
        nn2 = sfun.select_rem_stopwords(norm_train_tokens[norm_method])
        nn3 = sfun.select_open_class(norm_train_tokens[norm_method])
        select_train_tokens_split.append(nn1)
        select_train_tokens_split.append(nn2)
        select_train_tokens_split.append(nn3)
        select_train_tokens.append(sfun.corpus_documents(nn1))
        select_train_tokens.append(sfun.corpus_documents(nn2))
        select_train_tokens.append(sfun.corpus_documents(nn3))
        norm_select_train_tokens_split.append(select_train_tokens_split)
        norm_select_train_tokens.append(select_train_tokens)
        # test tokens
        select_test_tokens = []
        select_test_tokens_split = []
        nn1 = sfun.select_freq(norm_test_tokens[norm_method])
        nn2 = sfun.select_rem_stopwords(norm_test_tokens[norm_method])
        nn3 = sfun.select_open_class(norm_test_tokens[norm_method])
        select_test_tokens_split.append(nn1)
        select_test_tokens_split.append(nn2)
        select_test_tokens_split.append(nn3)
        select_test_tokens.append(sfun.corpus_documents(nn1))
        select_test_tokens.append(sfun.corpus_documents(nn2))
        select_test_tokens.append(sfun.corpus_documents(nn3))
        norm_select_test_tokens_split.append(select_test_tokens_split)
        norm_select_test_tokens.append(select_test_tokens)
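    # norm_select_*_tokens[norm][select] hold the selected documents (freq / stop-word / open-class
    # selection); the *_split variants keep the per-review, per-sentence token structure used later
    # by the attribute features.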

    # Create the Vectorizer objects (train only)
    norm_select_vectorizers = []
    for norm_method in range(0,2):
        select_vectorizers = []
        for select_method in range(0,3):
            vectorizers = sfun.get_vectorizers(norm_select_train_tokens[norm_method][select_method])
            select_vectorizers.append(vectorizers)
        norm_select_vectorizers.append(select_vectorizers)
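    # norm_select_vectorizers[norm][select] = [count, binary-occurrence, tf-idf] vectorizers
    # returned by get_vectorizers, all fitted on the training documents only.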

    # Transform the tokens into vectors
    v_norm_select_vectors_train = []
    v_norm_select_vectors_test = []
    for norm_method in range(0,2):
        v_select_vectors_train = []
        v_select_vectors_test = []
        for select_method in range(0,3):
            v_vectors_train = []
            v_vectors_test = []
            for vector_method in range(0,3):
                v_vector_train = norm_select_vectorizers[norm_method][select_method][vector_method].transform(norm_select_train_tokens[norm_method][select_method])
                v_vector_test = norm_select_vectorizers[norm_method][select_method][vector_method].transform(norm_select_test_tokens[norm_method][select_method])
                v_vectors_train.append(v_vector_train)
                v_vectors_test.append(v_vector_test)
            v_select_vectors_train.append(v_vectors_train)
            v_select_vectors_test.append(v_vectors_test)
        v_norm_select_vectors_train.append(v_select_vectors_train)
        v_norm_select_vectors_test.append(v_select_vectors_test)
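    # v_norm_select_vectors_*[norm][select][vector] are sparse document-term matrices;
    # the test side is transformed with the train-fitted vectorizers, so the vocabularies match.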

    # Add the attributes
    v_norm_select_polarity_count_train = []
    v_norm_select_polarity_count_test = []
    for norm_method in range(0,2):
        v_select_polarity_count_train = []
        v_select_polarity_count_test = []
        for select_method in range(0,3):
            v_polarity_count_train = sfun.attribute_polarity_count(norm_select_train_tokens_split[norm_method][select_method])
            v_polarity_count_test = sfun.attribute_polarity_count(norm_select_test_tokens_split[norm_method][select_method])
            v_select_polarity_count_train.append(csr_matrix(v_polarity_count_train))
            v_select_polarity_count_test.append(csr_matrix(v_polarity_count_test))
        v_norm_select_polarity_count_train.append(v_select_polarity_count_train)
        v_norm_select_polarity_count_test.append(v_select_polarity_count_test)
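    # attribute_polarity_count yields one [positive, negative, word_count] row per review
    # (SentiWordNet polarity counts), stored here as CSR matrices so they can be hstack-ed below.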

    # Build the final matrices
    v_final_train = []
    v_final_test = []
    for norm_method in range(0,2):
        v_select_final_train = []
        v_select_final_test = []
        for select_method in range(0,3):
            v_vector_final_train = []
            v_vector_final_test = []
            for vector_method in range(0,3):
                v_vector_final_train.append(hstack([v_norm_select_vectors_train[norm_method][select_method][vector_method],
                                                    v_norm_select_polarity_count_train[norm_method][select_method]]))
                v_vector_final_test.append(hstack([v_norm_select_vectors_test[norm_method][select_method][vector_method],
                                                   v_norm_select_polarity_count_test[norm_method][select_method]]))
            v_select_final_train.append(v_vector_final_train)
            v_select_final_test.append(v_vector_final_test)
        v_final_train.append(v_select_final_train)
        v_final_test.append(v_select_final_test)
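    # v_final_train / v_final_test: 2 normalizations x 3 selections x 3 vectorizations = 18 feature
    # matrices per split, each a document-term matrix with the polarity counts appended.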

    # Model scoring
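    # A minimal scoring sketch (illustrative assumption, not part of this commit), using the
    # LogisticRegression hinted at by the commented-out import at the top of the script:
    #   from sklearn.linear_model import LogisticRegression
    #   clf = LogisticRegression(max_iter=1000)
    #   clf.fit(v_final_train[0][0][0], train_dataset_response)
    #   print(clf.score(v_final_test[0][0][0], test_dataset_response))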
@@ -8,9 +8,11 @@ Created on Sun Oct 27 17:16:54 2019

import nltk
import re
import math
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn

# Normalisation
@@ -127,6 +129,7 @@ def select_open_class(norm_reviews):
def dummy_function(x):
    return x


def corpus_documents(corpus):
    corpus_documents = []
    for document in corpus:
@@ -136,39 +139,45 @@ def corpus_documents(corpus):
        corpus_documents.append(doc_sentences)
    return corpus_documents

def value_count(reviews):

def get_vectorizers(corpus_documents):
    cvr = CountVectorizer(analyzer='word',
                          tokenizer=dummy_function,
                          preprocessor=dummy_function)
    feat_value_count = cvr.fit_transform(reviews)
    return feat_value_count


def value_occurence(reviews):
    hvr = CountVectorizer(analyzer='word',
                          tokenizer=dummy_function,
                          preprocessor=dummy_function,
                          binary=True)
    feat_value_occurence = hvr.fit_transform(reviews)
    return feat_value_occurence


def value_tfidf(reviews):
    tdvr = TfidfVectorizer(analyzer='word',
                           tokenizer=dummy_function,
                           preprocessor=dummy_function)
    feat_value_tfidf = tdvr.fit_transform(reviews)
    return feat_value_tfidf


    cvr.fit(corpus_documents)
    hvr.fit(corpus_documents)
    tdvr.fit(corpus_documents)
    return [cvr,hvr,tdvr]
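# Usage note: get_vectorizers fits all three vectorizers on the same training documents and returns
# them as [count, binary occurrence, tf-idf]; transform() is then applied in the main script.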

# Other attributes

def attribute_polarity_count(reviews):
    return 0


def attribute_length(reviews):
    return 0
def attribute_polarity_count(norm_reviews):
    polarity = []
    for review in norm_reviews:
        polarity_pos=0
        polarity_neg=0
        word_count=0
        for sentence in review:
            for word in sentence:
                word_count += 1
                try:
                    swn_synset = swn.senti_synset(wn.synsets(word)[0].name())
                    if swn_synset.pos_score() > 0.0:
                        polarity_pos += 1
                    if swn_synset.neg_score() > 0.0:
                        polarity_neg += 1
                except:
                    pass
        polarity.append([polarity_pos,polarity_neg,word_count])
    return polarity
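# For each review this returns [positive_count, negative_count, word_count]: a word counts as
# positive or negative when the SentiWordNet scores of its first WordNet synset are above 0;
# words without a synset are skipped by the bare except.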


# Training