From b9c99e321e567cef3d2e3d95d0aaeec0d2dafdbc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Pelletier?=
Date: Thu, 31 Oct 2019 22:35:24 -0400
Subject: [PATCH] =?UTF-8?q?cr=C3=A9ation=20des=20features=20termin=C3=A9e?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 sentiment_analysis.py           | 158 ++++++++++++++++++++++++--------
 sentiment_analysis_functions.py |  53 ++++++-----
 2 files changed, 151 insertions(+), 60 deletions(-)

diff --git a/sentiment_analysis.py b/sentiment_analysis.py
index 36e99dd..4847b00 100644
--- a/sentiment_analysis.py
+++ b/sentiment_analysis.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 import json
 import sentiment_analysis_functions as sfun
+from scipy.sparse import csr_matrix, hstack
 
 # installation
 # import nltk
@@ -9,8 +10,9 @@ import sentiment_analysis_functions as sfun
 # nltk.download('stopwords')
 # nltk.download('averaged_perceptron_tagger')
 # nltk.download('universal_tagset')
+# nltk.download('sentiwordnet')
+
 
-# from sklearn.feature_extraction.text import CountVectorizer
 # from sklearn.naive_bayes import MultinomialNB
 # from sklearn.linear_model import LogisticRegression
@@ -37,44 +39,124 @@ if __name__ == '__main__':
     test_negative_reviews = load_reviews(test_neg_reviews_fn)
     print("Nb of training reviews - positive:", len(train_positive_reviews), "negative:", len(train_negative_reviews))
     print("Nb of test reviews - positive:", len(test_positive_reviews), "negative:", len(test_negative_reviews))
-
-    x0 = sfun.tokenize(train_positive_reviews)
-    x1 = sfun.norm_stemming(x0)
-    x2 = sfun.norm_lemmatize(x0)
-
-    x11 = sfun.select_freq(x1)
-    x21 = sfun.select_freq(x2)
-
-    x12 = sfun.select_rem_stopwords(x1)
-    x22 = sfun.select_rem_stopwords(x2)
-
-    x13 = sfun.select_open_class(x1)
-    x23 = sfun.select_open_class(x2)
-    x11c = sfun.corpus_documents(x11)
-    x21c = sfun.corpus_documents(x21)
-    x12c = sfun.corpus_documents(x12)
-    x22c = sfun.corpus_documents(x22)
-    x13c = sfun.corpus_documents(x13)
-    x23c = sfun.corpus_documents(x23)
+
+    # Train and test datasets
-    x111 = sfun.value_count(x11c)
-    x211 = sfun.value_count(x21c)
-    x121 = sfun.value_count(x12c)
-    x221 = sfun.value_count(x22c)
-    x131 = sfun.value_count(x13c)
-    x231 = sfun.value_count(x23c)
+    train_dataset = train_positive_reviews + train_negative_reviews
+    train_dataset_response = [1] * len(train_positive_reviews) + [0] * len(train_negative_reviews)
-    x112 = sfun.value_occurence(x11c)
-    x212 = sfun.value_occurence(x21c)
-    x122 = sfun.value_occurence(x12c)
-    x222 = sfun.value_occurence(x22c)
-    x132 = sfun.value_occurence(x13c)
-    x232 = sfun.value_occurence(x23c)
+    test_dataset = test_positive_reviews + test_negative_reviews
+    test_dataset_response = [1] * len(test_positive_reviews) + [0] * len(test_negative_reviews)
+
+    # Tokenization
-    x113 = sfun.value_tfidf(x11c)
-    x213 = sfun.value_tfidf(x21c)
-    x123 = sfun.value_tfidf(x12c)
-    x223 = sfun.value_tfidf(x22c)
-    x133 = sfun.value_tfidf(x13c)
-    x233 = sfun.value_tfidf(x23c)
\ No newline at end of file
+    # Tokenize train
+    train_tokens = sfun.tokenize(train_dataset)
+    norm_train_tokens = []
+    norm_train_tokens.append(sfun.norm_stemming(train_tokens))
+    norm_train_tokens.append(sfun.norm_lemmatize(train_tokens))
+
+    # Tokenize test
+    test_tokens = sfun.tokenize(test_dataset)
+    norm_test_tokens = []
+    norm_test_tokens.append(sfun.norm_stemming(test_tokens))
+    norm_test_tokens.append(sfun.norm_lemmatize(test_tokens))
+
+    # Normalize and select tokens
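+    # Index conventions for the nested lists built below:
+    #   norm_method:   0 = stemming, 1 = lemmatization
+    #   select_method: 0 = frequency filter, 1 = stopword removal, 2 = open-class words
+    #   vector_method: 0 = raw counts, 1 = binary occurrence, 2 = tf-idf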
+    norm_select_train_tokens = []
+    norm_select_test_tokens = []
+    norm_select_train_tokens_split = []
+    norm_select_test_tokens_split = []
+    for norm_method in range(2):
+        # train tokens
+        select_train_tokens = []
+        select_train_tokens_split = []
+        nn1 = sfun.select_freq(norm_train_tokens[norm_method])
+        nn2 = sfun.select_rem_stopwords(norm_train_tokens[norm_method])
+        nn3 = sfun.select_open_class(norm_train_tokens[norm_method])
+        select_train_tokens_split.append(nn1)
+        select_train_tokens_split.append(nn2)
+        select_train_tokens_split.append(nn3)
+        select_train_tokens.append(sfun.corpus_documents(nn1))
+        select_train_tokens.append(sfun.corpus_documents(nn2))
+        select_train_tokens.append(sfun.corpus_documents(nn3))
+        norm_select_train_tokens_split.append(select_train_tokens_split)
+        norm_select_train_tokens.append(select_train_tokens)
+        # test tokens
+        select_test_tokens = []
+        select_test_tokens_split = []
+        nn1 = sfun.select_freq(norm_test_tokens[norm_method])
+        nn2 = sfun.select_rem_stopwords(norm_test_tokens[norm_method])
+        nn3 = sfun.select_open_class(norm_test_tokens[norm_method])
+        select_test_tokens_split.append(nn1)
+        select_test_tokens_split.append(nn2)
+        select_test_tokens_split.append(nn3)
+        select_test_tokens.append(sfun.corpus_documents(nn1))
+        select_test_tokens.append(sfun.corpus_documents(nn2))
+        select_test_tokens.append(sfun.corpus_documents(nn3))
+        norm_select_test_tokens_split.append(select_test_tokens_split)
+        norm_select_test_tokens.append(select_test_tokens)
+
+    # Create the Vectorizer objects (fitted on the train set only)
+    norm_select_vectorizers = []
+    for norm_method in range(2):
+        select_vectorizers = []
+        for select_method in range(3):
+            vectorizers = sfun.get_vectorizers(norm_select_train_tokens[norm_method][select_method])
+            select_vectorizers.append(vectorizers)
+        norm_select_vectorizers.append(select_vectorizers)
+
+    # Transform the tokens into vectors
+    v_norm_select_vectors_train = []
+    v_norm_select_vectors_test = []
+    for norm_method in range(2):
+        v_select_vectors_train = []
+        v_select_vectors_test = []
+        for select_method in range(3):
+            v_vectors_train = []
+            v_vectors_test = []
+            for vector_method in range(3):
+                v_vector_train = norm_select_vectorizers[norm_method][select_method][vector_method].transform(norm_select_train_tokens[norm_method][select_method])
+                v_vector_test = norm_select_vectorizers[norm_method][select_method][vector_method].transform(norm_select_test_tokens[norm_method][select_method])
+                v_vectors_train.append(v_vector_train)
+                v_vectors_test.append(v_vector_test)
+            v_select_vectors_train.append(v_vectors_train)
+            v_select_vectors_test.append(v_vectors_test)
+        v_norm_select_vectors_train.append(v_select_vectors_train)
+        v_norm_select_vectors_test.append(v_select_vectors_test)
+
+    # Add the polarity-count attributes
+    v_norm_select_polarity_count_train = []
+    v_norm_select_polarity_count_test = []
+    for norm_method in range(2):
+        v_select_polarity_count_train = []
+        v_select_polarity_count_test = []
+        for select_method in range(3):
+            v_polarity_count_train = sfun.attribute_polarity_count(norm_select_train_tokens_split[norm_method][select_method])
+            v_polarity_count_test = sfun.attribute_polarity_count(norm_select_test_tokens_split[norm_method][select_method])
+            v_select_polarity_count_train.append(csr_matrix(v_polarity_count_train))
+            v_select_polarity_count_test.append(csr_matrix(v_polarity_count_test))
+        v_norm_select_polarity_count_train.append(v_select_polarity_count_train)
+        v_norm_select_polarity_count_test.append(v_select_polarity_count_test)
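+    # The polarity counts are plain Python lists, so they are wrapped in
+    # csr_matrix above; this lets hstack below concatenate them with the
+    # sparse vectorizer output without converting everything to dense.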
+
+    # Build the final feature matrices
+    v_final_train = []
+    v_final_test = []
+    for norm_method in range(2):
+        v_select_final_train = []
+        v_select_final_test = []
+        for select_method in range(3):
+            v_vector_final_train = []
+            v_vector_final_test = []
+            for vector_method in range(3):
+                v_vector_final_train.append(hstack([v_norm_select_vectors_train[norm_method][select_method][vector_method],
+                                                    v_norm_select_polarity_count_train[norm_method][select_method]]))
+                v_vector_final_test.append(hstack([v_norm_select_vectors_test[norm_method][select_method][vector_method],
+                                                   v_norm_select_polarity_count_test[norm_method][select_method]]))
+            v_select_final_train.append(v_vector_final_train)
+            v_select_final_test.append(v_vector_final_test)
+        v_final_train.append(v_select_final_train)
+        v_final_test.append(v_select_final_test)
+
+    # Score the models
diff --git a/sentiment_analysis_functions.py b/sentiment_analysis_functions.py
index 4e926f4..18e6c5d 100644
--- a/sentiment_analysis_functions.py
+++ b/sentiment_analysis_functions.py
@@ -8,9 +8,11 @@ Created on Sun Oct 27 17:16:54 2019
 
 import nltk
 import re
+import math
 from collections import defaultdict
-from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
-
+from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+from nltk.corpus import wordnet as wn
+from nltk.corpus import sentiwordnet as swn
 
 
 # Normalisation
@@ -127,6 +129,7 @@ def select_open_class(norm_reviews):
 def dummy_function(x):
     return x
 
+
 def corpus_documents(corpus):
     corpus_documents = []
     for document in corpus:
@@ -136,39 +139,45 @@ def corpus_documents(corpus):
         corpus_documents.append(doc_sentences)
     return corpus_documents
 
-def value_count(reviews):
+
+def get_vectorizers(corpus_documents):
+    # Three token-level vectorizers fitted on the training corpus:
+    # raw counts (cvr), binary occurrence (hvr), and tf-idf (tdvr).
     cvr = CountVectorizer(analyzer='word', tokenizer=dummy_function, preprocessor=dummy_function)
-    feat_value_count = cvr.fit_transform(reviews)
-    return feat_value_count
-
-
-def value_occurence(reviews):
     hvr = CountVectorizer(analyzer='word', tokenizer=dummy_function, preprocessor=dummy_function, binary=True)
-    feat_value_occurence = hvr.fit_transform(reviews)
-    return feat_value_occurence
-
-
-def value_tfidf(reviews):
     tdvr = TfidfVectorizer(analyzer='word', tokenizer=dummy_function, preprocessor=dummy_function)
-    feat_value_tfidf = tdvr.fit_transform(reviews)
-    return feat_value_tfidf
-
+
+    cvr.fit(corpus_documents)
+    hvr.fit(corpus_documents)
+    tdvr.fit(corpus_documents)
+    return [cvr, hvr, tdvr]
 
 
 # Other attributes
-def attribute_polarity_count(reviews):
-    return 0
-
-
-def attribute_length(reviews):
-    return 0
+def attribute_polarity_count(norm_reviews):
+    # For each review, counts the words whose first WordNet synset has a
+    # positive SentiWordNet score, those with a negative score, and the
+    # total number of words.
+    polarity = []
+    for review in norm_reviews:
+        polarity_pos = 0
+        polarity_neg = 0
+        word_count = 0
+        for sentence in review:
+            for word in sentence:
+                word_count += 1
+                try:
+                    swn_synset = swn.senti_synset(wn.synsets(word)[0].name())
+                    if swn_synset.pos_score() > 0.0:
+                        polarity_pos += 1
+                    if swn_synset.neg_score() > 0.0:
+                        polarity_neg += 1
+                except Exception:
+                    # word not covered by WordNet/SentiWordNet; skip it
+                    pass
+        polarity.append([polarity_pos, polarity_neg, word_count])
+    return polarity
 
 
 # Training
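+
+
+def train_and_score(x_train, y_train, x_test, y_test):
+    # A possible sketch for the "# Score the models" step left open in
+    # sentiment_analysis.py; the helper name and the use of sklearn
+    # defaults are assumptions. It fits the two classifiers referenced
+    # in the commented-out imports of sentiment_analysis.py and returns
+    # their accuracy on the test set.
+    from sklearn.naive_bayes import MultinomialNB
+    from sklearn.linear_model import LogisticRegression
+    scores = {}
+    for name, model in [('naive_bayes', MultinomialNB()),
+                        ('logistic_regression', LogisticRegression(max_iter=1000))]:
+        model.fit(x_train, y_train)
+        scores[name] = model.score(x_test, y_test)
+    return scores
+
+# Example use from sentiment_analysis.py, for one feature combination:
+#     scores = sfun.train_and_score(v_final_train[0][0][0], train_dataset_response,
+#                                   v_final_test[0][0][0], test_dataset_response)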