Feature creation completed

François Pelletier 2019-10-31 22:35:24 -04:00
parent 834f38b151
commit b9c99e321e
2 changed files with 151 additions and 60 deletions

View file

@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 import json
 import sentiment_analysis_functions as sfun
+from scipy.sparse import csr_matrix, hstack
 # installation
 # import nltk
@@ -9,8 +10,9 @@ import sentiment_analysis_functions as sfun
 # nltk.download('stopwords')
 # nltk.download('averaged_perceptron_tagger')
 # nltk.download('universal_tagset')
+# nltk.download('sentiwordnet')
 # from sklearn.feature_extraction.text import CountVectorizer
 # from sklearn.naive_bayes import MultinomialNB
 # from sklearn.linear_model import LogisticRegression
@@ -37,44 +39,124 @@ if __name__ == '__main__':
     test_negative_reviews = load_reviews(test_neg_reviews_fn)
     print("Nb of training reviews - positive:", len(train_positive_reviews), "negative:", len(train_negative_reviews))
     print("Nb of test reviews - positive:", len(test_positive_reviews), "negative:", len(test_negative_reviews))
-    x0 = sfun.tokenize(train_positive_reviews)
-    x1 = sfun.norm_stemming(x0)
-    x2 = sfun.norm_lemmatize(x0)
-    x11 = sfun.select_freq(x1)
-    x21 = sfun.select_freq(x2)
-    x12 = sfun.select_rem_stopwords(x1)
-    x22 = sfun.select_rem_stopwords(x2)
-    x13 = sfun.select_open_class(x1)
-    x23 = sfun.select_open_class(x2)
-    x11c = sfun.corpus_documents(x11)
-    x21c = sfun.corpus_documents(x21)
-    x12c = sfun.corpus_documents(x12)
-    x22c = sfun.corpus_documents(x22)
-    x13c = sfun.corpus_documents(x13)
-    x23c = sfun.corpus_documents(x23)
+    # Train and test datasets
-    x111 = sfun.value_count(x11c)
-    x211 = sfun.value_count(x21c)
-    x121 = sfun.value_count(x12c)
-    x221 = sfun.value_count(x22c)
-    x131 = sfun.value_count(x13c)
-    x231 = sfun.value_count(x23c)
+    train_dataset = train_positive_reviews+train_negative_reviews
+    train_dataset_response = [1]*len(train_positive_reviews)+[0]*len(train_negative_reviews)
-    x112 = sfun.value_occurence(x11c)
-    x212 = sfun.value_occurence(x21c)
-    x122 = sfun.value_occurence(x12c)
-    x222 = sfun.value_occurence(x22c)
-    x132 = sfun.value_occurence(x13c)
-    x232 = sfun.value_occurence(x23c)
+    test_dataset = test_positive_reviews+test_negative_reviews
+    test_dataset_response = [1]*len(test_positive_reviews)+[0]*len(test_negative_reviews)
-    # Tokenisation
-    x113 = sfun.value_tfidf(x11c)
-    x213 = sfun.value_tfidf(x21c)
-    x123 = sfun.value_tfidf(x12c)
-    x223 = sfun.value_tfidf(x22c)
-    x133 = sfun.value_tfidf(x13c)
-    x233 = sfun.value_tfidf(x23c)
+    # Tokenize train
+    train_tokens = sfun.tokenize(train_dataset)
+    norm_train_tokens = []
+    norm_train_tokens.append(sfun.norm_stemming(train_tokens))
+    norm_train_tokens.append(sfun.norm_lemmatize(train_tokens))
+    # Tokenize test
+    test_tokens = sfun.tokenize(test_dataset)
+    norm_test_tokens = []
+    norm_test_tokens.append(sfun.norm_stemming(test_tokens))
+    norm_test_tokens.append(sfun.norm_lemmatize(test_tokens))
+    # Normalize and select tokens
+    norm_select_train_tokens = []
+    norm_select_test_tokens = []
+    norm_select_train_tokens_split = []
+    norm_select_test_tokens_split = []
+    for norm_method in range(0,2):
+        # train tokens
+        select_train_tokens = []
+        select_train_tokens_split = []
+        nn1 = sfun.select_freq(norm_train_tokens[norm_method])
+        nn2 = sfun.select_rem_stopwords(norm_train_tokens[norm_method])
+        nn3 = sfun.select_open_class(norm_train_tokens[norm_method])
+        select_train_tokens_split.append(nn1)
+        select_train_tokens_split.append(nn2)
+        select_train_tokens_split.append(nn3)
+        select_train_tokens.append(sfun.corpus_documents(nn1))
+        select_train_tokens.append(sfun.corpus_documents(nn2))
+        select_train_tokens.append(sfun.corpus_documents(nn3))
+        norm_select_train_tokens_split.append(select_train_tokens_split)
+        norm_select_train_tokens.append(select_train_tokens)
+        # test tokens
+        select_test_tokens = []
+        select_test_tokens_split = []
+        nn1 = sfun.select_freq(norm_test_tokens[norm_method])
+        nn2 = sfun.select_rem_stopwords(norm_test_tokens[norm_method])
+        nn3 = sfun.select_open_class(norm_test_tokens[norm_method])
+        select_test_tokens_split.append(nn1)
+        select_test_tokens_split.append(nn2)
+        select_test_tokens_split.append(nn3)
+        select_test_tokens.append(sfun.corpus_documents(nn1))
+        select_test_tokens.append(sfun.corpus_documents(nn2))
+        select_test_tokens.append(sfun.corpus_documents(nn3))
+        norm_select_test_tokens_split.append(select_test_tokens_split)
+        norm_select_test_tokens.append(select_test_tokens)
+    # Create the Vectorizer objects (train only)
+    norm_select_vectorizers = []
+    for norm_method in range(0,2):
+        select_vectorizers = []
+        for select_method in range(0,3):
+            vectorizers = sfun.get_vectorizers(norm_select_train_tokens[norm_method][select_method])
+            select_vectorizers.append(vectorizers)
+        norm_select_vectorizers.append(select_vectorizers)
+    # Transform tokens into vectors
+    v_norm_select_vectors_train = []
+    v_norm_select_vectors_test = []
+    for norm_method in range(0,2):
+        v_select_vectors_train = []
+        v_select_vectors_test = []
+        for select_method in range(0,3):
+            v_vectors_train = []
+            v_vectors_test = []
+            for vector_method in range(0,3):
+                v_vector_train = norm_select_vectorizers[norm_method][select_method][vector_method].transform(norm_select_train_tokens[norm_method][select_method])
+                v_vector_test = norm_select_vectorizers[norm_method][select_method][vector_method].transform(norm_select_test_tokens[norm_method][select_method])
+                v_vectors_train.append(v_vector_train)
+                v_vectors_test.append(v_vector_test)
+            v_select_vectors_train.append(v_vectors_train)
+            v_select_vectors_test.append(v_vectors_test)
+        v_norm_select_vectors_train.append(v_select_vectors_train)
+        v_norm_select_vectors_test.append(v_select_vectors_test)
+    # Add the attributes
+    v_norm_select_polarity_count_train = []
+    v_norm_select_polarity_count_test = []
+    for norm_method in range(0,2):
+        v_select_polarity_count_train = []
+        v_select_polarity_count_test = []
+        for select_method in range(0,3):
+            v_polarity_count_train = sfun.attribute_polarity_count(norm_select_train_tokens_split[norm_method][select_method])
+            v_polarity_count_test = sfun.attribute_polarity_count(norm_select_test_tokens_split[norm_method][select_method])
+            v_select_polarity_count_train.append(csr_matrix(v_polarity_count_train))
+            v_select_polarity_count_test.append(csr_matrix(v_polarity_count_test))
+        v_norm_select_polarity_count_train.append(v_select_polarity_count_train)
+        v_norm_select_polarity_count_test.append(v_select_polarity_count_test)
+    # Build the final matrices
+    v_final_train = []
+    v_final_test = []
+    for norm_method in range(0,2):
+        v_select_final_train = []
+        v_select_final_test = []
+        for select_method in range(0,3):
+            v_vector_final_train = []
+            v_vector_final_test = []
+            for vector_method in range(0,3):
+                v_vector_final_train.append(hstack([v_norm_select_vectors_train[norm_method][select_method][vector_method],
+                                                    v_norm_select_polarity_count_train[norm_method][select_method]]))
+                v_vector_final_test.append(hstack([v_norm_select_vectors_test[norm_method][select_method][vector_method],
+                                                   v_norm_select_polarity_count_test[norm_method][select_method]]))
+            v_select_final_train.append(v_vector_final_train)
+            v_select_final_test.append(v_vector_final_test)
+        v_final_train.append(v_select_final_train)
+        v_final_test.append(v_select_final_test)
+    # Model scoring
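
The scoring step is still a stub in this commit. A minimal sketch of what it could look like, assuming the models hinted at by the commented-out imports at the top of the file (MultinomialNB, LogisticRegression); the loop and metric choice here are illustrative, not part of the commit:

    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score

    # One fit per (normalization, selection, vectorization) combination:
    # norm_method 0 = stemming, 1 = lemmatization;
    # select_method 0 = frequency, 1 = stopword removal, 2 = open class;
    # vector_method 0 = counts, 1 = binary occurrence, 2 = tf-idf.
    for norm_method in range(0, 2):
        for select_method in range(0, 3):
            for vector_method in range(0, 3):
                X_train = v_final_train[norm_method][select_method][vector_method]
                X_test = v_final_test[norm_method][select_method][vector_method]
                for model in (MultinomialNB(), LogisticRegression(max_iter=1000)):
                    model.fit(X_train, train_dataset_response)
                    accuracy = accuracy_score(test_dataset_response,
                                              model.predict(X_test))
                    print(norm_method, select_method, vector_method,
                          type(model).__name__, accuracy)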

View file

@@ -8,9 +8,11 @@ Created on Sun Oct 27 17:16:54 2019
 import nltk
 import re
 import math
 from collections import defaultdict
-from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
+from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+from nltk.corpus import wordnet as wn
+from nltk.corpus import sentiwordnet as swn
 # Normalisation
@@ -127,6 +129,7 @@ def select_open_class(norm_reviews):
 def dummy_function(x):
     return x
 def corpus_documents(corpus):
     corpus_documents = []
     for document in corpus:
@@ -136,39 +139,45 @@ def corpus_documents(corpus):
         corpus_documents.append(doc_sentences)
     return corpus_documents
-def value_count(reviews):
+def get_vectorizers(corpus_documents):
     cvr = CountVectorizer(analyzer='word',
                           tokenizer=dummy_function,
                           preprocessor=dummy_function)
-    feat_value_count = cvr.fit_transform(reviews)
-    return feat_value_count
-def value_occurence(reviews):
     hvr = CountVectorizer(analyzer='word',
                           tokenizer=dummy_function,
                           preprocessor=dummy_function,
                           binary=True)
-    feat_value_occurence = hvr.fit_transform(reviews)
-    return feat_value_occurence
-def value_tfidf(reviews):
     tdvr = TfidfVectorizer(analyzer='word',
                            tokenizer=dummy_function,
                            preprocessor=dummy_function)
-    feat_value_tfidf = tdvr.fit_transform(reviews)
-    return feat_value_tfidf
+    cvr.fit(corpus_documents)
+    hvr.fit(corpus_documents)
+    tdvr.fit(corpus_documents)
+    return [cvr,hvr,tdvr]
 # Other attributes
-def attribute_polarity_count(reviews):
-    return 0
-def attribute_length(reviews):
-    return 0
+def attribute_polarity_count(norm_reviews):
+    polarity = []
+    for review in norm_reviews:
+        polarity_pos = 0
+        polarity_neg = 0
+        word_count = 0
+        for sentence in review:
+            for word in sentence:
+                word_count += 1
+                try:
+                    # Look up the word's first WordNet synset in SentiWordNet
+                    swn_synset = swn.senti_synset(wn.synsets(word)[0].name())
+                    if swn_synset.pos_score() > 0.0:
+                        polarity_pos += 1
+                    if swn_synset.neg_score() > 0.0:
+                        polarity_neg += 1
+                except Exception:
+                    # Word has no WordNet/SentiWordNet entry: skip it
+                    pass
+        polarity.append([polarity_pos, polarity_neg, word_count])
+    return polarity
 # Training
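
For reference, the three fitted vectorizers returned by get_vectorizers line up with the vector_method indices 0-2 used in the driver script: 0 is raw counts, 1 is binary occurrence, 2 is tf-idf. A minimal usage sketch (the docs and norm_reviews variables are hypothetical stand-ins for the outputs of corpus_documents and the selection functions):

    vectorizers = get_vectorizers(docs)   # fit on training documents only
    count_features = vectorizers[0].transform(docs)
    binary_features = vectorizers[1].transform(docs)
    tfidf_features = vectorizers[2].transform(docs)
    polarity_features = attribute_polarity_count(norm_reviews)  # [pos, neg, word_count] per review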