Feature creation completed
This commit is contained in:
parent 834f38b151
commit b9c99e321e
2 changed files with 151 additions and 60 deletions
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import json
import sentiment_analysis_functions as sfun
from scipy.sparse import csr_matrix, hstack

# installation
# import nltk
@@ -9,8 +10,9 @@ import sentiment_analysis_functions as sfun
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('universal_tagset')
# nltk.download('sentiwordnet')


# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression

@@ -37,44 +39,124 @@ if __name__ == '__main__':
    test_negative_reviews = load_reviews(test_neg_reviews_fn)
    print("Nb of training reviews - positive:", len(train_positive_reviews), "negative:", len(train_negative_reviews))
    print("Nb of test reviews - positive:", len(test_positive_reviews), "negative:", len(test_negative_reviews))

    x0 = sfun.tokenize(train_positive_reviews)
    x1 = sfun.norm_stemming(x0)
    x2 = sfun.norm_lemmatize(x0)

    x11 = sfun.select_freq(x1)
    x21 = sfun.select_freq(x2)

    x12 = sfun.select_rem_stopwords(x1)
    x22 = sfun.select_rem_stopwords(x2)

    x13 = sfun.select_open_class(x1)
    x23 = sfun.select_open_class(x2)

    x11c = sfun.corpus_documents(x11)
    x21c = sfun.corpus_documents(x21)
    x12c = sfun.corpus_documents(x12)
    x22c = sfun.corpus_documents(x22)
    x13c = sfun.corpus_documents(x13)
    x23c = sfun.corpus_documents(x23)
    # Train and test datasets

    x111 = sfun.value_count(x11c)
    x211 = sfun.value_count(x21c)
    x121 = sfun.value_count(x12c)
    x221 = sfun.value_count(x22c)
    x131 = sfun.value_count(x13c)
    x231 = sfun.value_count(x23c)
    train_dataset = train_positive_reviews+train_negative_reviews
    train_dataset_response = [1]*len(train_positive_reviews)+[0]*len(train_negative_reviews)

    x112 = sfun.value_occurence(x11c)
    x212 = sfun.value_occurence(x21c)
    x122 = sfun.value_occurence(x12c)
    x222 = sfun.value_occurence(x22c)
    x132 = sfun.value_occurence(x13c)
    x232 = sfun.value_occurence(x23c)
    test_dataset = test_positive_reviews+test_negative_reviews
    test_dataset_response = [1]*len(test_positive_reviews)+[0]*len(test_negative_reviews)

    # Tokenisation

    x113 = sfun.value_tfidf(x11c)
    x213 = sfun.value_tfidf(x21c)
    x123 = sfun.value_tfidf(x12c)
    x223 = sfun.value_tfidf(x22c)
    x133 = sfun.value_tfidf(x13c)
    x233 = sfun.value_tfidf(x23c)
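    # Note: value_count / value_occurence / value_tfidf build raw-count, binary-occurrence
    # and tf-idf document-term matrices respectively (see sentiment_analysis_functions).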
    # Tokenize train
    train_tokens = sfun.tokenize(train_dataset)
    norm_train_tokens = []
    norm_train_tokens.append(sfun.norm_stemming(train_tokens))
    norm_train_tokens.append(sfun.norm_lemmatize(train_tokens))

    # Tokenize test
    test_tokens = sfun.tokenize(test_dataset)
    norm_test_tokens = []
    norm_test_tokens.append(sfun.norm_stemming(test_tokens))
    norm_test_tokens.append(sfun.norm_lemmatize(test_tokens))
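    # norm_train_tokens / norm_test_tokens: index 0 holds the stemmed tokens, index 1 the lemmatized tokens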


    # Normalize and select tokens
    norm_select_train_tokens = []
    norm_select_test_tokens = []
    norm_select_train_tokens_split = []
    norm_select_test_tokens_split = []
    for norm_method in range(0,2):
        # train tokens
        select_train_tokens = []
        select_train_tokens_split = []
        nn1 = sfun.select_freq(norm_train_tokens[norm_method])
        nn2 = sfun.select_rem_stopwords(norm_train_tokens[norm_method])
        nn3 = sfun.select_open_class(norm_train_tokens[norm_method])
        select_train_tokens_split.append(nn1)
        select_train_tokens_split.append(nn2)
        select_train_tokens_split.append(nn3)
        select_train_tokens.append(sfun.corpus_documents(nn1))
        select_train_tokens.append(sfun.corpus_documents(nn2))
        select_train_tokens.append(sfun.corpus_documents(nn3))
        norm_select_train_tokens_split.append(select_train_tokens_split)
        norm_select_train_tokens.append(select_train_tokens)
        # test tokens
        select_test_tokens = []
        select_test_tokens_split = []
        nn1 = sfun.select_freq(norm_test_tokens[norm_method])
        nn2 = sfun.select_rem_stopwords(norm_test_tokens[norm_method])
        nn3 = sfun.select_open_class(norm_test_tokens[norm_method])
        select_test_tokens_split.append(nn1)
        select_test_tokens_split.append(nn2)
        select_test_tokens_split.append(nn3)
        select_test_tokens.append(sfun.corpus_documents(nn1))
        select_test_tokens.append(sfun.corpus_documents(nn2))
        select_test_tokens.append(sfun.corpus_documents(nn3))
        norm_select_test_tokens_split.append(select_test_tokens_split)
        norm_select_test_tokens.append(select_test_tokens)
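    # norm_select_*_tokens[norm][select] hold the selected documents (freq / stop-word / open-class
    # selection); the *_split variants keep the per-review, per-sentence token structure used later
    # by the attribute features.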

    # Create the Vectorizer objects (train only)
    norm_select_vectorizers = []
    for norm_method in range(0,2):
        select_vectorizers = []
        for select_method in range(0,3):
            vectorizers = sfun.get_vectorizers(norm_select_train_tokens[norm_method][select_method])
            select_vectorizers.append(vectorizers)
        norm_select_vectorizers.append(select_vectorizers)
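    # norm_select_vectorizers[norm][select] = [count, binary-occurrence, tf-idf] vectorizers
    # returned by get_vectorizers, all fitted on the training documents only.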

    # Transform the tokens into vectors
    v_norm_select_vectors_train = []
    v_norm_select_vectors_test = []
    for norm_method in range(0,2):
        v_select_vectors_train = []
        v_select_vectors_test = []
        for select_method in range(0,3):
            v_vectors_train = []
            v_vectors_test = []
            for vector_method in range(0,3):
                v_vector_train = norm_select_vectorizers[norm_method][select_method][vector_method].transform(norm_select_train_tokens[norm_method][select_method])
                v_vector_test = norm_select_vectorizers[norm_method][select_method][vector_method].transform(norm_select_test_tokens[norm_method][select_method])
                v_vectors_train.append(v_vector_train)
                v_vectors_test.append(v_vector_test)
            v_select_vectors_train.append(v_vectors_train)
            v_select_vectors_test.append(v_vectors_test)
        v_norm_select_vectors_train.append(v_select_vectors_train)
        v_norm_select_vectors_test.append(v_select_vectors_test)
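    # v_norm_select_vectors_*[norm][select][vector] are sparse document-term matrices;
    # the test side is transformed with the train-fitted vectorizers, so the vocabularies match.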

    # Add the attributes
    v_norm_select_polarity_count_train = []
    v_norm_select_polarity_count_test = []
    for norm_method in range(0,2):
        v_select_polarity_count_train = []
        v_select_polarity_count_test = []
        for select_method in range(0,3):
            v_polarity_count_train = sfun.attribute_polarity_count(norm_select_train_tokens_split[norm_method][select_method])
            v_polarity_count_test = sfun.attribute_polarity_count(norm_select_test_tokens_split[norm_method][select_method])
            v_select_polarity_count_train.append(csr_matrix(v_polarity_count_train))
            v_select_polarity_count_test.append(csr_matrix(v_polarity_count_test))
        v_norm_select_polarity_count_train.append(v_select_polarity_count_train)
        v_norm_select_polarity_count_test.append(v_select_polarity_count_test)
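    # attribute_polarity_count yields one [positive, negative, word_count] row per review
    # (SentiWordNet polarity counts), stored here as CSR matrices so they can be hstack-ed below.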

    # Build the final matrices
    v_final_train = []
    v_final_test = []
    for norm_method in range(0,2):
        v_select_final_train = []
        v_select_final_test = []
        for select_method in range(0,3):
            v_vector_final_train = []
            v_vector_final_test = []
            for vector_method in range(0,3):
                v_vector_final_train.append(hstack([v_norm_select_vectors_train[norm_method][select_method][vector_method],
                                                    v_norm_select_polarity_count_train[norm_method][select_method]]))
                v_vector_final_test.append(hstack([v_norm_select_vectors_test[norm_method][select_method][vector_method],
                                                   v_norm_select_polarity_count_test[norm_method][select_method]]))
            v_select_final_train.append(v_vector_final_train)
            v_select_final_test.append(v_vector_final_test)
        v_final_train.append(v_select_final_train)
        v_final_test.append(v_select_final_test)
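    # v_final_train / v_final_test: 2 normalizations x 3 selections x 3 vectorizations = 18 feature
    # matrices per split, each a document-term matrix with the polarity counts appended.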

    # Model scoring
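    # A minimal scoring sketch (illustrative assumption, not part of this commit), using the
    # LogisticRegression hinted at by the commented-out import at the top of the script:
    #   from sklearn.linear_model import LogisticRegression
    #   clf = LogisticRegression(max_iter=1000)
    #   clf.fit(v_final_train[0][0][0], train_dataset_response)
    #   print(clf.score(v_final_test[0][0][0], test_dataset_response))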
@@ -8,9 +8,11 @@ Created on Sun Oct 27 17:16:54 2019

import nltk
import re
import math
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn

# Normalisation
@@ -127,6 +129,7 @@ def select_open_class(norm_reviews):
def dummy_function(x):
    return x


def corpus_documents(corpus):
    corpus_documents = []
    for document in corpus:
@@ -136,39 +139,45 @@ def corpus_documents(corpus):
        corpus_documents.append(doc_sentences)
    return corpus_documents

def value_count(reviews):

def get_vectorizers(corpus_documents):
    cvr = CountVectorizer(analyzer='word',
                          tokenizer=dummy_function,
                          preprocessor=dummy_function)
    feat_value_count = cvr.fit_transform(reviews)
    return feat_value_count


def value_occurence(reviews):
    hvr = CountVectorizer(analyzer='word',
                          tokenizer=dummy_function,
                          preprocessor=dummy_function,
                          binary=True)
    feat_value_occurence = hvr.fit_transform(reviews)
    return feat_value_occurence


def value_tfidf(reviews):
    tdvr = TfidfVectorizer(analyzer='word',
                           tokenizer=dummy_function,
                           preprocessor=dummy_function)
    feat_value_tfidf = tdvr.fit_transform(reviews)
    return feat_value_tfidf


    cvr.fit(corpus_documents)
    hvr.fit(corpus_documents)
    tdvr.fit(corpus_documents)
    return [cvr,hvr,tdvr]
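# Usage note: get_vectorizers fits all three vectorizers on the same training documents and returns
# them as [count, binary occurrence, tf-idf]; transform() is then applied in the main script.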

# Other attributes

def attribute_polarity_count(reviews):
    return 0


def attribute_length(reviews):
    return 0
def attribute_polarity_count(norm_reviews):
    polarity = []
    for review in norm_reviews:
        polarity_pos=0
        polarity_neg=0
        word_count=0
        for sentence in review:
            for word in sentence:
                word_count += 1
                try:
                    swn_synset = swn.senti_synset(wn.synsets(word)[0].name())
                    if swn_synset.pos_score() > 0.0:
                        polarity_pos += 1
                    if swn_synset.neg_score() > 0.0:
                        polarity_neg += 1
                except:
                    pass
        polarity.append([polarity_pos,polarity_neg,word_count])
    return polarity
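# For each review this returns [positive_count, negative_count, word_count]: a word counts as
# positive or negative when the SentiWordNet scores of its first WordNet synset are above 0;
# words without a synset are skipped by the bare except.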


# Training