fin code premier problème

This commit is contained in:
François Pelletier 2019-11-02 01:17:08 -04:00
parent 196b0d9649
commit e84f3820b0

View file

@ -1,7 +1,9 @@
# -*- coding: utf-8 -*-
import json
import sentiment_analysis_functions as sfun
from sklearn.metrics import accuracy_score, recall_score, precision_score
from scipy.sparse import csr_matrix, hstack
import pandas as pd
# installation
# import nltk
@ -44,6 +46,25 @@ if __name__ == '__main__':
test_dataset = test_positive_reviews+test_negative_reviews
test_dataset_response = [1]*len(test_positive_reviews)+[0]*len(test_negative_reviews)
# Méthodes
# Display labels for each pipeline option, keyed by the integer index that the
# processing loops use for that option.
norm_names = dict(enumerate(("Stemming", "Lemmatisation")))
select_names = dict(enumerate(("Fréquence", "Mot-outils", "Classe ouverte")))
vectorizer_names = dict(enumerate(("Compteur", "Occurence", "TF-IDF")))
# Tokenisation
@ -59,19 +80,18 @@ if __name__ == '__main__':
norm_test_tokens.append(sfun.norm_stemming(test_tokens))
norm_test_tokens.append(sfun.norm_lemmatize(test_tokens))
# Normalize and select tokens
norm_select_train_tokens = []
norm_select_test_tokens = []
norm_select_train_tokens_split = []
norm_select_test_tokens_split = []
for norm_method in range(0,2):
for norm_m in range(0,2):
# train tokens
select_train_tokens = []
select_train_tokens_split = []
nn1 = sfun.select_freq(norm_train_tokens[norm_method])
nn2 = sfun.select_rem_stopwords(norm_train_tokens[norm_method])
nn3 = sfun.select_open_class(norm_train_tokens[norm_method])
nn1 = sfun.select_freq(norm_train_tokens[norm_m])
nn2 = sfun.select_rem_stopwords(norm_train_tokens[norm_m])
nn3 = sfun.select_open_class(norm_train_tokens[norm_m])
select_train_tokens_split.append(nn1)
select_train_tokens_split.append(nn2)
select_train_tokens_split.append(nn3)
@ -83,9 +103,9 @@ if __name__ == '__main__':
# test tokens
select_test_tokens = []
select_test_tokens_split = []
nn1 = sfun.select_freq(norm_test_tokens[norm_method])
nn2 = sfun.select_rem_stopwords(norm_test_tokens[norm_method])
nn3 = sfun.select_open_class(norm_test_tokens[norm_method])
nn1 = sfun.select_freq(norm_test_tokens[norm_m])
nn2 = sfun.select_rem_stopwords(norm_test_tokens[norm_m])
nn3 = sfun.select_open_class(norm_test_tokens[norm_m])
select_test_tokens_split.append(nn1)
select_test_tokens_split.append(nn2)
select_test_tokens_split.append(nn3)
@ -97,25 +117,25 @@ if __name__ == '__main__':
# Création des objets Vectorizer (Train seulement)
norm_select_vectorizers = []
for norm_method in range(0,2):
for norm_m in range(0,2):
select_vectorizers = []
for select_method in range(0,3):
vectorizers = sfun.get_vectorizers(norm_select_train_tokens[norm_method][select_method])
for sel_m in range(0,3):
vectorizers = sfun.get_vectorizers(norm_select_train_tokens[norm_m][sel_m])
select_vectorizers.append(vectorizers)
norm_select_vectorizers.append(select_vectorizers)
# Transformation des jetons en vecteurs
v_norm_select_vectors_train = []
v_norm_select_vectors_test = []
for norm_method in range(0,2):
for norm_m in range(0,2):
v_select_vectors_train = []
v_select_vectors_test = []
for select_method in range(0,3):
for sel_m in range(0,3):
v_vectors_train = []
v_vectors_test = []
for vector_method in range(0,3):
v_vector_train = norm_select_vectorizers[norm_method][select_method][vector_method].transform(norm_select_train_tokens[norm_method][select_method])
v_vector_test = norm_select_vectorizers[norm_method][select_method][vector_method].transform(norm_select_test_tokens[norm_method][select_method])
for vect_m in range(0,3):
v_vector_train = norm_select_vectorizers[norm_m][sel_m][vect_m].transform(norm_select_train_tokens[norm_m][sel_m])
v_vector_test = norm_select_vectorizers[norm_m][sel_m][vect_m].transform(norm_select_test_tokens[norm_m][sel_m])
v_vectors_train.append(v_vector_train)
v_vectors_test.append(v_vector_test)
v_select_vectors_train.append(v_vectors_train)
@ -126,12 +146,12 @@ if __name__ == '__main__':
# Ajout des attributs
v_norm_select_polarity_count_train = []
v_norm_select_polarity_count_test = []
for norm_method in range(0,2):
for norm_m in range(0,2):
v_select_polarity_count_train = []
v_select_polarity_count_test = []
for select_method in range(0,3):
v_polarity_count_train = sfun.attribute_polarity_count(norm_select_train_tokens_split[norm_method][select_method])
v_polarity_count_test = sfun.attribute_polarity_count(norm_select_test_tokens_split[norm_method][select_method])
for sel_m in range(0,3):
v_polarity_count_train = sfun.attribute_polarity_count(norm_select_train_tokens_split[norm_m][sel_m])
v_polarity_count_test = sfun.attribute_polarity_count(norm_select_test_tokens_split[norm_m][sel_m])
v_select_polarity_count_train.append(csr_matrix(v_polarity_count_train))
v_select_polarity_count_test.append(csr_matrix(v_polarity_count_test))
v_norm_select_polarity_count_train.append(v_select_polarity_count_train)
@ -140,17 +160,17 @@ if __name__ == '__main__':
# Création des matrices finales
v_final_train = []
v_final_test = []
for norm_method in range(0,2):
for norm_m in range(0,2):
v_select_final_train = []
v_select_final_test = []
for select_method in range(0,3):
for sel_m in range(0,3):
v_vector_final_train = []
v_vector_final_test = []
for vector_method in range(0,3):
v_vector_final_train.append(hstack([v_norm_select_vectors_train[norm_method][select_method][vector_method],
v_norm_select_polarity_count_train[norm_method][select_method]]))
v_vector_final_test.append(hstack([v_norm_select_vectors_test[norm_method][select_method][vector_method],
v_norm_select_polarity_count_test[norm_method][select_method]]))
for vect_m in range(0,3):
v_vector_final_train.append(hstack([v_norm_select_vectors_train[norm_m][sel_m][vect_m],
v_norm_select_polarity_count_train[norm_m][sel_m]]))
v_vector_final_test.append(hstack([v_norm_select_vectors_test[norm_m][sel_m][vect_m],
v_norm_select_polarity_count_test[norm_m][sel_m]]))
v_select_final_train.append(v_vector_final_train)
v_select_final_test.append(v_vector_final_test)
v_final_train.append(v_select_final_train)
@ -162,21 +182,23 @@ if __name__ == '__main__':
scores_nb = []
modeles_reg = []
scores_reg = []
for norm_method in range(0,2):
for norm_m in range(0,2):
modeles_select_vector_nb = []
scores_select_vector_nb = []
modeles_select_vector_reg = []
scores_select_vector_reg = []
for select_method in range(0,3):
for sel_m in range(0,3):
modeles_vector_nb = []
scores_vector_nb = []
modeles_vector_reg = []
scores_vector_reg = []
for vector_method in range(0,3):
modele_nb = sfun.train_naive_model(v_final_train[norm_method][select_method][vector_method],train_dataset_response)
score_nb = modele_nb.predict(v_final_test[norm_method][select_method][vector_method])
modele_reg = sfun.train_regression_model(v_final_train[norm_method][select_method][vector_method],train_dataset_response)
score_reg = modele_reg.predict(v_final_test[norm_method][select_method][vector_method])
for vect_m in range(0,3):
modele_nb = sfun.train_naive_model(v_final_train[norm_m][sel_m][vect_m],train_dataset_response)
score_nb = modele_nb.predict(v_final_test[norm_m][sel_m][vect_m])
modele_reg = sfun.train_regression_model(v_final_train[norm_m][sel_m][vect_m],train_dataset_response)
score_reg = modele_reg.predict(v_final_test[norm_m][sel_m][vect_m])
modeles_vector_nb.append(modele_nb)
scores_vector_nb.append(score_nb)
modeles_vector_reg.append(modele_reg)
scores_vector_reg.append(score_reg)
modeles_select_vector_nb.append(modeles_vector_nb)
@ -186,4 +208,46 @@ if __name__ == '__main__':
modeles_nb.append(modeles_select_vector_nb)
scores_nb.append(scores_select_vector_nb)
modeles_reg.append(modeles_select_vector_reg)
scores_reg.append(scores_select_vector_reg)
scores_reg.append(scores_select_vector_reg)
# Performance des modèles
def class_metrics(test_y, pred_y):
    """Return [precision, recall, accuracy] for predictions against true labels.

    test_y holds the reference labels and pred_y the predicted labels; both
    are passed unchanged to each of the three scoring functions imported from
    sklearn.metrics.
    """
    scorers = (precision_score, recall_score, accuracy_score)
    return [scorer(test_y, pred_y) for scorer in scorers]
# Compute the metric triple for every (normalisation, selection,
# vectorisation) combination. Both results keep the same
# [norm_m][sel_m][vect_m] nesting as scores_nb / scores_reg.
cm_nb = [
    [
        [
            class_metrics(test_dataset_response, scores_nb[norm_m][sel_m][vect_m])
            for vect_m in range(0, 3)
        ]
        for sel_m in range(0, 3)
    ]
    for norm_m in range(0, 2)
]
cm_reg = [
    [
        [
            class_metrics(test_dataset_response, scores_reg[norm_m][sel_m][vect_m])
            for vect_m in range(0, 3)
        ]
        for sel_m in range(0, 3)
    ]
    for norm_m in range(0, 2)
]
# Flatten the nested metrics into one row per combination: the three method
# labels, then the metrics from cm_nb, then the metrics from cm_reg.
table_metriques = [
    [
        norm_names.get(norm_m),
        select_names.get(sel_m),
        vectorizer_names.get(vect_m),
    ]
    + cm_nb[norm_m][sel_m][vect_m]
    + cm_reg[norm_m][sel_m][vect_m]
    for norm_m in range(0, 2)
    for sel_m in range(0, 3)
    for vect_m in range(0, 3)
]
# Column order must match the row layout built above.
table_metriques_df = pd.DataFrame(
    table_metriques,
    columns=[
        'Normalisation', 'Sélection', 'Vectorisation',
        'Précision NB', 'Rappel NB', 'Exactitude NB',
        'Précision RL', 'Rappel RL', 'Exactitude RL',
    ],
)