fin code premier problème
This commit is contained in:
parent
196b0d9649
commit
e84f3820b0
1 changed files with 99 additions and 35 deletions
|
@ -1,7 +1,9 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
import sentiment_analysis_functions as sfun
|
||||
from sklearn.metrics import accuracy_score, recall_score, precision_score
|
||||
from scipy.sparse import csr_matrix, hstack
|
||||
import pandas as pd
|
||||
|
||||
# installation
|
||||
# import nltk
|
||||
|
@ -44,6 +46,25 @@ if __name__ == '__main__':
|
|||
|
||||
test_dataset = test_positive_reviews+test_negative_reviews
|
||||
test_dataset_response = [1]*len(test_positive_reviews)+[0]*len(test_negative_reviews)
|
||||
|
||||
# Méthodes
|
||||
|
||||
norm_names = {
|
||||
0:"Stemming",
|
||||
1:"Lemmatisation"
|
||||
}
|
||||
|
||||
select_names = {
|
||||
0:"Fréquence",
|
||||
1:"Mot-outils",
|
||||
2:"Classe ouverte"
|
||||
}
|
||||
|
||||
# Display labels for the vectorisation methods, keyed by the integer index
# that the vect_m loops iterate over (range(0,3)); looked up with .get()
# when the metrics table rows are built.
vectorizer_names = {
    0: "Compteur",
    1: "Occurrence",  # spelling corrected (was "Occurence")
    2: "TF-IDF",
}
|
||||
|
||||
# Tokenisation
|
||||
|
||||
|
@ -59,19 +80,18 @@ if __name__ == '__main__':
|
|||
norm_test_tokens.append(sfun.norm_stemming(test_tokens))
|
||||
norm_test_tokens.append(sfun.norm_lemmatize(test_tokens))
|
||||
|
||||
|
||||
# Normalize and select tokens
|
||||
norm_select_train_tokens = []
|
||||
norm_select_test_tokens = []
|
||||
norm_select_train_tokens_split = []
|
||||
norm_select_test_tokens_split = []
|
||||
for norm_method in range(0,2):
|
||||
for norm_m in range(0,2):
|
||||
# train tokens
|
||||
select_train_tokens = []
|
||||
select_train_tokens_split = []
|
||||
nn1 = sfun.select_freq(norm_train_tokens[norm_method])
|
||||
nn2 = sfun.select_rem_stopwords(norm_train_tokens[norm_method])
|
||||
nn3 = sfun.select_open_class(norm_train_tokens[norm_method])
|
||||
nn1 = sfun.select_freq(norm_train_tokens[norm_m])
|
||||
nn2 = sfun.select_rem_stopwords(norm_train_tokens[norm_m])
|
||||
nn3 = sfun.select_open_class(norm_train_tokens[norm_m])
|
||||
select_train_tokens_split.append(nn1)
|
||||
select_train_tokens_split.append(nn2)
|
||||
select_train_tokens_split.append(nn3)
|
||||
|
@ -83,9 +103,9 @@ if __name__ == '__main__':
|
|||
# test tokens
|
||||
select_test_tokens = []
|
||||
select_test_tokens_split = []
|
||||
nn1 = sfun.select_freq(norm_test_tokens[norm_method])
|
||||
nn2 = sfun.select_rem_stopwords(norm_test_tokens[norm_method])
|
||||
nn3 = sfun.select_open_class(norm_test_tokens[norm_method])
|
||||
nn1 = sfun.select_freq(norm_test_tokens[norm_m])
|
||||
nn2 = sfun.select_rem_stopwords(norm_test_tokens[norm_m])
|
||||
nn3 = sfun.select_open_class(norm_test_tokens[norm_m])
|
||||
select_test_tokens_split.append(nn1)
|
||||
select_test_tokens_split.append(nn2)
|
||||
select_test_tokens_split.append(nn3)
|
||||
|
@ -97,25 +117,25 @@ if __name__ == '__main__':
|
|||
|
||||
# Création des objets Vectorizer (Train seulement)
|
||||
norm_select_vectorizers = []
|
||||
for norm_method in range(0,2):
|
||||
for norm_m in range(0,2):
|
||||
select_vectorizers = []
|
||||
for select_method in range(0,3):
|
||||
vectorizers = sfun.get_vectorizers(norm_select_train_tokens[norm_method][select_method])
|
||||
for sel_m in range(0,3):
|
||||
vectorizers = sfun.get_vectorizers(norm_select_train_tokens[norm_m][sel_m])
|
||||
select_vectorizers.append(vectorizers)
|
||||
norm_select_vectorizers.append(select_vectorizers)
|
||||
|
||||
# Transformation des jetons en vecteurs
|
||||
v_norm_select_vectors_train = []
|
||||
v_norm_select_vectors_test = []
|
||||
for norm_method in range(0,2):
|
||||
for norm_m in range(0,2):
|
||||
v_select_vectors_train = []
|
||||
v_select_vectors_test = []
|
||||
for select_method in range(0,3):
|
||||
for sel_m in range(0,3):
|
||||
v_vectors_train = []
|
||||
v_vectors_test = []
|
||||
for vector_method in range(0,3):
|
||||
v_vector_train = norm_select_vectorizers[norm_method][select_method][vector_method].transform(norm_select_train_tokens[norm_method][select_method])
|
||||
v_vector_test = norm_select_vectorizers[norm_method][select_method][vector_method].transform(norm_select_test_tokens[norm_method][select_method])
|
||||
for vect_m in range(0,3):
|
||||
v_vector_train = norm_select_vectorizers[norm_m][sel_m][vect_m].transform(norm_select_train_tokens[norm_m][sel_m])
|
||||
v_vector_test = norm_select_vectorizers[norm_m][sel_m][vect_m].transform(norm_select_test_tokens[norm_m][sel_m])
|
||||
v_vectors_train.append(v_vector_train)
|
||||
v_vectors_test.append(v_vector_test)
|
||||
v_select_vectors_train.append(v_vectors_train)
|
||||
|
@ -126,12 +146,12 @@ if __name__ == '__main__':
|
|||
# Ajout des attributs
|
||||
v_norm_select_polarity_count_train = []
|
||||
v_norm_select_polarity_count_test = []
|
||||
for norm_method in range(0,2):
|
||||
for norm_m in range(0,2):
|
||||
v_select_polarity_count_train = []
|
||||
v_select_polarity_count_test = []
|
||||
for select_method in range(0,3):
|
||||
v_polarity_count_train = sfun.attribute_polarity_count(norm_select_train_tokens_split[norm_method][select_method])
|
||||
v_polarity_count_test = sfun.attribute_polarity_count(norm_select_test_tokens_split[norm_method][select_method])
|
||||
for sel_m in range(0,3):
|
||||
v_polarity_count_train = sfun.attribute_polarity_count(norm_select_train_tokens_split[norm_m][sel_m])
|
||||
v_polarity_count_test = sfun.attribute_polarity_count(norm_select_test_tokens_split[norm_m][sel_m])
|
||||
v_select_polarity_count_train.append(csr_matrix(v_polarity_count_train))
|
||||
v_select_polarity_count_test.append(csr_matrix(v_polarity_count_test))
|
||||
v_norm_select_polarity_count_train.append(v_select_polarity_count_train)
|
||||
|
@ -140,17 +160,17 @@ if __name__ == '__main__':
|
|||
# Création des matrices finales
|
||||
v_final_train = []
|
||||
v_final_test = []
|
||||
for norm_method in range(0,2):
|
||||
for norm_m in range(0,2):
|
||||
v_select_final_train = []
|
||||
v_select_final_test = []
|
||||
for select_method in range(0,3):
|
||||
for sel_m in range(0,3):
|
||||
v_vector_final_train = []
|
||||
v_vector_final_test = []
|
||||
for vector_method in range(0,3):
|
||||
v_vector_final_train.append(hstack([v_norm_select_vectors_train[norm_method][select_method][vector_method],
|
||||
v_norm_select_polarity_count_train[norm_method][select_method]]))
|
||||
v_vector_final_test.append(hstack([v_norm_select_vectors_test[norm_method][select_method][vector_method],
|
||||
v_norm_select_polarity_count_test[norm_method][select_method]]))
|
||||
for vect_m in range(0,3):
|
||||
v_vector_final_train.append(hstack([v_norm_select_vectors_train[norm_m][sel_m][vect_m],
|
||||
v_norm_select_polarity_count_train[norm_m][sel_m]]))
|
||||
v_vector_final_test.append(hstack([v_norm_select_vectors_test[norm_m][sel_m][vect_m],
|
||||
v_norm_select_polarity_count_test[norm_m][sel_m]]))
|
||||
v_select_final_train.append(v_vector_final_train)
|
||||
v_select_final_test.append(v_vector_final_test)
|
||||
v_final_train.append(v_select_final_train)
|
||||
|
@ -162,21 +182,23 @@ if __name__ == '__main__':
|
|||
scores_nb = []
|
||||
modeles_reg = []
|
||||
scores_reg = []
|
||||
for norm_method in range(0,2):
|
||||
for norm_m in range(0,2):
|
||||
modeles_select_vector_nb = []
|
||||
scores_select_vector_nb = []
|
||||
modeles_select_vector_reg = []
|
||||
scores_select_vector_reg = []
|
||||
for select_method in range(0,3):
|
||||
for sel_m in range(0,3):
|
||||
modeles_vector_nb = []
|
||||
scores_vector_nb = []
|
||||
modeles_vector_reg = []
|
||||
scores_vector_reg = []
|
||||
for vector_method in range(0,3):
|
||||
modele_nb = sfun.train_naive_model(v_final_train[norm_method][select_method][vector_method],train_dataset_response)
|
||||
score_nb = modele_nb.predict(v_final_test[norm_method][select_method][vector_method])
|
||||
modele_reg = sfun.train_regression_model(v_final_train[norm_method][select_method][vector_method],train_dataset_response)
|
||||
score_reg = modele_reg.predict(v_final_test[norm_method][select_method][vector_method])
|
||||
for vect_m in range(0,3):
|
||||
modele_nb = sfun.train_naive_model(v_final_train[norm_m][sel_m][vect_m],train_dataset_response)
|
||||
score_nb = modele_nb.predict(v_final_test[norm_m][sel_m][vect_m])
|
||||
modele_reg = sfun.train_regression_model(v_final_train[norm_m][sel_m][vect_m],train_dataset_response)
|
||||
score_reg = modele_reg.predict(v_final_test[norm_m][sel_m][vect_m])
|
||||
modeles_vector_nb.append(modele_nb)
|
||||
scores_vector_nb.append(score_nb)
|
||||
modeles_vector_reg.append(modele_reg)
|
||||
scores_vector_reg.append(score_reg)
|
||||
modeles_select_vector_nb.append(modeles_vector_nb)
|
||||
|
@ -186,4 +208,46 @@ if __name__ == '__main__':
|
|||
modeles_nb.append(modeles_select_vector_nb)
|
||||
scores_nb.append(scores_select_vector_nb)
|
||||
modeles_reg.append(modeles_select_vector_reg)
|
||||
scores_reg.append(scores_select_vector_reg)
|
||||
scores_reg.append(scores_select_vector_reg)
|
||||
|
||||
# Performance des modèles
|
||||
|
||||
def class_metrics(test_y, pred_y):
    """Return the classification metrics for one set of predictions.

    Args:
        test_y: the reference labels.
        pred_y: the predicted labels to evaluate against ``test_y``.

    Returns:
        A three-element list ``[precision, recall, accuracy]``, in that
        order, as computed by the sklearn.metrics scorers.
    """
    # Order matters: callers concatenate this list into table rows whose
    # columns are precision, recall, accuracy.
    scorers = (precision_score, recall_score, accuracy_score)
    return [scorer(test_y, pred_y) for scorer in scorers]
|
||||
|
||||
cm_nb = []
|
||||
cm_reg = []
|
||||
for norm_m in range(0,2):
|
||||
cm_sel_vect_nb = []
|
||||
cm_sel_vect_reg = []
|
||||
for sel_m in range(0,3):
|
||||
cm_vect_nb = []
|
||||
cm_vect_reg = []
|
||||
for vect_m in range(0,3):
|
||||
cm_vect_nb.append(class_metrics(test_dataset_response,scores_nb[norm_m][sel_m][vect_m]))
|
||||
cm_vect_reg.append(class_metrics(test_dataset_response,scores_reg[norm_m][sel_m][vect_m]))
|
||||
cm_sel_vect_nb.append(cm_vect_nb)
|
||||
cm_sel_vect_reg.append(cm_vect_reg)
|
||||
cm_nb.append(cm_sel_vect_nb)
|
||||
cm_reg.append(cm_sel_vect_reg)
|
||||
|
||||
table_metriques = []
|
||||
for norm_m in range(0,2):
|
||||
for sel_m in range(0,3):
|
||||
for vect_m in range(0,3):
|
||||
table_metriques.append([norm_names.get(norm_m),
|
||||
select_names.get(sel_m),
|
||||
vectorizer_names.get(vect_m)]+
|
||||
cm_nb[norm_m][sel_m][vect_m]+
|
||||
cm_reg[norm_m][sel_m][vect_m])
|
||||
|
||||
table_metriques_df = pd.DataFrame(table_metriques,columns=['Normalisation',
|
||||
'Sélection',
|
||||
'Vectorisation',
|
||||
'Précision NB',
|
||||
'Rappel NB',
|
||||
'Exactitude NB',
|
||||
'Précision RL',
|
||||
'Rappel RL',
|
||||
'Exactitude RL'])
|
||||
|
||||
|
|
Loading…
Reference in a new issue