diff --git a/sentiment_analysis.py b/sentiment_analysis.py
index 2854d8b..c016134 100644
--- a/sentiment_analysis.py
+++ b/sentiment_analysis.py
@@ -1,7 +1,9 @@
 # -*- coding: utf-8 -*-
 import json
 import sentiment_analysis_functions as sfun
+from sklearn.metrics import accuracy_score, recall_score, precision_score
 from scipy.sparse import csr_matrix, hstack
+import pandas as pd
 
 # installation
 # import nltk
@@ -44,6 +46,25 @@ if __name__ == '__main__':
     test_dataset = test_positive_reviews+test_negative_reviews
     test_dataset_response = [1]*len(test_positive_reviews)+[0]*len(test_negative_reviews)
 
+
+    # Méthodes
+
+    norm_names = {
+        0:"Stemming",
+        1:"Lemmatisation"
+    }
+
+    select_names = {
+        0:"Fréquence",
+        1:"Mot-outils",
+        2:"Classe ouverte"
+    }
+
+    vectorizer_names = {
+        0:"Compteur",
+        1:"Occurrence",
+        2:"TF-IDF"
+    }
 
     # Tokenisation
 
@@ -59,19 +80,18 @@ if __name__ == '__main__':
     norm_test_tokens.append(sfun.norm_stemming(test_tokens))
     norm_test_tokens.append(sfun.norm_lemmatize(test_tokens))
 
-    # Normalize and select tokens
     norm_select_train_tokens = []
     norm_select_test_tokens = []
    norm_select_train_tokens_split = []
     norm_select_test_tokens_split = []
 
-    for norm_method in range(0,2):
+    for norm_m in range(0,2):
         # train tokens
         select_train_tokens = []
         select_train_tokens_split = []
-        nn1 = sfun.select_freq(norm_train_tokens[norm_method])
-        nn2 = sfun.select_rem_stopwords(norm_train_tokens[norm_method])
-        nn3 = sfun.select_open_class(norm_train_tokens[norm_method])
+        nn1 = sfun.select_freq(norm_train_tokens[norm_m])
+        nn2 = sfun.select_rem_stopwords(norm_train_tokens[norm_m])
+        nn3 = sfun.select_open_class(norm_train_tokens[norm_m])
         select_train_tokens_split.append(nn1)
         select_train_tokens_split.append(nn2)
         select_train_tokens_split.append(nn3)
@@ -83,9 +103,9 @@ if __name__ == '__main__':
         # test tokens
         select_test_tokens = []
         select_test_tokens_split = []
-        nn1 = sfun.select_freq(norm_test_tokens[norm_method])
-        nn2 = sfun.select_rem_stopwords(norm_test_tokens[norm_method])
-        nn3 = sfun.select_open_class(norm_test_tokens[norm_method])
+        nn1 = sfun.select_freq(norm_test_tokens[norm_m])
+        nn2 = sfun.select_rem_stopwords(norm_test_tokens[norm_m])
+        nn3 = sfun.select_open_class(norm_test_tokens[norm_m])
         select_test_tokens_split.append(nn1)
         select_test_tokens_split.append(nn2)
         select_test_tokens_split.append(nn3)
@@ -97,25 +117,25 @@ if __name__ == '__main__':
 
     # Création des objets Vectorizer (Train seulement)
     norm_select_vectorizers = []
-    for norm_method in range(0,2):
+    for norm_m in range(0,2):
         select_vectorizers = []
-        for select_method in range(0,3):
-            vectorizers = sfun.get_vectorizers(norm_select_train_tokens[norm_method][select_method])
+        for sel_m in range(0,3):
+            vectorizers = sfun.get_vectorizers(norm_select_train_tokens[norm_m][sel_m])
             select_vectorizers.append(vectorizers)
         norm_select_vectorizers.append(select_vectorizers)
 
     # Transformation des jetons en vecteurs
     v_norm_select_vectors_train = []
     v_norm_select_vectors_test = []
-    for norm_method in range(0,2):
+    for norm_m in range(0,2):
         v_select_vectors_train = []
         v_select_vectors_test = []
-        for select_method in range(0,3):
+        for sel_m in range(0,3):
             v_vectors_train = []
             v_vectors_test = []
-            for vector_method in range(0,3):
-                v_vector_train = norm_select_vectorizers[norm_method][select_method][vector_method].transform(norm_select_train_tokens[norm_method][select_method])
-                v_vector_test = norm_select_vectorizers[norm_method][select_method][vector_method].transform(norm_select_test_tokens[norm_method][select_method])
+            for vect_m in range(0,3):
+                v_vector_train = norm_select_vectorizers[norm_m][sel_m][vect_m].transform(norm_select_train_tokens[norm_m][sel_m])
+                v_vector_test = norm_select_vectorizers[norm_m][sel_m][vect_m].transform(norm_select_test_tokens[norm_m][sel_m])
                 v_vectors_train.append(v_vector_train)
                 v_vectors_test.append(v_vector_test)
             v_select_vectors_train.append(v_vectors_train)
@@ -126,12 +146,12 @@ if __name__ == '__main__':
     # Ajout des attributs
     v_norm_select_polarity_count_train = []
     v_norm_select_polarity_count_test = []
-    for norm_method in range(0,2):
+    for norm_m in range(0,2):
         v_select_polarity_count_train = []
         v_select_polarity_count_test = []
-        for select_method in range(0,3):
-            v_polarity_count_train = sfun.attribute_polarity_count(norm_select_train_tokens_split[norm_method][select_method])
-            v_polarity_count_test = sfun.attribute_polarity_count(norm_select_test_tokens_split[norm_method][select_method])
+        for sel_m in range(0,3):
+            v_polarity_count_train = sfun.attribute_polarity_count(norm_select_train_tokens_split[norm_m][sel_m])
+            v_polarity_count_test = sfun.attribute_polarity_count(norm_select_test_tokens_split[norm_m][sel_m])
             v_select_polarity_count_train.append(csr_matrix(v_polarity_count_train))
             v_select_polarity_count_test.append(csr_matrix(v_polarity_count_test))
         v_norm_select_polarity_count_train.append(v_select_polarity_count_train)
@@ -140,17 +160,17 @@ if __name__ == '__main__':
     # Création des matrices finales
     v_final_train = []
     v_final_test = []
-    for norm_method in range(0,2):
+    for norm_m in range(0,2):
         v_select_final_train = []
         v_select_final_test = []
-        for select_method in range(0,3):
+        for sel_m in range(0,3):
             v_vector_final_train = []
             v_vector_final_test = []
-            for vector_method in range(0,3):
-                v_vector_final_train.append(hstack([v_norm_select_vectors_train[norm_method][select_method][vector_method],
-                                                    v_norm_select_polarity_count_train[norm_method][select_method]]))
-                v_vector_final_test.append(hstack([v_norm_select_vectors_test[norm_method][select_method][vector_method],
-                                                   v_norm_select_polarity_count_test[norm_method][select_method]]))
+            for vect_m in range(0,3):
+                v_vector_final_train.append(hstack([v_norm_select_vectors_train[norm_m][sel_m][vect_m],
+                                                    v_norm_select_polarity_count_train[norm_m][sel_m]]))
+                v_vector_final_test.append(hstack([v_norm_select_vectors_test[norm_m][sel_m][vect_m],
+                                                   v_norm_select_polarity_count_test[norm_m][sel_m]]))
             v_select_final_train.append(v_vector_final_train)
             v_select_final_test.append(v_vector_final_test)
         v_final_train.append(v_select_final_train)
@@ -162,21 +182,23 @@ if __name__ == '__main__':
     scores_nb = []
     modeles_reg = []
     scores_reg = []
-    for norm_method in range(0,2):
+    for norm_m in range(0,2):
         modeles_select_vector_nb = []
         scores_select_vector_nb = []
         modeles_select_vector_reg = []
         scores_select_vector_reg = []
-        for select_method in range(0,3):
+        for sel_m in range(0,3):
             modeles_vector_nb = []
             scores_vector_nb = []
             modeles_vector_reg = []
             scores_vector_reg = []
-            for vector_method in range(0,3):
-                modele_nb = sfun.train_naive_model(v_final_train[norm_method][select_method][vector_method],train_dataset_response)
-                score_nb = modele_nb.predict(v_final_test[norm_method][select_method][vector_method])
-                modele_reg = sfun.train_regression_model(v_final_train[norm_method][select_method][vector_method],train_dataset_response)
-                score_reg = modele_reg.predict(v_final_test[norm_method][select_method][vector_method])
+            for vect_m in range(0,3):
+                modele_nb = sfun.train_naive_model(v_final_train[norm_m][sel_m][vect_m],train_dataset_response)
+                score_nb = modele_nb.predict(v_final_test[norm_m][sel_m][vect_m])
+                modele_reg = sfun.train_regression_model(v_final_train[norm_m][sel_m][vect_m],train_dataset_response)
+                score_reg = modele_reg.predict(v_final_test[norm_m][sel_m][vect_m])
+                modeles_vector_nb.append(modele_nb)
+                scores_vector_nb.append(score_nb)
                 modeles_vector_reg.append(modele_reg)
                 scores_vector_reg.append(score_reg)
             modeles_select_vector_nb.append(modeles_vector_nb)
@@ -186,4 +208,46 @@ if __name__ == '__main__':
     modeles_nb.append(modeles_select_vector_nb)
     scores_nb.append(scores_select_vector_nb)
     modeles_reg.append(modeles_select_vector_reg)
-    scores_reg.append(scores_select_vector_reg)
\ No newline at end of file
+    scores_reg.append(scores_select_vector_reg)
+
+    # Performance des modèles
+
+    def class_metrics(test_y,pred_y):
+        return [precision_score(test_y,pred_y),recall_score(test_y,pred_y),accuracy_score(test_y,pred_y)]
+
+    cm_nb = []
+    cm_reg = []
+    for norm_m in range(0,2):
+        cm_sel_vect_nb = []
+        cm_sel_vect_reg = []
+        for sel_m in range(0,3):
+            cm_vect_nb = []
+            cm_vect_reg = []
+            for vect_m in range(0,3):
+                cm_vect_nb.append(class_metrics(test_dataset_response,scores_nb[norm_m][sel_m][vect_m]))
+                cm_vect_reg.append(class_metrics(test_dataset_response,scores_reg[norm_m][sel_m][vect_m]))
+            cm_sel_vect_nb.append(cm_vect_nb)
+            cm_sel_vect_reg.append(cm_vect_reg)
+        cm_nb.append(cm_sel_vect_nb)
+        cm_reg.append(cm_sel_vect_reg)
+
+    table_metriques = []
+    for norm_m in range(0,2):
+        for sel_m in range(0,3):
+            for vect_m in range(0,3):
+                table_metriques.append([norm_names.get(norm_m),
+                                        select_names.get(sel_m),
+                                        vectorizer_names.get(vect_m)]+
+                                       cm_nb[norm_m][sel_m][vect_m]+
+                                       cm_reg[norm_m][sel_m][vect_m])
+
+    table_metriques_df = pd.DataFrame(table_metriques,columns=['Normalisation',
+                                                               'Sélection',
+                                                               'Vectorisation',
+                                                               'Précision NB',
+                                                               'Rappel NB',
+                                                               'Exactitude NB',
+                                                               'Précision RL',
+                                                               'Rappel RL',
+                                                               'Exactitude RL'])
+
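Note on using the new metrics table: table_metriques_df gathers precision, recall, and accuracy for each of the 18 (normalisation, sélection, vectorisation) combinations, for both the naive Bayes (NB) and logistic regression (RL) models. The sketch below is a minimal, hypothetical illustration of how such a table could be ranked and exported once the script has run; it assumes only the column names introduced in this diff, the rank_configs helper is not part of the patch, and the two rows are placeholder values rather than real results.

    import pandas as pd

    # Hypothetical helper -- not part of the diff above.
    def rank_configs(df: pd.DataFrame, metric: str = 'Exactitude NB') -> pd.DataFrame:
        """Sort the metrics table from best to worst on one metric column."""
        return df.sort_values(metric, ascending=False)[['Normalisation', 'Sélection', 'Vectorisation', metric]]

    if __name__ == '__main__':
        # Two placeholder rows standing in for the 18 real combinations.
        demo = pd.DataFrame(
            [['Stemming', 'Fréquence', 'TF-IDF', 0.81, 0.79, 0.80, 0.84, 0.82, 0.83],
             ['Lemmatisation', 'Mot-outils', 'Compteur', 0.78, 0.80, 0.79, 0.80, 0.81, 0.80]],
            columns=['Normalisation', 'Sélection', 'Vectorisation',
                     'Précision NB', 'Rappel NB', 'Exactitude NB',
                     'Précision RL', 'Rappel RL', 'Exactitude RL'])
        print(rank_configs(demo, metric='Exactitude RL'))
        demo.to_csv('metriques.csv', index=False)  # optional export, e.g. for a report

As a side note, the repeated for norm_m / for sel_m / for vect_m nests could equivalently be written as a single loop over itertools.product(range(2), range(3), range(3)); behaviour would be unchanged, so that is purely a readability option.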