diff --git a/.gitignore b/.gitignore
index 52abb23..2b626b8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 .idea/
 data/
+images/
 *.txt
 *.pdf
 
diff --git a/t3_classification_questions.py b/t3_classification_questions.py
index d916cf0..55f3130 100644
--- a/t3_classification_questions.py
+++ b/t3_classification_questions.py
@@ -1,10 +1,17 @@
 # -*- coding: utf-8 -*-
+import matplotlib.pyplot as plt
+import os
+import seaborn as sn
+from pandas import DataFrame
+from statistics import mean  # mean() is called below but was never imported
 from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.naive_bayes import MultinomialNB
 from sklearn.metrics import confusion_matrix, accuracy_score
+from sklearn.model_selection import cross_validate, GridSearchCV
+from sklearn.naive_bayes import MultinomialNB
 from sklearn.utils.multiclass import unique_labels
-from sklearn.model_selection import cross_validate
+
+# For the report
 
 training_questions_fn = "./data/questions-t3.txt"
 test_questions_fn = "./data/test-questions-t3.txt"
 
@@ -20,6 +27,15 @@ def run_question_classification(training_fn, test_fn):
     print("Accuracy on test set: {0:.4f}".format(accuracy_test))
 
 
+def custom_count_vectorize_questions(min_df, max_df, use_stop_words):
+    stop_words = 'english' if use_stop_words else None
+    count_vec = CountVectorizer(analyzer='word',
+                                min_df=min_df,
+                                max_df=max_df,
+                                stop_words=stop_words)
+    return count_vec
+
+
 def train_and_test_classifier(training_fn, test_fn):
     questions, labels = load_dataset(training_fn)
     print("Nb training questions:", len(questions))
@@ -30,25 +46,101 @@ def train_and_test_classifier(training_fn, test_fn):
 
     # Your code...
     mnb = MultinomialNB()
+
+    # Select the best parameters for the CountVectorizer
+    # with a grid search, scored by cross-validation
+    # of the multinomial Naive Bayes model
+
     avg_score = {}
-    for min_df in range(1, 5):
-        for max_df in [x/200.0 for x in range(1, 11)]:
-            count_vec = CountVectorizer(strip_accents='unicode',
-                                        # stop_words='english',
-                                        analyzer='word',
-                                        min_df=min_df,
-                                        max_df=max_df)
-            count_questions = count_vec.fit_transform(questions)
-            mnb_fit = cross_validate(mnb, count_questions, labels, cv=10)
-            avg_score[(min_df, max_df)] = mean(mnb_fit['test_score'])
+    for min_df in range(1, 21):
+        for max_df in [x / 200.0 for x in range(1, 21)]:
+            for use_stop_words in [True, False]:
+                count_vec = custom_count_vectorize_questions(min_df, max_df, use_stop_words)
+                count_questions = count_vec.fit_transform(questions)
+                mnb_fit = cross_validate(mnb, count_questions, labels, cv=5)
+                avg_score[(min_df, max_df, use_stop_words)] = mean(mnb_fit['test_score'])
+
+    # Best hyperparameters for the CountVectorizer
+    min_df_opt, max_df_opt, use_stop_words_opt = max(avg_score, key=avg_score.get)
+
+    print("\nThe optimal parameters for the CountVectorizer are:\n" +
+          "min_df: " + str(min_df_opt) + "\n" +
+          "max_df: " + str(max_df_opt) + "\n" +
+          "use_stop_words: " + str(use_stop_words_opt))
+
+    # The best CountVectorizer can then be used to prepare the data for the Naive Bayes model
+
+    count_vec_opt = custom_count_vectorize_questions(min_df_opt, max_df_opt, use_stop_words_opt)
+    count_questions_train_opt = count_vec_opt.fit_transform(questions)
+    count_questions_test_opt = count_vec_opt.transform(test_questions)
+
+    print("\nSearching for the alpha hyperparameter:")
+
+    gvc = GridSearchCV(mnb, param_grid={'alpha': [x / 10.0 for x in range(1, 21)]}, cv=5)
+
+    mnb_fit = gvc.fit(count_questions_train_opt, labels)
+
+    print("Score: " + str(mnb_fit.best_score_))
+
+    print("Parameters: " + str(mnb_fit.best_params_))
+
+    u_labels = unique_labels(labels, test_labels)
+
+    labels_predict_train = mnb_fit.predict(count_questions_train_opt)
+    labels_predict_test = mnb_fit.predict(count_questions_test_opt)
 
-    labels_predict_train = mnb_fit.predict(count_questions)
-    cm_predict = confusion_matrix(y_true=labels,
-                                  y_pred=labels_predict_train,
-                                  labels=unique_labels(labels, labels_predict_train))
     accuracy_train = accuracy_score(y_true=labels, y_pred=labels_predict_train)
+    accuracy_test = accuracy_score(y_true=test_labels, y_pred=labels_predict_test)
+
+    # Section for the report
+    # Confusion matrix for the training set
+    os.makedirs("images", exist_ok=True)  # images/ is gitignored, so it may not exist yet
+    cm_predict_train = confusion_matrix(y_true=labels, y_pred=labels_predict_train, labels=u_labels)
+    df_cm_predict_train = DataFrame(cm_predict_train, index=u_labels, columns=u_labels)
+
+    f, ax = plt.subplots(figsize=(9, 9))
+
+    plt.subplots_adjust(top=0.961,
+                        bottom=0.169,
+                        left=0.169,
+                        right=0.983,
+                        hspace=0.2,
+                        wspace=0.2)
+
+    sn_plot_cm_predict_train = sn.heatmap(df_cm_predict_train,
+                                          cmap='Oranges',
+                                          annot=True,
+                                          fmt="d",
+                                          ax=ax)
+
+    sn_plot_cm_predict_train.set(title="NB model performance on the training set",
+                                 xlabel="Predicted",
+                                 ylabel="Observed")
+    sn_plot_cm_predict_train.get_figure().savefig("images/3_seaborn_df_cm_predict_train.png")
+
+    # Confusion matrix for the test set
+    cm_predict_test = confusion_matrix(y_true=test_labels, y_pred=labels_predict_test, labels=u_labels)
+    df_cm_predict_test = DataFrame(cm_predict_test, index=u_labels, columns=u_labels)
+
+    f, ax2 = plt.subplots(figsize=(9, 9))
+
+    plt.subplots_adjust(top=0.961,
+                        bottom=0.169,
+                        left=0.169,
+                        right=0.983,
+                        hspace=0.2,
+                        wspace=0.2)
+
+    sn_plot_cm_predict_test = sn.heatmap(df_cm_predict_test,
+                                         cmap='Oranges',
+                                         annot=True,
+                                         fmt="d",
+                                         ax=ax2)
+
+    sn_plot_cm_predict_test.set(title="NB model performance on the test set",
+                                xlabel="Predicted",
+                                ylabel="Observed")
+    sn_plot_cm_predict_test.get_figure().savefig("images/3_seaborn_df_cm_predict_test.png")
 
-    accuracy_test = 0.8  # To be modified
     return accuracy_train, accuracy_test
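
Reviewer note, not part of the patch: the two searches above (CountVectorizer settings via hand-rolled cross_validate loops, then alpha via GridSearchCV) could be folded into a single GridSearchCV over a scikit-learn Pipeline. That tunes all four hyperparameters jointly and re-fits the vectorizer inside each cross-validation fold, whereas calling fit_transform on the full training set before cross_validate lets document frequencies leak across folds. A minimal sketch; the questions/labels below are placeholder data standing in for load_dataset() output, and the grids are illustrative rather than the ones used in the patch:

# Hypothetical sketch: joint search over vectorizer and classifier parameters.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Placeholder data; in the real script these would come from load_dataset().
questions = ["What is the capital of France?",
             "Who wrote Hamlet?",
             "When did World War II end?",
             "Where is Mount Everest located?"] * 5
labels = ["LOC", "HUM", "NUM", "LOC"] * 5

pipeline = Pipeline([("vect", CountVectorizer(analyzer="word")),
                     ("nb", MultinomialNB())])

# Step-prefixed keys ("vect__", "nb__") route each parameter to its pipeline step.
param_grid = {"vect__min_df": [1, 2],
              "vect__max_df": [0.5, 1.0],
              "vect__stop_words": [None, "english"],
              "nb__alpha": [0.5, 1.0, 2.0]}

search = GridSearchCV(pipeline, param_grid=param_grid, cv=3)
search.fit(questions, labels)
print("Best score:", search.best_score_)
print("Best parameters:", search.best_params_)

Since GridSearchCV refits the best pipeline on the full training data by default (refit=True), search.predict() accepts raw test questions directly, with no separate transform step.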