Functional code for question 3
Signed-off-by: François Pelletier <francois@francoispelletier.org>
parent f48c6823b0
commit 4d2b030782
2 changed files with 108 additions and 17 deletions
.gitignore (vendored): 1 addition
@@ -1,5 +1,6 @@
.idea/
data/
images/
*.txt
*.pdf

@@ -1,10 +1,15 @@
# -*- coding: utf-8 -*-

from numpy import mean  # used to average cross-validation scores

import matplotlib.pyplot as plt
import seaborn as sn
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.utils.multiclass import unique_labels

# For the report

training_questions_fn = "./data/questions-t3.txt"
test_questions_fn = "./data/test-questions-t3.txt"

@@ -20,6 +25,15 @@ def run_question_classification(training_fn, test_fn):
    print("Accuracy on test set: {0:.4f}".format(accuracy_test))


def custom_count_vectorize_questions(min_df, max_df, use_stop_words):
    # Build a word-level CountVectorizer, optionally filtering English stop words
    stop_words = 'english' if use_stop_words else None
    count_vec = CountVectorizer(analyzer='word',
                                min_df=min_df,
                                max_df=max_df,
                                stop_words=stop_words)
    return count_vec


def train_and_test_classifier(training_fn, test_fn):
    questions, labels = load_dataset(training_fn)
    print("Number of training questions:", len(questions))

@@ -30,25 +44,101 @@ def train_and_test_classifier(training_fn, test_fn):
    # Your code...

    mnb = MultinomialNB()

    # Select the best parameters for the CountVectorizer with a grid
    # search and cross-validation of the multinomial Naive Bayes model
    avg_score = {}
    for min_df in range(1, 21):
        for max_df in [x / 200.0 for x in range(1, 21)]:
            for use_stop_words in [True, False]:
                count_vec = custom_count_vectorize_questions(min_df, max_df, use_stop_words)
                count_questions = count_vec.fit_transform(questions)
                mnb_fit = cross_validate(mnb, count_questions, labels, cv=5)
                # Average cross-validation accuracy for this combination
                avg_score[(min_df, max_df, use_stop_words)] = mean(mnb_fit['test_score'])

    # Best hyperparameters for the CountVectorizer
    min_df_opt, max_df_opt, use_stop_words_opt = max(avg_score, key=avg_score.get)

    print("\nThe optimal parameters for the CountVectorizer are:\n" +
          'min_df: ' + str(min_df_opt) + "\n" +
          'max_df: ' + str(max_df_opt) + "\n" +
          'use_stop_words: ' + str(use_stop_words_opt))

    # The best CountVectorizer can then be used to prepare the data for the Naive Bayes model
    count_vec_opt = custom_count_vectorize_questions(min_df_opt, max_df_opt, use_stop_words_opt)
    count_questions_train_opt = count_vec_opt.fit_transform(questions)
    count_questions_test_opt = count_vec_opt.transform(test_questions)

    print("\nGrid search for the alpha hyperparameter:")
    gvc = GridSearchCV(mnb, param_grid={'alpha': [x / 10.0 for x in range(1, 21)]}, cv=5)
    mnb_fit = gvc.fit(count_questions_train_opt, labels)
    print("Score: " + str(mnb_fit.best_score_))
    print("Parameters: " + str(mnb_fit.best_params_))

    u_labels = unique_labels(labels, test_labels)

    labels_predict_train = mnb_fit.predict(count_questions_train_opt)
    labels_predict_test = mnb_fit.predict(count_questions_test_opt)

    accuracy_train = accuracy_score(y_true=labels, y_pred=labels_predict_train)
    accuracy_test = accuracy_score(y_true=test_labels, y_pred=labels_predict_test)

    # Section for the report
    # Confusion matrix for the training set
    cm_predict_train = confusion_matrix(y_true=labels, y_pred=labels_predict_train, labels=u_labels)
    df_cm_predict_train = DataFrame(cm_predict_train, index=u_labels, columns=u_labels)

    f, ax = plt.subplots(figsize=(9, 9))
    plt.subplots_adjust(top=0.961,
                        bottom=0.169,
                        left=0.169,
                        right=0.983,
                        hspace=0.2,
                        wspace=0.2)
    sn_plot_cm_predict_train = sn.heatmap(df_cm_predict_train,
                                          cmap='Oranges',
                                          annot=True,
                                          fmt="d",
                                          ax=ax)
    sn_plot_cm_predict_train.set(title="NB model performance on the training set",
                                 xlabel="Predicted",
                                 ylabel="Observed")
    sn_plot_cm_predict_train.get_figure().savefig("images/3_seaborn_df_cm_predict_train.png")

    # Confusion matrix for the test set
    cm_predict_test = confusion_matrix(y_true=test_labels, y_pred=labels_predict_test, labels=u_labels)
    df_cm_predict_test = DataFrame(cm_predict_test, index=u_labels, columns=u_labels)

    f, ax2 = plt.subplots(figsize=(9, 9))
    plt.subplots_adjust(top=0.961,
                        bottom=0.169,
                        left=0.169,
                        right=0.983,
                        hspace=0.2,
                        wspace=0.2)
    sn_plot_cm_predict_test = sn.heatmap(df_cm_predict_test,
                                         cmap='Oranges',
                                         annot=True,
                                         fmt="d",
                                         ax=ax2)
    sn_plot_cm_predict_test.set(title="NB model performance on the test set",
                                xlabel="Predicted",
                                ylabel="Observed")
    sn_plot_cm_predict_test.get_figure().savefig("images/3_seaborn_df_cm_predict_test.png")

    return accuracy_train, accuracy_test
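
As a side note, the search this commit implements with explicit loops can also be expressed with a scikit-learn Pipeline, so that GridSearchCV tunes the CountVectorizer parameters and the Naive Bayes alpha jointly and refits the vectorizer inside every cross-validation fold (the committed code fits the vectorizer on the full training set before cross-validating, which lets vocabulary leak across folds). This is a minimal sketch, not part of the commit: grid_search_pipeline is a hypothetical helper name, and questions/labels are assumed to come from the same load_dataset function used above.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


def grid_search_pipeline(questions, labels):
    # Chain vectorizer and classifier so each CV fold refits both steps
    pipe = Pipeline([('vec', CountVectorizer(analyzer='word')),
                     ('nb', MultinomialNB())])
    # Same grid as the commit, expressed with step__parameter names
    param_grid = {
        'vec__min_df': list(range(1, 21)),
        'vec__max_df': [x / 200.0 for x in range(1, 21)],
        'vec__stop_words': [None, 'english'],
        'nb__alpha': [x / 10.0 for x in range(1, 21)],
    }
    search = GridSearchCV(pipe, param_grid=param_grid, cv=5)
    search.fit(questions, labels)
    return search.best_params_, search.best_score_

Note that this grid is large (20 x 20 x 2 x 20 combinations), so in practice one would thin it out or pass n_jobs=-1 to GridSearchCV to parallelize the search.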