Functional code for question 3

Signed-off-by: François Pelletier <francois@francoispelletier.org>
François Pelletier 2019-09-28 17:41:51 -04:00
parent f48c6823b0
commit 4d2b030782
2 changed files with 108 additions and 17 deletions

.gitignore

@@ -1,5 +1,6 @@
.idea/
data/
images/
*.txt
*.pdf


@@ -1,10 +1,15 @@
# -*- coding: utf-8 -*-
from statistics import mean

import matplotlib.pyplot as plt
import seaborn as sn
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils.multiclass import unique_labels
# For the report
training_questions_fn = "./data/questions-t3.txt"
test_questions_fn = "./data/test-questions-t3.txt"
@@ -20,6 +25,15 @@ def run_question_classification(training_fn, test_fn):
print("Accuracy on test set: {0:.4f}".format(accuracy_test))
def custom_count_vectorize_questions(min_df, max_df, use_stop_words):
stop_words = [None, 'english'][use_stop_words]
count_vec = CountVectorizer(analyzer='word',
min_df=min_df,
max_df=max_df,
stop_words=stop_words)
return count_vec
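For context, a minimal usage sketch of this helper on a toy corpus (the example questions are invented, not taken from the assignment data):

docs = ["What is the capital of France ?",
        "Who wrote Les Misérables ?",
        "What is the capital of Spain ?"]
# Same construction as custom_count_vectorize_questions(1, 1.0, True)
vec = CountVectorizer(analyzer='word', min_df=1, max_df=1.0, stop_words='english')
X = vec.fit_transform(docs)
print(X.shape)  # (3, 6): 'capital', 'france', 'les', 'misérables', 'spain', 'wrote'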
def train_and_test_classifier(training_fn, test_fn):
    questions, labels = load_dataset(training_fn)
    print("Number of training questions:", len(questions))
@@ -30,25 +44,101 @@ def train_and_test_classifier(training_fn, test_fn):
    # Your code...
    mnb = MultinomialNB()
    # Select the best parameters for the CountVectorizer with a grid search
    # and cross-validation of the multinomial Naive Bayes model.
    # (Note: scikit-learn raises a ValueError when max_df, given as a
    # proportion, covers fewer documents than min_df, so very small corpora
    # may need a narrower grid.)
    avg_score = {}
    for min_df in range(1, 21):
        for max_df in [x / 200.0 for x in range(1, 21)]:
            for use_stop_words in [True, False]:
                count_vec = custom_count_vectorize_questions(min_df, max_df, use_stop_words)
                count_questions = count_vec.fit_transform(questions)
                mnb_fit = cross_validate(mnb, count_questions, labels, cv=5)
                avg_score[(min_df, max_df, use_stop_words)] = mean(mnb_fit['test_score'])
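An equivalent, more idiomatic alternative to the manual loop above (not what this commit does) would be to put the vectorizer and the classifier in a scikit-learn Pipeline and let GridSearchCV search both stages at once; a sketch under the same parameter ranges, with step names 'vec' and 'nb' of my own choosing:

from sklearn.pipeline import Pipeline

pipe = Pipeline([('vec', CountVectorizer(analyzer='word')),
                 ('nb', MultinomialNB())])
param_grid = {'vec__min_df': list(range(1, 21)),
              'vec__max_df': [x / 200.0 for x in range(1, 21)],
              'vec__stop_words': [None, 'english']}
search = GridSearchCV(pipe, param_grid=param_grid, cv=5)
# search.fit(questions, labels); search.best_params_ would then replace
# the avg_score bookkeeping above.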
    # Best hyperparameters for the CountVectorizer
    min_df_opt, max_df_opt, use_stop_words_opt = max(avg_score, key=avg_score.get)
    print("\nThe optimal parameters for the CountVectorizer are:\n" +
          'min_df: ' + str(min_df_opt) + "\n" +
          'max_df: ' + str(max_df_opt) + "\n" +
          'use_stop_words: ' + str(use_stop_words_opt))
    # We can then use the best CountVectorizer to prepare the data for the Naive Bayes model
    count_vec_opt = custom_count_vectorize_questions(min_df_opt, max_df_opt, use_stop_words_opt)
    count_questions_train_opt = count_vec_opt.fit_transform(questions)
    count_questions_test_opt = count_vec_opt.transform(test_questions)
    print("\nSearching for the alpha hyperparameter:")
    gvc = GridSearchCV(mnb, param_grid={'alpha': [x / 10.0 for x in range(1, 21)]}, cv=5)
    mnb_fit = gvc.fit(count_questions_train_opt, labels)
    print("Score: " + str(mnb_fit.best_score_))
    print("Parameters: " + str(mnb_fit.best_params_))
    u_labels = unique_labels(labels, test_labels)
    labels_predict_train = mnb_fit.predict(count_questions_train_opt)
    labels_predict_test = mnb_fit.predict(count_questions_test_opt)
    accuracy_train = accuracy_score(y_true=labels, y_pred=labels_predict_train)
    accuracy_test = accuracy_score(y_true=test_labels, y_pred=labels_predict_test)
    # Report section
    # Confusion matrix for the training set
    cm_predict_train = confusion_matrix(y_true=labels, y_pred=labels_predict_train, labels=u_labels)
    df_cm_predict_train = DataFrame(cm_predict_train, index=u_labels, columns=u_labels)
    f, ax = plt.subplots(figsize=(9, 9))
    plt.subplots_adjust(top=0.961,
                        bottom=0.169,
                        left=0.169,
                        right=0.983,
                        hspace=0.2,
                        wspace=0.2)
    sn_plot_cm_predict_train = sn.heatmap(df_cm_predict_train,
                                          cmap='Oranges',
                                          annot=True,
                                          fmt="d",
                                          ax=ax)
    sn_plot_cm_predict_train.set(title="NB model performance on the training set",
                                 xlabel="Predicted",
                                 ylabel="Observed")
    sn_plot_cm_predict_train.get_figure().savefig("images/3_seaborn_df_cm_predict_train.png")
    # Confusion matrix for the test set
    cm_predict_test = confusion_matrix(y_true=test_labels, y_pred=labels_predict_test, labels=u_labels)
    df_cm_predict_test = DataFrame(cm_predict_test, index=u_labels, columns=u_labels)
    f, ax2 = plt.subplots(figsize=(9, 9))
    plt.subplots_adjust(top=0.961,
                        bottom=0.169,
                        left=0.169,
                        right=0.983,
                        hspace=0.2,
                        wspace=0.2)
    sn_plot_cm_predict_test = sn.heatmap(df_cm_predict_test,
                                         cmap='Oranges',
                                         annot=True,
                                         fmt="d",
                                         ax=ax2)
    sn_plot_cm_predict_test.set(title="NB model performance on the test set",
                                xlabel="Predicted",
                                ylabel="Observed")
    sn_plot_cm_predict_test.get_figure().savefig("images/3_seaborn_df_cm_predict_test.png")
    return accuracy_train, accuracy_test
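The two heatmap blocks above are identical except for their inputs, title, and output path; a small helper along these lines (the function name is my own, not part of the commit) would remove the duplication:

def plot_confusion_heatmap(y_true, y_pred, u_labels, title, out_path):
    # Confusion matrix as a labelled DataFrame, drawn as an annotated heatmap
    cm = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=u_labels)
    df_cm = DataFrame(cm, index=u_labels, columns=u_labels)
    f, ax = plt.subplots(figsize=(9, 9))
    plt.subplots_adjust(top=0.961, bottom=0.169, left=0.169, right=0.983)
    plot = sn.heatmap(df_cm, cmap='Oranges', annot=True, fmt="d", ax=ax)
    plot.set(title=title, xlabel="Predicted", ylabel="Observed")
    plot.get_figure().savefig(out_path)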