Functional code for question 3
Signed-off-by: François Pelletier <francois@francoispelletier.org>
parent f48c6823b0
commit 4d2b030782
2 changed files with 108 additions and 17 deletions
.gitignore (vendored): 1 addition
@@ -1,5 +1,6 @@
.idea/
data/
images/
*.txt
*.pdf

@@ -1,10 +1,15 @@
# -*- coding: utf-8 -*-

from numpy import mean  # used to average cross-validation scores

import matplotlib.pyplot as plt
import seaborn as sn
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.utils.multiclass import unique_labels

# For the report

training_questions_fn = "./data/questions-t3.txt"
test_questions_fn = "./data/test-questions-t3.txt"

@@ -20,6 +25,15 @@ def run_question_classification(training_fn, test_fn):
    print("Accuracy on test set: {0:.4f}".format(accuracy_test))


def custom_count_vectorize_questions(min_df, max_df, use_stop_words):
    # Build a word-level CountVectorizer, optionally filtering English stop words
    stop_words = 'english' if use_stop_words else None
    count_vec = CountVectorizer(analyzer='word',
                                min_df=min_df,
                                max_df=max_df,
                                stop_words=stop_words)
    return count_vec


def train_and_test_classifier(training_fn, test_fn):
    questions, labels = load_dataset(training_fn)
    print("Number of training questions:", len(questions))

@@ -30,25 +44,101 @@ def train_and_test_classifier(training_fn, test_fn):
    # Your code...

    mnb = MultinomialNB()

    # Select the best parameters for the CountVectorizer with a grid
    # search and cross-validation of the multinomial Naive Bayes model
    avg_score = {}
    for min_df in range(1, 21):
        for max_df in [x / 200.0 for x in range(1, 21)]:
            for use_stop_words in [True, False]:
                count_vec = custom_count_vectorize_questions(min_df, max_df, use_stop_words)
                count_questions = count_vec.fit_transform(questions)
                mnb_fit = cross_validate(mnb, count_questions, labels, cv=5)
                # Average cross-validation accuracy for this combination
                avg_score[(min_df, max_df, use_stop_words)] = mean(mnb_fit['test_score'])

    # Best hyperparameters for the CountVectorizer
    min_df_opt, max_df_opt, use_stop_words_opt = max(avg_score, key=avg_score.get)

    print("\nThe optimal parameters for the CountVectorizer are:\n" +
          'min_df: ' + str(min_df_opt) + "\n" +
          'max_df: ' + str(max_df_opt) + "\n" +
          'use_stop_words: ' + str(use_stop_words_opt))

    # The best CountVectorizer can then be used to prepare the data for the Naive Bayes model
    count_vec_opt = custom_count_vectorize_questions(min_df_opt, max_df_opt, use_stop_words_opt)
    count_questions_train_opt = count_vec_opt.fit_transform(questions)
    count_questions_test_opt = count_vec_opt.transform(test_questions)

    print("\nGrid search for the alpha hyperparameter:")
    gvc = GridSearchCV(mnb, param_grid={'alpha': [x / 10.0 for x in range(1, 21)]}, cv=5)
    mnb_fit = gvc.fit(count_questions_train_opt, labels)
    print("Score: " + str(mnb_fit.best_score_))
    print("Parameters: " + str(mnb_fit.best_params_))

    u_labels = unique_labels(labels, test_labels)

    labels_predict_train = mnb_fit.predict(count_questions_train_opt)
    labels_predict_test = mnb_fit.predict(count_questions_test_opt)

    accuracy_train = accuracy_score(y_true=labels, y_pred=labels_predict_train)
    accuracy_test = accuracy_score(y_true=test_labels, y_pred=labels_predict_test)

    # Section for the report
    # Confusion matrix for the training set
    cm_predict_train = confusion_matrix(y_true=labels, y_pred=labels_predict_train, labels=u_labels)
    df_cm_predict_train = DataFrame(cm_predict_train, index=u_labels, columns=u_labels)

    f, ax = plt.subplots(figsize=(9, 9))
    plt.subplots_adjust(top=0.961,
                        bottom=0.169,
                        left=0.169,
                        right=0.983,
                        hspace=0.2,
                        wspace=0.2)
    sn_plot_cm_predict_train = sn.heatmap(df_cm_predict_train,
                                          cmap='Oranges',
                                          annot=True,
                                          fmt="d",
                                          ax=ax)
    sn_plot_cm_predict_train.set(title="NB model performance on the training set",
                                 xlabel="Predicted",
                                 ylabel="Observed")
    sn_plot_cm_predict_train.get_figure().savefig("images/3_seaborn_df_cm_predict_train.png")

    # Confusion matrix for the test set
    cm_predict_test = confusion_matrix(y_true=test_labels, y_pred=labels_predict_test, labels=u_labels)
    df_cm_predict_test = DataFrame(cm_predict_test, index=u_labels, columns=u_labels)

    f, ax2 = plt.subplots(figsize=(9, 9))
    plt.subplots_adjust(top=0.961,
                        bottom=0.169,
                        left=0.169,
                        right=0.983,
                        hspace=0.2,
                        wspace=0.2)
    sn_plot_cm_predict_test = sn.heatmap(df_cm_predict_test,
                                         cmap='Oranges',
                                         annot=True,
                                         fmt="d",
                                         ax=ax2)
    sn_plot_cm_predict_test.set(title="NB model performance on the test set",
                                xlabel="Predicted",
                                ylabel="Observed")
    sn_plot_cm_predict_test.get_figure().savefig("images/3_seaborn_df_cm_predict_test.png")

    return accuracy_train, accuracy_test
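
As a side note, the search this commit implements with explicit loops can also be expressed with a scikit-learn Pipeline, so that GridSearchCV tunes the CountVectorizer parameters and the Naive Bayes alpha jointly and refits the vectorizer inside every cross-validation fold (the committed code fits the vectorizer on the full training set before cross-validating, which lets vocabulary leak across folds). This is a minimal sketch, not part of the commit: grid_search_pipeline is a hypothetical helper name, and questions/labels are assumed to come from the same load_dataset function used above.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


def grid_search_pipeline(questions, labels):
    # Chain vectorizer and classifier so each CV fold refits both steps
    pipe = Pipeline([('vec', CountVectorizer(analyzer='word')),
                     ('nb', MultinomialNB())])
    # Same grid as the commit, expressed with step__parameter names
    param_grid = {
        'vec__min_df': list(range(1, 21)),
        'vec__max_df': [x / 200.0 for x in range(1, 21)],
        'vec__stop_words': [None, 'english'],
        'nb__alpha': [x / 10.0 for x in range(1, 21)],
    }
    search = GridSearchCV(pipe, param_grid=param_grid, cv=5)
    search.fit(questions, labels)
    return search.best_params_, search.best_score_

Note that this grid is large (20 x 20 x 2 x 20 combinations), so in practice one would thin it out or pass n_jobs=-1 to GridSearchCV to parallelize the search.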