From f48c6823b0eede823d3771e522c86136b0ae3306 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Pelletier?=
Date: Sat, 28 Sep 2019 00:47:43 -0400
Subject: [PATCH] =?UTF-8?q?exp=C3=A9rimentation=20avec=20les=20fonctions?=
 =?UTF-8?q?=20n=C3=A9cessaires?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 t3_classification_questions.py | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/t3_classification_questions.py b/t3_classification_questions.py
index e7967d8..d916cf0 100644
--- a/t3_classification_questions.py
+++ b/t3_classification_questions.py
@@ -2,11 +2,19 @@
 
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.naive_bayes import MultinomialNB
+from sklearn.metrics import confusion_matrix, accuracy_score
+from sklearn.utils.multiclass import unique_labels
+from sklearn.model_selection import cross_validate
 
 training_questions_fn = "./data/questions-t3.txt"
 test_questions_fn = "./data/test-questions-t3.txt"
 
 
+def mean(numbers):
+    """Return the arithmetic mean of numbers, or 0.0 when it is empty."""
+    return float(sum(numbers)) / max(len(numbers), 1)
+
+
 def run_question_classification(training_fn, test_fn):
     accuracy_train, accuracy_test = train_and_test_classifier(training_fn, test_fn)
     print("Accuracy on training set: {0:.4f}".format(accuracy_train))
@@ -22,7 +30,36 @@ def train_and_test_classifier(training_fn, test_fn):
     # Insérer ici votre code pour la classification des questions.
     # Votre code...
 
-    accuracy_train = 0.8 # A modifier
+    mnb = MultinomialNB()
+    avg_score = {}
+    # Score every (min_df, max_df) vocabulary cut-off pair with 10-fold CV.
+    for min_df in range(1, 5):
+        for max_df in [x/200.0 for x in range(1, 11)]:
+            count_vec = CountVectorizer(strip_accents='unicode',
+                                        # stop_words='english',
+                                        analyzer='word',
+                                        min_df=min_df,
+                                        max_df=max_df)
+            count_questions = count_vec.fit_transform(questions)
+            mnb_fit = cross_validate(mnb, count_questions, labels, cv=10)
+            avg_score[(min_df, max_df)] = mean(mnb_fit['test_score'])
+
+    # cross_validate() returns a dict of scores, not a fitted estimator, so
+    # refit on the whole training set using the best-scoring cut-offs.
+    best_min_df, best_max_df = max(avg_score, key=avg_score.get)
+    count_vec = CountVectorizer(strip_accents='unicode',
+                                analyzer='word',
+                                min_df=best_min_df,
+                                max_df=best_max_df)
+    count_questions = count_vec.fit_transform(questions)
+    mnb.fit(count_questions, labels)
+
+    labels_predict_train = mnb.predict(count_questions)
+    cm_predict = confusion_matrix(y_true=labels,
+                                  y_pred=labels_predict_train,
+                                  labels=unique_labels(labels, labels_predict_train))
+    accuracy_train = accuracy_score(y_true=labels, y_pred=labels_predict_train)
+
     accuracy_test = 0.8 # A modifier
     return accuracy_train, accuracy_test
 