# NOTE: reconstructed from a newline-collapsed patch of
# t2_classification_noms.py (616f06d..d30abc8). Only code visible in the patch
# hunks appears here. The rest of the file is not part of the patch and is
# referenced by name only: unicode_to_ascii, load_names, train_models,
# make_unigram, make_kgram, load_test_names, names_by_origin, kgram_models.
import os
import string
import unicodedata
import json
import math
import operator
from functools import reduce
import pprint

datafiles = "./data/names/*.txt"  # files used to build the models
test_filename = './data/test-names-t2.txt'  # file holding the test data used to evaluate the models

# Vocabulary size used by the add-one smoothing below.
# NOTE(review): presumably the 26 lowercase ASCII letters -- confirm that
# normalized names (and any padding symbols added by make_kgram) never fall
# outside that set.
VOCABULARY_SIZE = 26


# ... (load_names and other definitions are not shown in the patch) ...


def normalize_word(word):
    """Return *word* passed through unicode_to_ascii, lower-cased, spaces removed."""
    word = unicode_to_ascii(word)
    word = word.lower()
    word = word.replace(" ", "")
    return word


def smoothed_log2_probs(kgrams, k_counts, context_counts=None,
                        vocab_size=VOCABULARY_SIZE):
    """Return the base-2 log of the add-one smoothed probability of each n-gram.

    Args:
        kgrams: iterable of n-grams (any sliceable, hashable sequence).
        k_counts: mapping n-gram -> count for the model order being scored.
        context_counts: mapping (n-1)-gram -> count. When None the n-grams are
            scored as unigrams and the denominator is the total of k_counts.
        vocab_size: smoothing constant added to every denominator.

    Returns:
        A list with one log2 probability per n-gram, in input order.
    """
    if context_counts is None:
        # Unigram case: every item shares the same denominator.
        denominator = sum(k_counts.values()) + vocab_size
        return [math.log2((k_counts.get(gram, 0) + 1) / denominator)
                for gram in kgrams]
    # Higher orders: condition on the count of the n-gram minus its last item.
    return [
        math.log2((k_counts.get(gram, 0) + 1)
                  / (context_counts.get(gram[:-1], 0) + vocab_size))
        for gram in kgrams
    ]


def _name_log2_probs(name, origin, n):
    """Score every n-gram of *name* against the models trained for *origin*."""
    kgrams = make_kgram(make_unigram(name), n)
    k_counts = kgram_models[(origin, n)]
    context_counts = None if n == 1 else kgram_models[(origin, n - 1)]
    return smoothed_log2_probs(kgrams, k_counts, context_counts)


def most_probable_origin(name, n=3):
    """Return the origin whose n-gram model gives *name* the lowest perplexity.

    n is the n-gram length (n=3 means trigrams). Raises ValueError when
    names_by_origin is empty, as the original min()/max() calls did.
    """
    # For a fixed name every origin scores the same number of n-grams, so the
    # lowest perplexity and the highest log-probability pick the same origin
    # (up to floating-point ties); only the perplexity is computed.
    return min(names_by_origin, key=lambda origin: perplexity(name, origin, n))


def logprob(name, origin, n=3):
    """Return the base-2 log-probability of *name* under the *origin* model.

    The result is the sum of the add-one smoothed log2 probabilities of the
    name's n-grams; it is 0 when the name yields no n-gram.
    """
    return sum(_name_log2_probs(name, origin, n))


def prod(iterable):
    """Return the product of the items of *iterable* (1 when it is empty)."""
    return reduce(operator.mul, iterable, 1)


def perplexity(name, origin, n=3):
    """Return the perplexity of *name* under the *origin* model.

    Computed as 2 ** (-average log2 probability), which equals the geometric
    mean of the inverse n-gram probabilities without multiplying many large
    factors together. Returns math.inf when the name yields no n-gram, since
    there is nothing to average over.
    """
    log_probs = _name_log2_probs(name, origin, n)
    if not log_probs:
        return math.inf
    return 2 ** (-sum(log_probs) / len(log_probs))


# ... (load_test_names is not shown in the patch) ...


def evaluate_models(filename, n=3):
    """Classify every test name and tally the predictions per true origin.

    Args:
        filename: path of the test file, read with load_test_names.
        n: n-gram length forwarded to most_probable_origin.

    Returns:
        A dict {true_origin: {predicted_origin: count}}.
    """
    # The patch does not show where test_data is bound; loading it here from
    # `filename` keeps the function self-contained.
    test_data = load_test_names(filename)
    results = {}
    for org, name_list in test_data.items():
        tally = {}
        for name in name_list:
            # Classify once per name, with the requested n-gram length.
            predicted = most_probable_origin(name, n)
            tally[predicted] = tally.get(predicted, 0) + 1
        results[org] = tally
    return results


if __name__ == '__main__':
    # You may modify this section as you see fit.
    # ... (the first lines of this section are not shown in the patch) ...
    train_models()
    some_name = "Lamontagne"
    some_origin = most_probable_origin(some_name)
    some_logprob = logprob(some_name, some_origin)
    some_perplexity = perplexity(some_name, some_origin)
    print("\nLangue d'origine de {}: ".format(some_name), some_origin)
    print("logprob({}, {}):".format(some_name, some_origin), some_logprob)
    print("perplexity({}, {}):".format(some_name, some_origin), some_perplexity)

    # Kept from the patched script; the listing of the test data that used
    # this value was disabled by the patch.
    test_names = load_test_names(test_filename)
    res = evaluate_models(test_filename, 3)
    pprint.pprint(res)