classification des noms, algorithme fonctionnel

This commit is contained in:
François Pelletier 2019-09-22 13:32:49 -04:00
parent 6e865dd88d
commit 71adb32e5a

View file

@ -5,6 +5,10 @@ import os
import string
import unicodedata
import json
import math
import operator
from functools import reduce
import pprint
datafiles = "./data/names/*.txt" # glob pattern of the files used to build the models
test_filename = './data/test-names-t2.txt' # file containing the test data used to evaluate the models
@ -61,7 +65,7 @@ def load_names():
def normalize_word(word):
    """Return ``word`` passed through ``unicode_to_ascii``, lower-cased, with spaces removed."""
    ascii_word = unicode_to_ascii(word)
    return ascii_word.lower().replace(" ", "")
def most_probable_origin(name, n=3):
    """Return the most probable origin language of ``name``.

    Every origin in ``names_by_origin`` is scored with ``perplexity`` and
    the origin with the lowest perplexity is returned.

    Args:
        name: The name to classify.
        n: Length of the N-grams (e.g. ``n=3`` means trigrams).

    Returns:
        The key of ``names_by_origin`` whose model gives the lowest
        perplexity. Ties go to the first such key in iteration order.

    Raises:
        ValueError: If ``names_by_origin`` is empty.
    """
    # The placeholder ``return "French"`` is gone, and so is the log-prob
    # pass whose result was computed for every origin but never used.
    return min(
        names_by_origin.keys(),
        key=lambda origin: perplexity(name, origin, n),
    )
def logprob(name, origin, n=3):
    """Return the base-2 log-probability of ``name`` under an origin's model.

    The name is split into N-grams with ``make_unigram`` / ``make_kgram``.
    Each N-gram contributes ``log2((C + 1) / (N + V))`` (add-one smoothing),
    where ``C`` is the N-gram count in ``kgram_models[(origin, n)]`` and
    ``N`` is:

    * for ``n == 1``: the total of all unigram counts;
    * otherwise: the count of the N-gram's (n-1)-long prefix in
      ``kgram_models[(origin, n - 1)]``.

    Args:
        name: The name to score.
        origin: Origin whose N-gram counts are used.
        n: Length of the N-grams.

    Returns:
        The sum of the per-N-gram base-2 log-probabilities, or ``0`` when
        the name yields no N-gram.

    Raises:
        KeyError: If ``kgram_models`` has no entry for the requested keys.
    """
    unigram_name = make_unigram(name)
    kgram_name = make_kgram(unigram_name, n)
    k_counts = kgram_models[(origin, n)]
    V = 26  # smoothing vocabulary size; presumably the letters a-z -- TODO confirm

    if n == 1:
        total = sum(k_counts.values())
        return sum(
            math.log2((k_counts.get(kgram, 0) + 1) / (total + V))
            for kgram in kgram_name
        )

    k1_counts = kgram_models[(origin, n - 1)]
    return sum(
        math.log2((k_counts.get(kgram, 0) + 1) / (k1_counts.get(kgram[:-1], 0) + V))
        for kgram in kgram_name
    )
def prod(iterable):
    """Return the product of the items of ``iterable`` (``1`` when it is empty)."""
    result = 1
    for factor in iterable:
        result = result * factor
    return result
def perplexity(name, origin, n=3):
    """Return the perplexity of ``name`` under an origin's N-gram model.

    The name is split into N-grams with ``make_unigram`` / ``make_kgram``.
    Each N-gram gets the add-one smoothed probability ``(C + 1) / (N + V)``,
    where ``C`` is its count in ``kgram_models[(origin, n)]`` and ``N`` is
    the total unigram count when ``n == 1``, otherwise the count of its
    (n-1)-long prefix in ``kgram_models[(origin, n - 1)]``.

    The perplexity is the geometric mean of the inverse probabilities. It is
    computed in log space, ``2 ** (-(sum of log2 p) / count)``, so that long
    inputs cannot overflow a running product of reciprocals.

    Args:
        name: The name to score.
        origin: Origin whose N-gram counts are used.
        n: Length of the N-grams.

    Returns:
        The perplexity as a float, or ``float("inf")`` when the name yields
        no N-gram (the value is undefined for an empty sequence).

    Raises:
        KeyError: If ``kgram_models`` has no entry for the requested keys.
    """
    unigram_name = make_unigram(name)
    kgram_name = make_kgram(unigram_name, n)
    length_name = len(kgram_name)
    k_counts = kgram_models[(origin, n)]
    V = 26  # smoothing vocabulary size; presumably the letters a-z -- TODO confirm

    if n == 1:
        total = sum(k_counts.values())
        log_probs = [
            math.log2((k_counts.get(kgram, 0) + 1) / (total + V))
            for kgram in kgram_name
        ]
    else:
        k1_counts = kgram_models[(origin, n - 1)]
        log_probs = [
            math.log2(
                (k_counts.get(kgram, 0) + 1) / (k1_counts.get(kgram[:-1], 0) + V)
            )
            for kgram in kgram_name
        ]

    if length_name == 0:
        # No N-gram to average over: avoid dividing by zero.
        return float("inf")

    return 2 ** (-sum(log_probs) / length_name)
def load_test_names(filename):
@ -144,6 +206,13 @@ def evaluate_models(filename, n=3):
# À compléter - Fonction pour l'évaluation des modèles N-grammes.
# ...
results = {}
for org, name_list in test_data.items():
results[org] = {}
for name in name_list:
results[org][most_probable_origin(name)] = results[org].get(most_probable_origin(name), 0) + 1
return results
if __name__ == '__main__':
# Vous pouvez modifier cette section comme bon vous semble
@ -156,14 +225,15 @@ if __name__ == '__main__':
train_models()
some_name = "Lamontagne"
some_origin = most_probable_origin(some_name)
# Store the results under names that do not rebind logprob() / perplexity();
# rebinding them would make any later call to those functions fail.
some_logprob = logprob(some_name, some_origin)
some_perplexity = perplexity(some_name, some_origin)
print("\nLangue d'origine de {}: ".format(some_name), some_origin)
print("logprob({}, {}):".format(some_name, some_origin), some_logprob)
print("perplexity({}, {}):".format(some_name, some_origin), some_perplexity)
# The test data is still loaded, but its per-origin dump is no longer printed.
test_names = load_test_names(test_filename)
# Run the evaluation once and pretty-print the per-origin prediction counts.
res = evaluate_models(test_filename, 3)
pprint.pprint(res)