classification des noms, algorithme fonctionnel
This commit is contained in:
parent
6e865dd88d
commit
71adb32e5a
1 changed files with 82 additions and 12 deletions
|
@ -5,6 +5,10 @@ import os
|
|||
import string
|
||||
import unicodedata
|
||||
import json
|
||||
import math
|
||||
import operator
|
||||
from functools import reduce
|
||||
import pprint
|
||||
|
||||
datafiles = "./data/names/*.txt" # les fichiers pour construire vos modèles
|
||||
test_filename = './data/test-names-t2.txt' # le fichier contenant les données de test pour évaluer vos modèles
|
||||
|
@ -61,7 +65,7 @@ def load_names():
|
|||
def normalize_word(word):
    """Normalize a name before it is turned into n-grams.

    The word is passed through ``unicode_to_ascii`` (defined elsewhere in
    this file), lower-cased, and stripped of every space character.

    Args:
        word: The raw name.

    Returns:
        The normalized name.
    """
    # A single replace is enough: the duplicated call was a no-op.
    return unicode_to_ascii(word).lower().replace(" ", "")
|
||||
|
||||
|
||||
|
@ -113,20 +117,78 @@ def most_probable_origin(name, n=3):
|
|||
# Retourne la langue d'origine la plus probable du nom.
|
||||
# n désigne la longueur des N-grammes. Par ex n=3 --> trigramme
|
||||
# À compléter...
|
||||
return "French" # À modifier
|
||||
|
||||
# Calculer les log-prob et les perplexités
|
||||
log_prob_origin = {}
|
||||
perplexity_origin = {}
|
||||
for origin in names_by_origin.keys():
|
||||
log_prob_origin[origin] = logprob(name, origin, n)
|
||||
perplexity_origin[origin] = perplexity(name, origin, n)
|
||||
|
||||
# Trouver l'optimum de chacun
|
||||
most_probable_origin_prob = max(log_prob_origin.items(), key=lambda k: k[1])[0]
|
||||
most_probable_origin_perp = min(perplexity_origin.items(), key=lambda k: k[1])[0]
|
||||
|
||||
return most_probable_origin_perp
|
||||
|
||||
|
||||
def logprob(name, origin, n=3):
    """Return the base-2 log-probability of ``name`` given ``origin``.

    The name is split into n-grams with ``make_unigram`` / ``make_kgram``
    and scored against the counts stored in ``kgram_models`` using add-one
    (Laplace) smoothing::

        P(kgram) = (C(kgram) + 1) / (C(context) + V)

    For ``n == 1`` the context count is the total number of unigram
    occurrences; otherwise it is the count of the (n-1)-gram made of all
    but the last symbol of the n-gram.

    Args:
        name: The name to score.
        origin: Key of the language model to score against.
        n: Length of the n-grams (e.g. 3 for trigrams).

    Returns:
        The sum of the base-2 logarithms of the smoothed n-gram
        probabilities (0 when the name yields no n-gram).

    Raises:
        KeyError: If no model exists for ``(origin, n)`` (or for
            ``(origin, n - 1)`` when ``n > 1``).
    """
    kgrams = make_kgram(make_unigram(name), n)
    k_counts = kgram_models[(origin, n)]
    # Smoothing vocabulary size. NOTE(review): 26 presumably stands for the
    # lowercase ASCII letters -- confirm whether any padding symbols added
    # by make_kgram should be counted as well.
    vocab_size = 26

    if n == 1:
        total = sum(k_counts.values())

        def context_count(_kgram):
            # Unigrams have no history: normalize by the corpus size.
            return total
    else:
        history_counts = kgram_models[(origin, n - 1)]

        def context_count(kgram):
            # The history is the n-gram without its last symbol.
            return history_counts.get(kgram[:-1], 0)

    total_logprob = 0
    for kgram in kgrams:
        count = k_counts.get(kgram, 0)
        # Base-2 logarithm, as required by the assignment statement.
        total_logprob += math.log2(
            (count + 1) / (context_count(kgram) + vocab_size)
        )
    return total_logprob
|
||||
|
||||
|
||||
def prod(iterable):
    """Return the product of every item in ``iterable``.

    The running product starts at 1, so an empty iterable yields 1.
    """
    product = 1
    for factor in iterable:
        product = product * factor
    return product
|
||||
|
||||
|
||||
def perplexity(name, origin, n=3):
    """Return the perplexity of ``name`` under the ``origin`` n-gram model.

    Each n-gram of the name gets an add-one (Laplace) smoothed probability
    ``(C(kgram) + 1) / (C(context) + V)`` from the counts stored in
    ``kgram_models``. The perplexity is the geometric mean of the inverse
    probabilities. It is evaluated in log space, as
    ``2 ** (-mean(log2 P))``, which is mathematically the same value as
    ``(prod(1 / P)) ** (1 / N)`` but cannot overflow on long inputs.

    Args:
        name: The name to score.
        origin: Key of the language model to score against.
        n: Length of the n-grams (e.g. 3 for trigrams).

    Returns:
        The perplexity as a float; ``math.inf`` when the name yields no
        n-gram at all (there is nothing to average over).

    Raises:
        KeyError: If no model exists for ``(origin, n)`` (or for
            ``(origin, n - 1)`` when ``n > 1``).
    """
    kgrams = make_kgram(make_unigram(name), n)
    k_counts = kgram_models[(origin, n)]
    # Smoothing vocabulary size. NOTE(review): 26 presumably stands for the
    # lowercase ASCII letters -- confirm whether any padding symbols added
    # by make_kgram should be counted as well.
    vocab_size = 26

    if n == 1:
        total = sum(k_counts.values())

        def context_count(_kgram):
            # Unigrams have no history: normalize by the corpus size.
            return total
    else:
        history_counts = kgram_models[(origin, n - 1)]

        def context_count(kgram):
            # The history is the n-gram without its last symbol.
            return history_counts.get(kgram[:-1], 0)

    kgram_total = len(kgrams)
    if kgram_total == 0:
        # Avoid dividing by zero: an unscorable name is maximally perplexing.
        return math.inf

    log2_sum = 0.0
    for kgram in kgrams:
        count = k_counts.get(kgram, 0)
        log2_sum += math.log2(
            (count + 1) / (context_count(kgram) + vocab_size)
        )
    return 2.0 ** (-log2_sum / kgram_total)
|
||||
|
||||
|
||||
def load_test_names(filename):
|
||||
|
@ -144,6 +206,13 @@ def evaluate_models(filename, n=3):
|
|||
# À compléter - Fonction pour l'évaluation des modèles N-grammes.
|
||||
# ...
|
||||
|
||||
results = {}
|
||||
for org, name_list in test_data.items():
|
||||
results[org] = {}
|
||||
for name in name_list:
|
||||
results[org][most_probable_origin(name)] = results[org].get(most_probable_origin(name), 0) + 1
|
||||
return results
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Vous pouvez modifier cette section comme bon vous semble
|
||||
|
@ -156,14 +225,15 @@ if __name__ == '__main__':
|
|||
train_models()
|
||||
some_name = "Lamontagne"
|
||||
some_origin = most_probable_origin(some_name)
|
||||
logprob = logprob(some_name, some_origin)
|
||||
perplexity = perplexity(some_name, some_origin)
|
||||
some_logprob = logprob(some_name, some_origin)
|
||||
some_perplexity = perplexity(some_name, some_origin)
|
||||
print("\nLangue d'origine de {}: ".format(some_name), some_origin)
|
||||
print("logprob({}, {}):".format(some_name, some_origin), logprob)
|
||||
print("perplexity({}, {}):".format(some_name, some_origin), perplexity)
|
||||
print("logprob({}, {}):".format(some_name, some_origin), some_logprob)
|
||||
print("perplexity({}, {}):".format(some_name, some_origin), some_perplexity)
|
||||
|
||||
test_names = load_test_names(test_filename)
|
||||
print("\nLes données pour tester vos modèles sont:")
|
||||
for org, name_list in test_names.items():
|
||||
print("\t", org, name_list)
|
||||
evaluate_models(test_filename, 3)
|
||||
# print("\nLes données pour tester vos modèles sont:")
|
||||
# for org, name_list in test_names.items():
|
||||
# print("\t", org, name_list)
|
||||
res = evaluate_models(test_filename, 3)
|
||||
pprint.pprint(res)
|
||||
|
|
Loading…
Add table
Reference in a new issue