ajout du calcul des tables de fréquences des k grams
This commit is contained in:
parent
90275ef564
commit
6e865dd88d
1 changed files with 21 additions and 1 deletions
|
@ -15,6 +15,8 @@ all_origins = [] # la liste des 18 langues d'origines de noms
|
|||
BOS = "~" # character used to pad the beginning of a name
|
||||
EOS = "!" # character used to pad the end of a name
|
||||
|
||||
kgram_models = {}
|
||||
|
||||
|
||||
def find_files(path):
|
||||
"""Retourne le nom des fichiers contenus dans un répertoire.
|
||||
|
@ -74,6 +76,14 @@ def make_kgram(unigram_word, k):
|
|||
return kgrams
|
||||
|
||||
|
||||
def make_count_dict(kgrams_origin):
    """Count how often each k-gram occurs across a collection of names.

    Args:
        kgrams_origin: Iterable of iterables. Each inner iterable holds the
            k-grams extracted from one name; every k-gram must be hashable
            because it is used as a dictionary key.

    Returns:
        A plain ``dict`` mapping each distinct k-gram to the number of times
        it appears over all names. Keys are in first-seen order. An empty
        input yields an empty dict.
    """
    # Local imports keep this block self-contained regardless of what the
    # top of the file already imports.
    from collections import Counter
    from itertools import chain

    # Flatten the per-name k-gram sequences into one stream and tally it.
    # Converting back to ``dict`` preserves the original return type, so a
    # lookup of an unseen k-gram still raises KeyError instead of returning 0.
    return dict(Counter(chain.from_iterable(kgrams_origin)))
|
||||
|
||||
|
||||
def train_models():
|
||||
load_names()
|
||||
# Vous ajoutez à partir d'ici tout le code dont vous avez besoin
|
||||
|
@ -86,7 +96,17 @@ def train_models():
|
|||
#
|
||||
# Votre code à partir d'ici...
|
||||
|
||||
|
||||
# Construire un tableau de fréquence par langue. Chaque tableau est une entrée dans un dictionnaire
|
||||
for origin in names_by_origin.keys():
|
||||
unigrams = []
|
||||
for word in names_by_origin[origin]:
|
||||
word = normalize_word(word)
|
||||
unigrams.append(make_unigram(word))
|
||||
for k in range(1, 4):
|
||||
kgrams_origin = []
|
||||
for unigram_word in unigrams:
|
||||
kgrams_origin.append(make_kgram(unigram_word, k))
|
||||
kgram_models[(origin, k)] = make_count_dict(kgrams_origin)
|
||||
|
||||
|
||||
def most_probable_origin(name, n=3):
|
||||
|
|
Loading…
Add table
Reference in a new issue