from collections import Counter
from itertools import chain

# Frequency tables of the trained models. train_models() stores one table per
# (origin, k) key, each table being the result of make_count_dict().
kgram_models = {}


def make_count_dict(kgrams_origin):
    """Count how often each k-gram occurs across a collection of names.

    Args:
        kgrams_origin: An iterable with one entry per name, each entry being
            an iterable of that name's k-grams. The k-grams are used as
            dictionary keys, so they must be hashable.

    Returns:
        A plain ``dict`` mapping each distinct k-gram to its total number of
        occurrences over all names. An empty input, or an input containing
        only empty entries, yields an empty dict.
    """
    # Flatten the per-name k-gram sequences into one stream and let Counter do
    # the tallying. Convert back to a plain dict so that looking up an unseen
    # k-gram still raises KeyError (a Counter would silently return 0).
    return dict(Counter(chain.from_iterable(kgrams_origin)))