lemmatisation et wordnet des jetons des commentaires

2019-12-26 01:32:18 -05:00 · 2019-12-26 01:32:18 -05:00 · 4056b8eeaf
commit 4056b8eeaf
parent 1cb7ad75b5
5 changed files with 144 additions and 528 deletions
--- a/Analyse_Articles.ipynb
+++ b/Analyse_Articles.ipynb
@ -13,7 +13,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -22,7 +22,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -33,177 +33,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>media</th>\n",
-       "      <th>post_id</th>\n",
-       "      <th>text</th>\n",
-       "      <th>ner_dict</th>\n",
-       "      <th>pos_dict</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <td>0</td>\n",
-       "      <td>FIG</td>\n",
-       "      <td>5dc7ac7f359e2-10157143278136339</td>\n",
-       "      <td>L'ancien international de football Vikash Dhor...</td>\n",
-       "      <td>{('Vikash', 'PERSON'): 2, ('Dhorasoo', 'PERSON...</td>\n",
-       "      <td>{('ancien', 'ADJ'): 3, ('international', 'NOUN...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>1</td>\n",
-       "      <td>FIG</td>\n",
-       "      <td>5dc7acd0d44b1-10157142962296339</td>\n",
-       "      <td>Les personnes qui iront manifester dimanche 10...</td>\n",
-       "      <td>{('10', 'NUMBER'): 2, ('La', 'ORGANIZATION'): ...</td>\n",
-       "      <td>{('personnes', 'NOUN'): 2, ('iront', 'VERB'): ...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>2</td>\n",
-       "      <td>FIG</td>\n",
-       "      <td>5dc7adde8bd8e-10157142482251339</td>\n",
-       "      <td>Selon Jason Farago, la Joconde prend le musée ...</td>\n",
-       "      <td>{('Jason', 'PERSON'): 8, ('Farago', 'PERSON'):...</td>\n",
-       "      <td>{('Jason', 'PROPN'): 8, ('Farago', 'PROPN'): 8...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>3</td>\n",
-       "      <td>FIG</td>\n",
-       "      <td>5dc7ab8df19a0-10157144491741339</td>\n",
-       "      <td>We're just checking that you want to follow a ...</td>\n",
-       "      <td>{}</td>\n",
-       "      <td>{('We', 'PROPN'): 1, ('just', 'PROPN'): 1, ('c...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>4</td>\n",
-       "      <td>FIG</td>\n",
-       "      <td>5dc7ac188a6d6-10157143773291339</td>\n",
-       "      <td>Les défections se sont enchaînées, et peu de p...</td>\n",
-       "      <td>{('Jean-Luc', 'PERSON'): 3, ('Mélenchon', 'PER...</td>\n",
-       "      <td>{('défections', 'NOUN'): 2, ('enchaînées', 'VE...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>5</td>\n",
-       "      <td>FIG</td>\n",
-       "      <td>5dc7ac51516dc-10157143472656339</td>\n",
-       "      <td>We're just checking that you want to follow a ...</td>\n",
-       "      <td>{}</td>\n",
-       "      <td>{('We', 'PROPN'): 1, ('just', 'PROPN'): 1, ('c...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>6</td>\n",
-       "      <td>FIG</td>\n",
-       "      <td>5dc7ab9fe4530-10157144373586339</td>\n",
-       "      <td>FIGAROVOX/TRIBUNE - Les derniers chiffres offi...</td>\n",
-       "      <td>{('Claude', 'PERSON'): 2, ('Goasguen', 'PERSON...</td>\n",
-       "      <td>{('FIGAROVOX', 'PROPN'): 1, ('TRIBUNE', 'NOUN'...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>7</td>\n",
-       "      <td>FIG</td>\n",
-       "      <td>5dc7ae3950eea-10157141592561339</td>\n",
-       "      <td>La DGSI est chef de file de la lutte antiterro...</td>\n",
-       "      <td>{('France', 'LOCATION'): 1, ('1200', 'DATE'): ...</td>\n",
-       "      <td>{('DGSI', 'PROPN'): 2, ('est', 'VERB'): 2, ('c...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>8</td>\n",
-       "      <td>FIG</td>\n",
-       "      <td>5dc7ac9063012-10157143218116339</td>\n",
-       "      <td>Le voyage en Chine est devenu en ce début de X...</td>\n",
-       "      <td>{('Chine', 'ORGANIZATION'): 1, ('New', 'LOCATI...</td>\n",
-       "      <td>{('voyage', 'NOUN'): 3, ('Chine', 'PROPN'): 1,...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>9</td>\n",
-       "      <td>FIG</td>\n",
-       "      <td>5dc7adf1bf8ff-10157142446816339</td>\n",
-       "      <td>Les nouvelles habitudes de consommation font s...</td>\n",
-       "      <td>{('Carrefour', 'ORGANIZATION'): 2, ('Auchan', ...</td>\n",
-       "      <td>{('nouvelles', 'NOUN'): 1, ('habitudes', 'ADJ'...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "  media                          post_id  \\\n",
-       "0   FIG  5dc7ac7f359e2-10157143278136339   \n",
-       "1   FIG  5dc7acd0d44b1-10157142962296339   \n",
-       "2   FIG  5dc7adde8bd8e-10157142482251339   \n",
-       "3   FIG  5dc7ab8df19a0-10157144491741339   \n",
-       "4   FIG  5dc7ac188a6d6-10157143773291339   \n",
-       "5   FIG  5dc7ac51516dc-10157143472656339   \n",
-       "6   FIG  5dc7ab9fe4530-10157144373586339   \n",
-       "7   FIG  5dc7ae3950eea-10157141592561339   \n",
-       "8   FIG  5dc7ac9063012-10157143218116339   \n",
-       "9   FIG  5dc7adf1bf8ff-10157142446816339   \n",
-       "\n",
-       "                                                text  \\\n",
-       "0  L'ancien international de football Vikash Dhor...   \n",
-       "1  Les personnes qui iront manifester dimanche 10...   \n",
-       "2  Selon Jason Farago, la Joconde prend le musée ...   \n",
-       "3  We're just checking that you want to follow a ...   \n",
-       "4  Les défections se sont enchaînées, et peu de p...   \n",
-       "5  We're just checking that you want to follow a ...   \n",
-       "6  FIGAROVOX/TRIBUNE - Les derniers chiffres offi...   \n",
-       "7  La DGSI est chef de file de la lutte antiterro...   \n",
-       "8  Le voyage en Chine est devenu en ce début de X...   \n",
-       "9  Les nouvelles habitudes de consommation font s...   \n",
-       "\n",
-       "                                            ner_dict  \\\n",
-       "0  {('Vikash', 'PERSON'): 2, ('Dhorasoo', 'PERSON...   \n",
-       "1  {('10', 'NUMBER'): 2, ('La', 'ORGANIZATION'): ...   \n",
-       "2  {('Jason', 'PERSON'): 8, ('Farago', 'PERSON'):...   \n",
-       "3                                                 {}   \n",
-       "4  {('Jean-Luc', 'PERSON'): 3, ('Mélenchon', 'PER...   \n",
-       "5                                                 {}   \n",
-       "6  {('Claude', 'PERSON'): 2, ('Goasguen', 'PERSON...   \n",
-       "7  {('France', 'LOCATION'): 1, ('1200', 'DATE'): ...   \n",
-       "8  {('Chine', 'ORGANIZATION'): 1, ('New', 'LOCATI...   \n",
-       "9  {('Carrefour', 'ORGANIZATION'): 2, ('Auchan', ...   \n",
-       "\n",
-       "                                            pos_dict  \n",
-       "0  {('ancien', 'ADJ'): 3, ('international', 'NOUN...  \n",
-       "1  {('personnes', 'NOUN'): 2, ('iront', 'VERB'): ...  \n",
-       "2  {('Jason', 'PROPN'): 8, ('Farago', 'PROPN'): 8...  \n",
-       "3  {('We', 'PROPN'): 1, ('just', 'PROPN'): 1, ('c...  \n",
-       "4  {('défections', 'NOUN'): 2, ('enchaînées', 'VE...  \n",
-       "5  {('We', 'PROPN'): 1, ('just', 'PROPN'): 1, ('c...  \n",
-       "6  {('FIGAROVOX', 'PROPN'): 1, ('TRIBUNE', 'NOUN'...  \n",
-       "7  {('DGSI', 'PROPN'): 2, ('est', 'VERB'): 2, ('c...  \n",
-       "8  {('voyage', 'NOUN'): 3, ('Chine', 'PROPN'): 1,...  \n",
-       "9  {('nouvelles', 'NOUN'): 1, ('habitudes', 'ADJ'...  "
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "textes_articles_df.head(10)"
   ]
@ -217,7 +49,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -228,7 +60,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -237,335 +69,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>comment_id</th>\n",
-       "      <th>nested_id</th>\n",
-       "      <th>name</th>\n",
-       "      <th>id</th>\n",
-       "      <th>date</th>\n",
-       "      <th>likes</th>\n",
-       "      <th>comment</th>\n",
-       "      <th>media</th>\n",
-       "      <th>post_id</th>\n",
-       "      <th>list_names</th>\n",
-       "      <th>auteurs_referes</th>\n",
-       "      <th>comment_clean</th>\n",
-       "      <th>ner_dict</th>\n",
-       "      <th>pos_dict</th>\n",
-       "      <th>emoji_dict</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <td>0</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Ycf Bullit</td>\n",
-       "      <td>ID: 100000615866313</td>\n",
-       "      <td>2019-11-09 14:17:13</td>\n",
-       "      <td>0</td>\n",
-       "      <td>C'est une blague mdr 🤣🤣🤣🤣🤣</td>\n",
-       "      <td>FIG</td>\n",
-       "      <td>5dc7ac7f359e2-10157143278136339</td>\n",
-       "      <td>[Ycf Bullit]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>C'est une blague mdr 🤣🤣🤣🤣🤣</td>\n",
-       "      <td>{}</td>\n",
-       "      <td>{('est', 'VERB'): 1, ('blague', 'NOUN'): 1, ('...</td>\n",
-       "      <td>{':rolling_on_the_floor_laughing:': [5, 6, 7]}</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>1</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Steph Alcazar</td>\n",
-       "      <td>ID: 100001175077263</td>\n",
-       "      <td>2019-11-09 14:17:34</td>\n",
-       "      <td>0</td>\n",
-       "      <td>La seule question c'est de savoir s'il fera pl...</td>\n",
-       "      <td>FIG</td>\n",
-       "      <td>5dc7ac7f359e2-10157143278136339</td>\n",
-       "      <td>[Steph Alcazar]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>La seule question c'est de savoir s'il fera pl...</td>\n",
-       "      <td>{}</td>\n",
-       "      <td>{('seule', 'ADJ'): 1, ('question', 'NOUN'): 1,...</td>\n",
-       "      <td>{}</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>2</td>\n",
-       "      <td>3.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Töm Müstäine</td>\n",
-       "      <td>ID: 1365879404</td>\n",
-       "      <td>2019-11-09 14:17:51</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Romain Debrigode l info du jour qui fait plaise</td>\n",
-       "      <td>FIG</td>\n",
-       "      <td>5dc7ac7f359e2-10157143278136339</td>\n",
-       "      <td>[Töm Müstäine]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>Romain Debrigode l info du jour qui fait plaise</td>\n",
-       "      <td>{('Romain', 'PERSON'): 1, ('Debrigode', 'PERSO...</td>\n",
-       "      <td>{('Romain', 'PROPN'): 1, ('Debrigode', 'PROPN'...</td>\n",
-       "      <td>{}</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>3</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Pierre Crouzet</td>\n",
-       "      <td>ID: 100000270292007</td>\n",
-       "      <td>2019-11-09 14:18:06</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Vasanth Toure 😍</td>\n",
-       "      <td>FIG</td>\n",
-       "      <td>5dc7ac7f359e2-10157143278136339</td>\n",
-       "      <td>[Pierre Crouzet, Vasanth Toure]</td>\n",
-       "      <td>['Vasanth Toure']</td>\n",
-       "      <td>😍</td>\n",
-       "      <td>{}</td>\n",
-       "      <td>{}</td>\n",
-       "      <td>{}</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>4</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Vasanth Toure</td>\n",
-       "      <td>ID: 100001494607801</td>\n",
-       "      <td>2019-11-09 14:20:57</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Pierre Crouzet Paris n'est pas prêt encore...</td>\n",
-       "      <td>FIG</td>\n",
-       "      <td>5dc7ac7f359e2-10157143278136339</td>\n",
-       "      <td>[Pierre Crouzet, Vasanth Toure]</td>\n",
-       "      <td>['Pierre Crouzet']</td>\n",
-       "      <td>Paris n'est pas prêt encore...</td>\n",
-       "      <td>{('Paris', 'LOCATION'): 1}</td>\n",
-       "      <td>{('Paris', 'PROPN'): 1, ('est', 'VERB'): 1, ('...</td>\n",
-       "      <td>{}</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>5</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>Pierre Crouzet</td>\n",
-       "      <td>ID: 100000270292007</td>\n",
-       "      <td>2019-11-09 14:26:37</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Vasanth Toure le prochain c’est Adrien Rabiot</td>\n",
-       "      <td>FIG</td>\n",
-       "      <td>5dc7ac7f359e2-10157143278136339</td>\n",
-       "      <td>[Pierre Crouzet, Vasanth Toure]</td>\n",
-       "      <td>['Vasanth Toure']</td>\n",
-       "      <td>le prochain c’est Adrien Rabiot</td>\n",
-       "      <td>{('Adrien', 'PERSON'): 1, ('Rabiot', 'PERSON')...</td>\n",
-       "      <td>{('prochain', 'ADJ'): 1, ('Adrien', 'PROPN'): ...</td>\n",
-       "      <td>{}</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>6</td>\n",
-       "      <td>5.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Stéphane Pirnaci</td>\n",
-       "      <td>ID: 100008541367302</td>\n",
-       "      <td>2019-11-09 14:18:51</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Mdr</td>\n",
-       "      <td>FIG</td>\n",
-       "      <td>5dc7ac7f359e2-10157143278136339</td>\n",
-       "      <td>[Stéphane Pirnaci]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>Mdr</td>\n",
-       "      <td>{}</td>\n",
-       "      <td>{}</td>\n",
-       "      <td>{}</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>7</td>\n",
-       "      <td>6.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Adil Bennani</td>\n",
-       "      <td>ID: 100006432917292</td>\n",
-       "      <td>2019-11-09 14:19:03</td>\n",
-       "      <td>0</td>\n",
-       "      <td>moi je propose mamadou sissoko</td>\n",
-       "      <td>FIG</td>\n",
-       "      <td>5dc7ac7f359e2-10157143278136339</td>\n",
-       "      <td>[Adil Bennani]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>moi je propose mamadou sissoko</td>\n",
-       "      <td>{}</td>\n",
-       "      <td>{('propose', 'VERB'): 1, ('mamadou', 'NOUN'): ...</td>\n",
-       "      <td>{}</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>8</td>\n",
-       "      <td>7.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Hadrien De Cournon</td>\n",
-       "      <td>ID: 1131290552</td>\n",
-       "      <td>2019-11-09 14:19:09</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Louis Prt Corentin Corman Victor Mdv ah ouais?</td>\n",
-       "      <td>FIG</td>\n",
-       "      <td>5dc7ac7f359e2-10157143278136339</td>\n",
-       "      <td>[Hadrien De Cournon]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>Louis Prt Corentin Corman Victor Mdv ah ouais?</td>\n",
-       "      <td>{('Louis', 'PERSON'): 1, ('Prt', 'PERSON'): 1,...</td>\n",
-       "      <td>{('Louis', 'PROPN'): 1, ('Prt', 'PROPN'): 1, (...</td>\n",
-       "      <td>{}</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>9</td>\n",
-       "      <td>8.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Marwa Larose</td>\n",
-       "      <td>ID: 100022577589611</td>\n",
-       "      <td>2019-11-09 14:19:38</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Marier le foot à la mairie est génial</td>\n",
-       "      <td>FIG</td>\n",
-       "      <td>5dc7ac7f359e2-10157143278136339</td>\n",
-       "      <td>[Marwa Larose]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>Marier le foot à la mairie est génial</td>\n",
-       "      <td>{('Marier', 'PERSON'): 1}</td>\n",
-       "      <td>{('Marier', 'VERB'): 1, ('foot', 'NOUN'): 1, (...</td>\n",
-       "      <td>{}</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   comment_id  nested_id                name                   id  \\\n",
-       "0         1.0          0          Ycf Bullit  ID: 100000615866313   \n",
-       "1         2.0          0       Steph Alcazar  ID: 100001175077263   \n",
-       "2         3.0          0        Töm Müstäine       ID: 1365879404   \n",
-       "3         4.0          0      Pierre Crouzet  ID: 100000270292007   \n",
-       "4         4.0          1       Vasanth Toure  ID: 100001494607801   \n",
-       "5         4.0          2      Pierre Crouzet  ID: 100000270292007   \n",
-       "6         5.0          0    Stéphane Pirnaci  ID: 100008541367302   \n",
-       "7         6.0          0        Adil Bennani  ID: 100006432917292   \n",
-       "8         7.0          0  Hadrien De Cournon       ID: 1131290552   \n",
-       "9         8.0          0        Marwa Larose  ID: 100022577589611   \n",
-       "\n",
-       "                  date  likes  \\\n",
-       "0  2019-11-09 14:17:13      0   \n",
-       "1  2019-11-09 14:17:34      0   \n",
-       "2  2019-11-09 14:17:51      0   \n",
-       "3  2019-11-09 14:18:06      0   \n",
-       "4  2019-11-09 14:20:57      0   \n",
-       "5  2019-11-09 14:26:37      0   \n",
-       "6  2019-11-09 14:18:51      0   \n",
-       "7  2019-11-09 14:19:03      0   \n",
-       "8  2019-11-09 14:19:09      0   \n",
-       "9  2019-11-09 14:19:38      0   \n",
-       "\n",
-       "                                             comment media  \\\n",
-       "0                         C'est une blague mdr 🤣🤣🤣🤣🤣   FIG   \n",
-       "1  La seule question c'est de savoir s'il fera pl...   FIG   \n",
-       "2    Romain Debrigode l info du jour qui fait plaise   FIG   \n",
-       "3                                    Vasanth Toure 😍   FIG   \n",
-       "4      Pierre Crouzet Paris n'est pas prêt encore...   FIG   \n",
-       "5      Vasanth Toure le prochain c’est Adrien Rabiot   FIG   \n",
-       "6                                                Mdr   FIG   \n",
-       "7                     moi je propose mamadou sissoko   FIG   \n",
-       "8     Louis Prt Corentin Corman Victor Mdv ah ouais?   FIG   \n",
-       "9              Marier le foot à la mairie est génial   FIG   \n",
-       "\n",
-       "                           post_id                       list_names  \\\n",
-       "0  5dc7ac7f359e2-10157143278136339                     [Ycf Bullit]   \n",
-       "1  5dc7ac7f359e2-10157143278136339                  [Steph Alcazar]   \n",
-       "2  5dc7ac7f359e2-10157143278136339                   [Töm Müstäine]   \n",
-       "3  5dc7ac7f359e2-10157143278136339  [Pierre Crouzet, Vasanth Toure]   \n",
-       "4  5dc7ac7f359e2-10157143278136339  [Pierre Crouzet, Vasanth Toure]   \n",
-       "5  5dc7ac7f359e2-10157143278136339  [Pierre Crouzet, Vasanth Toure]   \n",
-       "6  5dc7ac7f359e2-10157143278136339               [Stéphane Pirnaci]   \n",
-       "7  5dc7ac7f359e2-10157143278136339                   [Adil Bennani]   \n",
-       "8  5dc7ac7f359e2-10157143278136339             [Hadrien De Cournon]   \n",
-       "9  5dc7ac7f359e2-10157143278136339                   [Marwa Larose]   \n",
-       "\n",
-       "      auteurs_referes                                      comment_clean  \\\n",
-       "0                  []                         C'est une blague mdr 🤣🤣🤣🤣🤣   \n",
-       "1                  []  La seule question c'est de savoir s'il fera pl...   \n",
-       "2                  []    Romain Debrigode l info du jour qui fait plaise   \n",
-       "3   ['Vasanth Toure']                                                  😍   \n",
-       "4  ['Pierre Crouzet']                     Paris n'est pas prêt encore...   \n",
-       "5   ['Vasanth Toure']                    le prochain c’est Adrien Rabiot   \n",
-       "6                  []                                                Mdr   \n",
-       "7                  []                     moi je propose mamadou sissoko   \n",
-       "8                  []     Louis Prt Corentin Corman Victor Mdv ah ouais?   \n",
-       "9                  []              Marier le foot à la mairie est génial   \n",
-       "\n",
-       "                                            ner_dict  \\\n",
-       "0                                                 {}   \n",
-       "1                                                 {}   \n",
-       "2  {('Romain', 'PERSON'): 1, ('Debrigode', 'PERSO...   \n",
-       "3                                                 {}   \n",
-       "4                         {('Paris', 'LOCATION'): 1}   \n",
-       "5  {('Adrien', 'PERSON'): 1, ('Rabiot', 'PERSON')...   \n",
-       "6                                                 {}   \n",
-       "7                                                 {}   \n",
-       "8  {('Louis', 'PERSON'): 1, ('Prt', 'PERSON'): 1,...   \n",
-       "9                          {('Marier', 'PERSON'): 1}   \n",
-       "\n",
-       "                                            pos_dict  \\\n",
-       "0  {('est', 'VERB'): 1, ('blague', 'NOUN'): 1, ('...   \n",
-       "1  {('seule', 'ADJ'): 1, ('question', 'NOUN'): 1,...   \n",
-       "2  {('Romain', 'PROPN'): 1, ('Debrigode', 'PROPN'...   \n",
-       "3                                                 {}   \n",
-       "4  {('Paris', 'PROPN'): 1, ('est', 'VERB'): 1, ('...   \n",
-       "5  {('prochain', 'ADJ'): 1, ('Adrien', 'PROPN'): ...   \n",
-       "6                                                 {}   \n",
-       "7  {('propose', 'VERB'): 1, ('mamadou', 'NOUN'): ...   \n",
-       "8  {('Louis', 'PROPN'): 1, ('Prt', 'PROPN'): 1, (...   \n",
-       "9  {('Marier', 'VERB'): 1, ('foot', 'NOUN'): 1, (...   \n",
-       "\n",
-       "                                       emoji_dict  \n",
-       "0  {':rolling_on_the_floor_laughing:': [5, 6, 7]}  \n",
-       "1                                              {}  \n",
-       "2                                              {}  \n",
-       "3                                              {}  \n",
-       "4                                              {}  \n",
-       "5                                              {}  \n",
-       "6                                              {}  \n",
-       "7                                              {}  \n",
-       "8                                              {}  \n",
-       "9                                              {}  "
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "commentaires_df.head(10)"
   ]
@ -586,7 +92,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -595,7 +101,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -604,7 +110,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -620,7 +126,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -629,7 +135,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -638,7 +144,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -654,7 +160,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -664,37 +170,125 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0.13"
-      ]
-     },
-     "execution_count": 33,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "round(nb_comm_emoji/nb_comm,2)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Niveau de langage\n",
+    "## Nombre de jetons dans WordNet\n",
+    "\n",
+    "On utilise le POS tag identifié par le Stanford POS Tagger, puis on le convertit en tag compatible avec WordNet. On recherche ensuite le mot lemmatisé dans WordNet en français, puis on filtre les résultats avec le POS. Ceci permet d'identifier tous les synsets réalistes pour les mots du commentaire."
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
-   "source": []
+   "source": [
+    "from nltk.corpus import wordnet as wn\n",
+    "from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer\n",
+    "lemmatizer = FrenchLefffLemmatizer()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Conversion du tag de Stanford POS vers Wordnet POS"
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
-   "source": []
+   "source": [
+    "def wn_tag_from_ud(tag):\n",
+    "    if tag=='ADJ':\n",
+    "        return wn.ADJ\n",
+    "    if tag=='NOUN':\n",
+    "        return wn.NOUN\n",
+    "    if tag=='VERB':\n",
+    "        return wn.VERB\n",
+    "    if tag=='ADV':\n",
+    "        return wn.ADV\n",
+    "    else:\n",
+    "        return None"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Lemmatisation d'une liste de tokens en français"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def lem_fr(tokens):\n",
+    "    list_tokens = []\n",
+    "    for token in tokens:\n",
+    "        wn_pos = wn_tag_from_ud(token[1])\n",
+    "        if wn_pos is not None:\n",
+    "            lem_token = lemmatizer.lemmatize(token[0],pos=wn_pos)\n",
+    "        else:\n",
+    "            lem_token = lemmatizer.lemmatize(token[0])\n",
+    "        list_tokens.append((lem_token,token[1]))\n",
+    "    return set(list_tokens)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "commentaires_df[\"pos_dict_lem\"] = commentaires_df.apply(lambda x: lem_fr(x[\"pos_dict\"]), axis=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Identification des synsets des tokens en français"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def synsets_fr(tokens):\n",
+    "    list_synsets = []\n",
+    "    for token in tokens:\n",
+    "        wn_pos = wn_tag_from_ud(token[1])\n",
+    "        if wn_pos is not None:\n",
+    "            synset = wn.synsets(token[0], lang='fra', pos=wn_pos)\n",
+    "            list_synsets.append(synset)\n",
+    "    return list_synsets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "commentaires_df[\"synsets\"] = commentaires_df.apply(lambda x: synsets_fr(x[\"pos_dict_lem\"]), axis=1)"
+   ]
  },
  {
   "cell_type": "code",
--- a/NLP-TP3.bib
+++ b/NLP-TP3.bib
--- a/README.md
+++ b/README.md
@ -6,6 +6,13 @@ Installer Anaconda3

 pip install newspaper3k
 pip install emoji
+pip install git+https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git
+
+## Installation des dépendances de NLTK
+
+import nltk
+nltk.download('wordnet')
+nltk.download('omw')

 ## Compilation du rapport

--- a/commentaires.ipynb
+++ b/commentaires.ipynb
@ -295,7 +295,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.7.3"
+   "version": "3.7.4"
  }
 },
 "nbformat": 4,
--- a/rapport.md
+++ b/rapport.md
@ -197,6 +197,8 @@ Intertextualité

 ## Méthodologie et algorithmes

+J'ai effectué la lemmatisation en français à l'aide du French LEFFF Lemmatizer de Claude Coulombe [@coulombe_french_2019], qui est compatible avec la syntaxe utilisée dans la bibliothèque NLTK et les étiquettes POS utilisées dans WordNet.
+
 ## Quelques résultats

 # Conclusion