lemmatisation et wordnet des jetons des commentaires

This commit is contained in:
François Pelletier 2019-12-26 01:32:18 -05:00
parent 1cb7ad75b5
commit 4056b8eeaf
5 changed files with 144 additions and 528 deletions

View file

@ -13,7 +13,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -22,7 +22,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -33,177 +33,9 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>media</th>\n",
" <th>post_id</th>\n",
" <th>text</th>\n",
" <th>ner_dict</th>\n",
" <th>pos_dict</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>L'ancien international de football Vikash Dhor...</td>\n",
" <td>{('Vikash', 'PERSON'): 2, ('Dhorasoo', 'PERSON...</td>\n",
" <td>{('ancien', 'ADJ'): 3, ('international', 'NOUN...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>FIG</td>\n",
" <td>5dc7acd0d44b1-10157142962296339</td>\n",
" <td>Les personnes qui iront manifester dimanche 10...</td>\n",
" <td>{('10', 'NUMBER'): 2, ('La', 'ORGANIZATION'): ...</td>\n",
" <td>{('personnes', 'NOUN'): 2, ('iront', 'VERB'): ...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>FIG</td>\n",
" <td>5dc7adde8bd8e-10157142482251339</td>\n",
" <td>Selon Jason Farago, la Joconde prend le musée ...</td>\n",
" <td>{('Jason', 'PERSON'): 8, ('Farago', 'PERSON'):...</td>\n",
" <td>{('Jason', 'PROPN'): 8, ('Farago', 'PROPN'): 8...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ab8df19a0-10157144491741339</td>\n",
" <td>We're just checking that you want to follow a ...</td>\n",
" <td>{}</td>\n",
" <td>{('We', 'PROPN'): 1, ('just', 'PROPN'): 1, ('c...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac188a6d6-10157143773291339</td>\n",
" <td>Les défections se sont enchaînées, et peu de p...</td>\n",
" <td>{('Jean-Luc', 'PERSON'): 3, ('Mélenchon', 'PER...</td>\n",
" <td>{('défections', 'NOUN'): 2, ('enchaînées', 'VE...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac51516dc-10157143472656339</td>\n",
" <td>We're just checking that you want to follow a ...</td>\n",
" <td>{}</td>\n",
" <td>{('We', 'PROPN'): 1, ('just', 'PROPN'): 1, ('c...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ab9fe4530-10157144373586339</td>\n",
" <td>FIGAROVOX/TRIBUNE - Les derniers chiffres offi...</td>\n",
" <td>{('Claude', 'PERSON'): 2, ('Goasguen', 'PERSON...</td>\n",
" <td>{('FIGAROVOX', 'PROPN'): 1, ('TRIBUNE', 'NOUN'...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ae3950eea-10157141592561339</td>\n",
" <td>La DGSI est chef de file de la lutte antiterro...</td>\n",
" <td>{('France', 'LOCATION'): 1, ('1200', 'DATE'): ...</td>\n",
" <td>{('DGSI', 'PROPN'): 2, ('est', 'VERB'): 2, ('c...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac9063012-10157143218116339</td>\n",
" <td>Le voyage en Chine est devenu en ce début de X...</td>\n",
" <td>{('Chine', 'ORGANIZATION'): 1, ('New', 'LOCATI...</td>\n",
" <td>{('voyage', 'NOUN'): 3, ('Chine', 'PROPN'): 1,...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>FIG</td>\n",
" <td>5dc7adf1bf8ff-10157142446816339</td>\n",
" <td>Les nouvelles habitudes de consommation font s...</td>\n",
" <td>{('Carrefour', 'ORGANIZATION'): 2, ('Auchan', ...</td>\n",
" <td>{('nouvelles', 'NOUN'): 1, ('habitudes', 'ADJ'...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" media post_id \\\n",
"0 FIG 5dc7ac7f359e2-10157143278136339 \n",
"1 FIG 5dc7acd0d44b1-10157142962296339 \n",
"2 FIG 5dc7adde8bd8e-10157142482251339 \n",
"3 FIG 5dc7ab8df19a0-10157144491741339 \n",
"4 FIG 5dc7ac188a6d6-10157143773291339 \n",
"5 FIG 5dc7ac51516dc-10157143472656339 \n",
"6 FIG 5dc7ab9fe4530-10157144373586339 \n",
"7 FIG 5dc7ae3950eea-10157141592561339 \n",
"8 FIG 5dc7ac9063012-10157143218116339 \n",
"9 FIG 5dc7adf1bf8ff-10157142446816339 \n",
"\n",
" text \\\n",
"0 L'ancien international de football Vikash Dhor... \n",
"1 Les personnes qui iront manifester dimanche 10... \n",
"2 Selon Jason Farago, la Joconde prend le musée ... \n",
"3 We're just checking that you want to follow a ... \n",
"4 Les défections se sont enchaînées, et peu de p... \n",
"5 We're just checking that you want to follow a ... \n",
"6 FIGAROVOX/TRIBUNE - Les derniers chiffres offi... \n",
"7 La DGSI est chef de file de la lutte antiterro... \n",
"8 Le voyage en Chine est devenu en ce début de X... \n",
"9 Les nouvelles habitudes de consommation font s... \n",
"\n",
" ner_dict \\\n",
"0 {('Vikash', 'PERSON'): 2, ('Dhorasoo', 'PERSON... \n",
"1 {('10', 'NUMBER'): 2, ('La', 'ORGANIZATION'): ... \n",
"2 {('Jason', 'PERSON'): 8, ('Farago', 'PERSON'):... \n",
"3 {} \n",
"4 {('Jean-Luc', 'PERSON'): 3, ('Mélenchon', 'PER... \n",
"5 {} \n",
"6 {('Claude', 'PERSON'): 2, ('Goasguen', 'PERSON... \n",
"7 {('France', 'LOCATION'): 1, ('1200', 'DATE'): ... \n",
"8 {('Chine', 'ORGANIZATION'): 1, ('New', 'LOCATI... \n",
"9 {('Carrefour', 'ORGANIZATION'): 2, ('Auchan', ... \n",
"\n",
" pos_dict \n",
"0 {('ancien', 'ADJ'): 3, ('international', 'NOUN... \n",
"1 {('personnes', 'NOUN'): 2, ('iront', 'VERB'): ... \n",
"2 {('Jason', 'PROPN'): 8, ('Farago', 'PROPN'): 8... \n",
"3 {('We', 'PROPN'): 1, ('just', 'PROPN'): 1, ('c... \n",
"4 {('défections', 'NOUN'): 2, ('enchaînées', 'VE... \n",
"5 {('We', 'PROPN'): 1, ('just', 'PROPN'): 1, ('c... \n",
"6 {('FIGAROVOX', 'PROPN'): 1, ('TRIBUNE', 'NOUN'... \n",
"7 {('DGSI', 'PROPN'): 2, ('est', 'VERB'): 2, ('c... \n",
"8 {('voyage', 'NOUN'): 3, ('Chine', 'PROPN'): 1,... \n",
"9 {('nouvelles', 'NOUN'): 1, ('habitudes', 'ADJ'... "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"textes_articles_df.head(10)"
]
@ -217,7 +49,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -228,7 +60,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -237,335 +69,9 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>comment_id</th>\n",
" <th>nested_id</th>\n",
" <th>name</th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>likes</th>\n",
" <th>comment</th>\n",
" <th>media</th>\n",
" <th>post_id</th>\n",
" <th>list_names</th>\n",
" <th>auteurs_referes</th>\n",
" <th>comment_clean</th>\n",
" <th>ner_dict</th>\n",
" <th>pos_dict</th>\n",
" <th>emoji_dict</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>Ycf Bullit</td>\n",
" <td>ID: 100000615866313</td>\n",
" <td>2019-11-09 14:17:13</td>\n",
" <td>0</td>\n",
" <td>C'est une blague mdr 🤣🤣🤣🤣🤣</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Ycf Bullit]</td>\n",
" <td>[]</td>\n",
" <td>C'est une blague mdr 🤣🤣🤣🤣🤣</td>\n",
" <td>{}</td>\n",
" <td>{('est', 'VERB'): 1, ('blague', 'NOUN'): 1, ('...</td>\n",
" <td>{':rolling_on_the_floor_laughing:': [5, 6, 7]}</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>Steph Alcazar</td>\n",
" <td>ID: 100001175077263</td>\n",
" <td>2019-11-09 14:17:34</td>\n",
" <td>0</td>\n",
" <td>La seule question c'est de savoir s'il fera pl...</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Steph Alcazar]</td>\n",
" <td>[]</td>\n",
" <td>La seule question c'est de savoir s'il fera pl...</td>\n",
" <td>{}</td>\n",
" <td>{('seule', 'ADJ'): 1, ('question', 'NOUN'): 1,...</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>3.0</td>\n",
" <td>0</td>\n",
" <td>Töm Müstäine</td>\n",
" <td>ID: 1365879404</td>\n",
" <td>2019-11-09 14:17:51</td>\n",
" <td>0</td>\n",
" <td>Romain Debrigode l info du jour qui fait plaise</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Töm Müstäine]</td>\n",
" <td>[]</td>\n",
" <td>Romain Debrigode l info du jour qui fait plaise</td>\n",
" <td>{('Romain', 'PERSON'): 1, ('Debrigode', 'PERSO...</td>\n",
" <td>{('Romain', 'PROPN'): 1, ('Debrigode', 'PROPN'...</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>4.0</td>\n",
" <td>0</td>\n",
" <td>Pierre Crouzet</td>\n",
" <td>ID: 100000270292007</td>\n",
" <td>2019-11-09 14:18:06</td>\n",
" <td>0</td>\n",
" <td>Vasanth Toure 😍</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Pierre Crouzet, Vasanth Toure]</td>\n",
" <td>['Vasanth Toure']</td>\n",
" <td>😍</td>\n",
" <td>{}</td>\n",
" <td>{}</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>4.0</td>\n",
" <td>1</td>\n",
" <td>Vasanth Toure</td>\n",
" <td>ID: 100001494607801</td>\n",
" <td>2019-11-09 14:20:57</td>\n",
" <td>0</td>\n",
" <td>Pierre Crouzet Paris n'est pas prêt encore...</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Pierre Crouzet, Vasanth Toure]</td>\n",
" <td>['Pierre Crouzet']</td>\n",
" <td>Paris n'est pas prêt encore...</td>\n",
" <td>{('Paris', 'LOCATION'): 1}</td>\n",
" <td>{('Paris', 'PROPN'): 1, ('est', 'VERB'): 1, ('...</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>4.0</td>\n",
" <td>2</td>\n",
" <td>Pierre Crouzet</td>\n",
" <td>ID: 100000270292007</td>\n",
" <td>2019-11-09 14:26:37</td>\n",
" <td>0</td>\n",
" <td>Vasanth Toure le prochain cest Adrien Rabiot</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Pierre Crouzet, Vasanth Toure]</td>\n",
" <td>['Vasanth Toure']</td>\n",
" <td>le prochain cest Adrien Rabiot</td>\n",
" <td>{('Adrien', 'PERSON'): 1, ('Rabiot', 'PERSON')...</td>\n",
" <td>{('prochain', 'ADJ'): 1, ('Adrien', 'PROPN'): ...</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>5.0</td>\n",
" <td>0</td>\n",
" <td>Stéphane Pirnaci</td>\n",
" <td>ID: 100008541367302</td>\n",
" <td>2019-11-09 14:18:51</td>\n",
" <td>0</td>\n",
" <td>Mdr</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Stéphane Pirnaci]</td>\n",
" <td>[]</td>\n",
" <td>Mdr</td>\n",
" <td>{}</td>\n",
" <td>{}</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>6.0</td>\n",
" <td>0</td>\n",
" <td>Adil Bennani</td>\n",
" <td>ID: 100006432917292</td>\n",
" <td>2019-11-09 14:19:03</td>\n",
" <td>0</td>\n",
" <td>moi je propose mamadou sissoko</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Adil Bennani]</td>\n",
" <td>[]</td>\n",
" <td>moi je propose mamadou sissoko</td>\n",
" <td>{}</td>\n",
" <td>{('propose', 'VERB'): 1, ('mamadou', 'NOUN'): ...</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>7.0</td>\n",
" <td>0</td>\n",
" <td>Hadrien De Cournon</td>\n",
" <td>ID: 1131290552</td>\n",
" <td>2019-11-09 14:19:09</td>\n",
" <td>0</td>\n",
" <td>Louis Prt Corentin Corman Victor Mdv ah ouais?</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Hadrien De Cournon]</td>\n",
" <td>[]</td>\n",
" <td>Louis Prt Corentin Corman Victor Mdv ah ouais?</td>\n",
" <td>{('Louis', 'PERSON'): 1, ('Prt', 'PERSON'): 1,...</td>\n",
" <td>{('Louis', 'PROPN'): 1, ('Prt', 'PROPN'): 1, (...</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>8.0</td>\n",
" <td>0</td>\n",
" <td>Marwa Larose</td>\n",
" <td>ID: 100022577589611</td>\n",
" <td>2019-11-09 14:19:38</td>\n",
" <td>0</td>\n",
" <td>Marier le foot à la mairie est génial</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Marwa Larose]</td>\n",
" <td>[]</td>\n",
" <td>Marier le foot à la mairie est génial</td>\n",
" <td>{('Marier', 'PERSON'): 1}</td>\n",
" <td>{('Marier', 'VERB'): 1, ('foot', 'NOUN'): 1, (...</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" comment_id nested_id name id \\\n",
"0 1.0 0 Ycf Bullit ID: 100000615866313 \n",
"1 2.0 0 Steph Alcazar ID: 100001175077263 \n",
"2 3.0 0 Töm Müstäine ID: 1365879404 \n",
"3 4.0 0 Pierre Crouzet ID: 100000270292007 \n",
"4 4.0 1 Vasanth Toure ID: 100001494607801 \n",
"5 4.0 2 Pierre Crouzet ID: 100000270292007 \n",
"6 5.0 0 Stéphane Pirnaci ID: 100008541367302 \n",
"7 6.0 0 Adil Bennani ID: 100006432917292 \n",
"8 7.0 0 Hadrien De Cournon ID: 1131290552 \n",
"9 8.0 0 Marwa Larose ID: 100022577589611 \n",
"\n",
" date likes \\\n",
"0 2019-11-09 14:17:13 0 \n",
"1 2019-11-09 14:17:34 0 \n",
"2 2019-11-09 14:17:51 0 \n",
"3 2019-11-09 14:18:06 0 \n",
"4 2019-11-09 14:20:57 0 \n",
"5 2019-11-09 14:26:37 0 \n",
"6 2019-11-09 14:18:51 0 \n",
"7 2019-11-09 14:19:03 0 \n",
"8 2019-11-09 14:19:09 0 \n",
"9 2019-11-09 14:19:38 0 \n",
"\n",
" comment media \\\n",
"0 C'est une blague mdr 🤣🤣🤣🤣🤣 FIG \n",
"1 La seule question c'est de savoir s'il fera pl... FIG \n",
"2 Romain Debrigode l info du jour qui fait plaise FIG \n",
"3 Vasanth Toure 😍 FIG \n",
"4 Pierre Crouzet Paris n'est pas prêt encore... FIG \n",
"5 Vasanth Toure le prochain cest Adrien Rabiot FIG \n",
"6 Mdr FIG \n",
"7 moi je propose mamadou sissoko FIG \n",
"8 Louis Prt Corentin Corman Victor Mdv ah ouais? FIG \n",
"9 Marier le foot à la mairie est génial FIG \n",
"\n",
" post_id list_names \\\n",
"0 5dc7ac7f359e2-10157143278136339 [Ycf Bullit] \n",
"1 5dc7ac7f359e2-10157143278136339 [Steph Alcazar] \n",
"2 5dc7ac7f359e2-10157143278136339 [Töm Müstäine] \n",
"3 5dc7ac7f359e2-10157143278136339 [Pierre Crouzet, Vasanth Toure] \n",
"4 5dc7ac7f359e2-10157143278136339 [Pierre Crouzet, Vasanth Toure] \n",
"5 5dc7ac7f359e2-10157143278136339 [Pierre Crouzet, Vasanth Toure] \n",
"6 5dc7ac7f359e2-10157143278136339 [Stéphane Pirnaci] \n",
"7 5dc7ac7f359e2-10157143278136339 [Adil Bennani] \n",
"8 5dc7ac7f359e2-10157143278136339 [Hadrien De Cournon] \n",
"9 5dc7ac7f359e2-10157143278136339 [Marwa Larose] \n",
"\n",
" auteurs_referes comment_clean \\\n",
"0 [] C'est une blague mdr 🤣🤣🤣🤣🤣 \n",
"1 [] La seule question c'est de savoir s'il fera pl... \n",
"2 [] Romain Debrigode l info du jour qui fait plaise \n",
"3 ['Vasanth Toure'] 😍 \n",
"4 ['Pierre Crouzet'] Paris n'est pas prêt encore... \n",
"5 ['Vasanth Toure'] le prochain cest Adrien Rabiot \n",
"6 [] Mdr \n",
"7 [] moi je propose mamadou sissoko \n",
"8 [] Louis Prt Corentin Corman Victor Mdv ah ouais? \n",
"9 [] Marier le foot à la mairie est génial \n",
"\n",
" ner_dict \\\n",
"0 {} \n",
"1 {} \n",
"2 {('Romain', 'PERSON'): 1, ('Debrigode', 'PERSO... \n",
"3 {} \n",
"4 {('Paris', 'LOCATION'): 1} \n",
"5 {('Adrien', 'PERSON'): 1, ('Rabiot', 'PERSON')... \n",
"6 {} \n",
"7 {} \n",
"8 {('Louis', 'PERSON'): 1, ('Prt', 'PERSON'): 1,... \n",
"9 {('Marier', 'PERSON'): 1} \n",
"\n",
" pos_dict \\\n",
"0 {('est', 'VERB'): 1, ('blague', 'NOUN'): 1, ('... \n",
"1 {('seule', 'ADJ'): 1, ('question', 'NOUN'): 1,... \n",
"2 {('Romain', 'PROPN'): 1, ('Debrigode', 'PROPN'... \n",
"3 {} \n",
"4 {('Paris', 'PROPN'): 1, ('est', 'VERB'): 1, ('... \n",
"5 {('prochain', 'ADJ'): 1, ('Adrien', 'PROPN'): ... \n",
"6 {} \n",
"7 {('propose', 'VERB'): 1, ('mamadou', 'NOUN'): ... \n",
"8 {('Louis', 'PROPN'): 1, ('Prt', 'PROPN'): 1, (... \n",
"9 {('Marier', 'VERB'): 1, ('foot', 'NOUN'): 1, (... \n",
"\n",
" emoji_dict \n",
"0 {':rolling_on_the_floor_laughing:': [5, 6, 7]} \n",
"1 {} \n",
"2 {} \n",
"3 {} \n",
"4 {} \n",
"5 {} \n",
"6 {} \n",
"7 {} \n",
"8 {} \n",
"9 {} "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"commentaires_df.head(10)"
]
@ -586,7 +92,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -595,7 +101,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -604,7 +110,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -620,7 +126,7 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -629,7 +135,7 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -638,7 +144,7 @@
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -654,7 +160,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -664,37 +170,125 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.13"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"round(nb_comm_emoji/nb_comm,2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Niveau de langage\n",
"## Nombre de jetons dans WordNet\n",
"\n",
"On utilise le POS tag identifié par le Stanford POS Tagger, puis on le convertit en tag compatible avec WordNet. On recherche ensuite le mot lemmatisé dans WordNet en français, puis on filtre les résultats selon le POS. Ceci permet d'identifier tous les synsets réalistes pour les mots du commentaire."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"from nltk.corpus import wordnet as wn\n",
"from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer\n",
"lemmatizer = FrenchLefffLemmatizer()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Conversion du tag de Stanford POS vers Wordnet POS"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"def wn_tag_from_ud(tag):\n",
"    \"\"\"Convert a Universal Dependencies POS tag to the matching WordNet POS constant.\n",
"\n",
"    Args:\n",
"        tag: POS tag string such as 'ADJ', 'NOUN', 'VERB' or 'ADV'.\n",
"\n",
"    Returns:\n",
"        wn.ADJ, wn.NOUN, wn.VERB or wn.ADV for the four supported tags,\n",
"        None for any other tag.\n",
"    \"\"\"\n",
"    if tag == 'ADJ':\n",
"        return wn.ADJ\n",
"    if tag == 'NOUN':\n",
"        return wn.NOUN\n",
"    if tag == 'VERB':\n",
"        return wn.VERB\n",
"    if tag == 'ADV':\n",
"        return wn.ADV\n",
"    # Any other tag has no WordNet counterpart here.\n",
"    return None"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Lemmatisation d'une liste de tokens en français"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def lem_fr(tokens):\n",
"    \"\"\"Lemmatize French (word, POS tag) pairs.\n",
"\n",
"    Args:\n",
"        tokens: iterable of indexable items where item[0] is the word and\n",
"            item[1] its POS tag. NOTE(review): the caller passes the\n",
"            pos_dict column, so iteration presumably yields its\n",
"            (word, tag) keys - confirm.\n",
"\n",
"    Returns:\n",
"        set of (lemma, original POS tag) pairs; pairs that become identical\n",
"        after lemmatization collapse into a single entry.\n",
"    \"\"\"\n",
"    lem_tokens = set()\n",
"    for token in tokens:\n",
"        wn_pos = wn_tag_from_ud(token[1])\n",
"        if wn_pos is not None:\n",
"            # Pass the WordNet POS to the lemmatizer when the tag maps to one.\n",
"            lem_token = lemmatizer.lemmatize(token[0], pos=wn_pos)\n",
"        else:\n",
"            # No WordNet equivalent: rely on the lemmatizer's default POS.\n",
"            lem_token = lemmatizer.lemmatize(token[0])\n",
"        lem_tokens.add((lem_token, token[1]))\n",
"    return lem_tokens"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Only the pos_dict column is needed, so apply on the Series rather than row-wise on the frame.\n",
"commentaires_df[\"pos_dict_lem\"] = commentaires_df[\"pos_dict\"].apply(lem_fr)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Identification des synsets des tokens en français"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def synsets_fr(tokens):\n",
"    \"\"\"Look up the French WordNet synsets of (lemma, POS tag) pairs.\n",
"\n",
"    Args:\n",
"        tokens: iterable of indexable items where item[0] is the lemma and\n",
"            item[1] its POS tag.\n",
"\n",
"    Returns:\n",
"        list with one entry per token whose tag maps to a WordNet POS; each\n",
"        entry is the result of wn.synsets for that lemma, restricted to that\n",
"        POS. Tokens with no WordNet POS are skipped.\n",
"    \"\"\"\n",
"    list_synsets = []\n",
"    for token in tokens:\n",
"        wn_pos = wn_tag_from_ud(token[1])\n",
"        if wn_pos is not None:\n",
"            synset = wn.synsets(token[0], lang='fra', pos=wn_pos)\n",
"            # Append inside the branch so that synset is always bound when used.\n",
"            list_synsets.append(synset)\n",
"    return list_synsets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Only the pos_dict_lem column is needed, so apply on the Series rather than row-wise on the frame.\n",
"commentaires_df[\"synsets\"] = commentaires_df[\"pos_dict_lem\"].apply(synsets_fr)"
]
},
{
"cell_type": "code",

File diff suppressed because one or more lines are too long

View file

@ -6,6 +6,13 @@ Installer Anaconda3
pip install newspaper3k
pip install emoji
pip install git+https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git
## Installation des dépendances de nltk
import nltk
nltk.download('wordnet')
nltk.download('omw')
## Compilation du rapport

View file

@ -295,7 +295,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.7.4"
}
},
"nbformat": 4,

View file

@ -197,6 +197,8 @@ Intertextualité
## Méthodologie et algorithmes
J'ai effectué la lemmatisation en français à l'aide du French LEFFF Lemmatizer de Claude Coulombe [@coulombe_french_2019], qui est compatible avec la syntaxe utilisée dans la librairie NLTK et les étiquettes POS utilisées dans WordNet.
## Quelques résultats
# Conclusion