diff --git a/Analyse_Articles.ipynb b/Analyse_Articles.ipynb index 0a5775a..6367c5d 100644 --- a/Analyse_Articles.ipynb +++ b/Analyse_Articles.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -33,177 +33,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
mediapost_idtextner_dictpos_dict
0FIG5dc7ac7f359e2-10157143278136339L'ancien international de football Vikash Dhor...{('Vikash', 'PERSON'): 2, ('Dhorasoo', 'PERSON...{('ancien', 'ADJ'): 3, ('international', 'NOUN...
1FIG5dc7acd0d44b1-10157142962296339Les personnes qui iront manifester dimanche 10...{('10', 'NUMBER'): 2, ('La', 'ORGANIZATION'): ...{('personnes', 'NOUN'): 2, ('iront', 'VERB'): ...
2FIG5dc7adde8bd8e-10157142482251339Selon Jason Farago, la Joconde prend le musée ...{('Jason', 'PERSON'): 8, ('Farago', 'PERSON'):...{('Jason', 'PROPN'): 8, ('Farago', 'PROPN'): 8...
3FIG5dc7ab8df19a0-10157144491741339We're just checking that you want to follow a ...{}{('We', 'PROPN'): 1, ('just', 'PROPN'): 1, ('c...
4FIG5dc7ac188a6d6-10157143773291339Les défections se sont enchaînées, et peu de p...{('Jean-Luc', 'PERSON'): 3, ('Mélenchon', 'PER...{('défections', 'NOUN'): 2, ('enchaînées', 'VE...
5FIG5dc7ac51516dc-10157143472656339We're just checking that you want to follow a ...{}{('We', 'PROPN'): 1, ('just', 'PROPN'): 1, ('c...
6FIG5dc7ab9fe4530-10157144373586339FIGAROVOX/TRIBUNE - Les derniers chiffres offi...{('Claude', 'PERSON'): 2, ('Goasguen', 'PERSON...{('FIGAROVOX', 'PROPN'): 1, ('TRIBUNE', 'NOUN'...
7FIG5dc7ae3950eea-10157141592561339La DGSI est chef de file de la lutte antiterro...{('France', 'LOCATION'): 1, ('1200', 'DATE'): ...{('DGSI', 'PROPN'): 2, ('est', 'VERB'): 2, ('c...
8FIG5dc7ac9063012-10157143218116339Le voyage en Chine est devenu en ce début de X...{('Chine', 'ORGANIZATION'): 1, ('New', 'LOCATI...{('voyage', 'NOUN'): 3, ('Chine', 'PROPN'): 1,...
9FIG5dc7adf1bf8ff-10157142446816339Les nouvelles habitudes de consommation font s...{('Carrefour', 'ORGANIZATION'): 2, ('Auchan', ...{('nouvelles', 'NOUN'): 1, ('habitudes', 'ADJ'...
\n", - "
" - ], - "text/plain": [ - " media post_id \\\n", - "0 FIG 5dc7ac7f359e2-10157143278136339 \n", - "1 FIG 5dc7acd0d44b1-10157142962296339 \n", - "2 FIG 5dc7adde8bd8e-10157142482251339 \n", - "3 FIG 5dc7ab8df19a0-10157144491741339 \n", - "4 FIG 5dc7ac188a6d6-10157143773291339 \n", - "5 FIG 5dc7ac51516dc-10157143472656339 \n", - "6 FIG 5dc7ab9fe4530-10157144373586339 \n", - "7 FIG 5dc7ae3950eea-10157141592561339 \n", - "8 FIG 5dc7ac9063012-10157143218116339 \n", - "9 FIG 5dc7adf1bf8ff-10157142446816339 \n", - "\n", - " text \\\n", - "0 L'ancien international de football Vikash Dhor... \n", - "1 Les personnes qui iront manifester dimanche 10... \n", - "2 Selon Jason Farago, la Joconde prend le musée ... \n", - "3 We're just checking that you want to follow a ... \n", - "4 Les défections se sont enchaînées, et peu de p... \n", - "5 We're just checking that you want to follow a ... \n", - "6 FIGAROVOX/TRIBUNE - Les derniers chiffres offi... \n", - "7 La DGSI est chef de file de la lutte antiterro... \n", - "8 Le voyage en Chine est devenu en ce début de X... \n", - "9 Les nouvelles habitudes de consommation font s... \n", - "\n", - " ner_dict \\\n", - "0 {('Vikash', 'PERSON'): 2, ('Dhorasoo', 'PERSON... \n", - "1 {('10', 'NUMBER'): 2, ('La', 'ORGANIZATION'): ... \n", - "2 {('Jason', 'PERSON'): 8, ('Farago', 'PERSON'):... \n", - "3 {} \n", - "4 {('Jean-Luc', 'PERSON'): 3, ('Mélenchon', 'PER... \n", - "5 {} \n", - "6 {('Claude', 'PERSON'): 2, ('Goasguen', 'PERSON... \n", - "7 {('France', 'LOCATION'): 1, ('1200', 'DATE'): ... \n", - "8 {('Chine', 'ORGANIZATION'): 1, ('New', 'LOCATI... \n", - "9 {('Carrefour', 'ORGANIZATION'): 2, ('Auchan', ... \n", - "\n", - " pos_dict \n", - "0 {('ancien', 'ADJ'): 3, ('international', 'NOUN... \n", - "1 {('personnes', 'NOUN'): 2, ('iront', 'VERB'): ... \n", - "2 {('Jason', 'PROPN'): 8, ('Farago', 'PROPN'): 8... \n", - "3 {('We', 'PROPN'): 1, ('just', 'PROPN'): 1, ('c... \n", - "4 {('défections', 'NOUN'): 2, ('enchaînées', 'VE... 
\n", - "5 {('We', 'PROPN'): 1, ('just', 'PROPN'): 1, ('c... \n", - "6 {('FIGAROVOX', 'PROPN'): 1, ('TRIBUNE', 'NOUN'... \n", - "7 {('DGSI', 'PROPN'): 2, ('est', 'VERB'): 2, ('c... \n", - "8 {('voyage', 'NOUN'): 3, ('Chine', 'PROPN'): 1,... \n", - "9 {('nouvelles', 'NOUN'): 1, ('habitudes', 'ADJ'... " - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "textes_articles_df.head(10)" ] @@ -217,7 +49,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -228,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -237,335 +69,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
comment_idnested_idnameiddatelikescommentmediapost_idlist_namesauteurs_referescomment_cleanner_dictpos_dictemoji_dict
01.00Ycf BullitID: 1000006158663132019-11-09 14:17:130C'est une blague mdr 🤣🤣🤣🤣🤣FIG5dc7ac7f359e2-10157143278136339[Ycf Bullit][]C'est une blague mdr 🤣🤣🤣🤣🤣{}{('est', 'VERB'): 1, ('blague', 'NOUN'): 1, ('...{':rolling_on_the_floor_laughing:': [5, 6, 7]}
12.00Steph AlcazarID: 1000011750772632019-11-09 14:17:340La seule question c'est de savoir s'il fera pl...FIG5dc7ac7f359e2-10157143278136339[Steph Alcazar][]La seule question c'est de savoir s'il fera pl...{}{('seule', 'ADJ'): 1, ('question', 'NOUN'): 1,...{}
23.00Töm MüstäineID: 13658794042019-11-09 14:17:510Romain Debrigode l info du jour qui fait plaiseFIG5dc7ac7f359e2-10157143278136339[Töm Müstäine][]Romain Debrigode l info du jour qui fait plaise{('Romain', 'PERSON'): 1, ('Debrigode', 'PERSO...{('Romain', 'PROPN'): 1, ('Debrigode', 'PROPN'...{}
34.00Pierre CrouzetID: 1000002702920072019-11-09 14:18:060Vasanth Toure 😍FIG5dc7ac7f359e2-10157143278136339[Pierre Crouzet, Vasanth Toure]['Vasanth Toure']😍{}{}{}
44.01Vasanth ToureID: 1000014946078012019-11-09 14:20:570Pierre Crouzet Paris n'est pas prêt encore...FIG5dc7ac7f359e2-10157143278136339[Pierre Crouzet, Vasanth Toure]['Pierre Crouzet']Paris n'est pas prêt encore...{('Paris', 'LOCATION'): 1}{('Paris', 'PROPN'): 1, ('est', 'VERB'): 1, ('...{}
54.02Pierre CrouzetID: 1000002702920072019-11-09 14:26:370Vasanth Toure le prochain c’est Adrien RabiotFIG5dc7ac7f359e2-10157143278136339[Pierre Crouzet, Vasanth Toure]['Vasanth Toure']le prochain c’est Adrien Rabiot{('Adrien', 'PERSON'): 1, ('Rabiot', 'PERSON')...{('prochain', 'ADJ'): 1, ('Adrien', 'PROPN'): ...{}
65.00Stéphane PirnaciID: 1000085413673022019-11-09 14:18:510MdrFIG5dc7ac7f359e2-10157143278136339[Stéphane Pirnaci][]Mdr{}{}{}
76.00Adil BennaniID: 1000064329172922019-11-09 14:19:030moi je propose mamadou sissokoFIG5dc7ac7f359e2-10157143278136339[Adil Bennani][]moi je propose mamadou sissoko{}{('propose', 'VERB'): 1, ('mamadou', 'NOUN'): ...{}
87.00Hadrien De CournonID: 11312905522019-11-09 14:19:090Louis Prt Corentin Corman Victor Mdv ah ouais?FIG5dc7ac7f359e2-10157143278136339[Hadrien De Cournon][]Louis Prt Corentin Corman Victor Mdv ah ouais?{('Louis', 'PERSON'): 1, ('Prt', 'PERSON'): 1,...{('Louis', 'PROPN'): 1, ('Prt', 'PROPN'): 1, (...{}
98.00Marwa LaroseID: 1000225775896112019-11-09 14:19:380Marier le foot à la mairie est génialFIG5dc7ac7f359e2-10157143278136339[Marwa Larose][]Marier le foot à la mairie est génial{('Marier', 'PERSON'): 1}{('Marier', 'VERB'): 1, ('foot', 'NOUN'): 1, (...{}
\n", - "
" - ], - "text/plain": [ - " comment_id nested_id name id \\\n", - "0 1.0 0 Ycf Bullit ID: 100000615866313 \n", - "1 2.0 0 Steph Alcazar ID: 100001175077263 \n", - "2 3.0 0 Töm Müstäine ID: 1365879404 \n", - "3 4.0 0 Pierre Crouzet ID: 100000270292007 \n", - "4 4.0 1 Vasanth Toure ID: 100001494607801 \n", - "5 4.0 2 Pierre Crouzet ID: 100000270292007 \n", - "6 5.0 0 Stéphane Pirnaci ID: 100008541367302 \n", - "7 6.0 0 Adil Bennani ID: 100006432917292 \n", - "8 7.0 0 Hadrien De Cournon ID: 1131290552 \n", - "9 8.0 0 Marwa Larose ID: 100022577589611 \n", - "\n", - " date likes \\\n", - "0 2019-11-09 14:17:13 0 \n", - "1 2019-11-09 14:17:34 0 \n", - "2 2019-11-09 14:17:51 0 \n", - "3 2019-11-09 14:18:06 0 \n", - "4 2019-11-09 14:20:57 0 \n", - "5 2019-11-09 14:26:37 0 \n", - "6 2019-11-09 14:18:51 0 \n", - "7 2019-11-09 14:19:03 0 \n", - "8 2019-11-09 14:19:09 0 \n", - "9 2019-11-09 14:19:38 0 \n", - "\n", - " comment media \\\n", - "0 C'est une blague mdr 🤣🤣🤣🤣🤣 FIG \n", - "1 La seule question c'est de savoir s'il fera pl... FIG \n", - "2 Romain Debrigode l info du jour qui fait plaise FIG \n", - "3 Vasanth Toure 😍 FIG \n", - "4 Pierre Crouzet Paris n'est pas prêt encore... FIG \n", - "5 Vasanth Toure le prochain c’est Adrien Rabiot FIG \n", - "6 Mdr FIG \n", - "7 moi je propose mamadou sissoko FIG \n", - "8 Louis Prt Corentin Corman Victor Mdv ah ouais? 
FIG \n", - "9 Marier le foot à la mairie est génial FIG \n", - "\n", - " post_id list_names \\\n", - "0 5dc7ac7f359e2-10157143278136339 [Ycf Bullit] \n", - "1 5dc7ac7f359e2-10157143278136339 [Steph Alcazar] \n", - "2 5dc7ac7f359e2-10157143278136339 [Töm Müstäine] \n", - "3 5dc7ac7f359e2-10157143278136339 [Pierre Crouzet, Vasanth Toure] \n", - "4 5dc7ac7f359e2-10157143278136339 [Pierre Crouzet, Vasanth Toure] \n", - "5 5dc7ac7f359e2-10157143278136339 [Pierre Crouzet, Vasanth Toure] \n", - "6 5dc7ac7f359e2-10157143278136339 [Stéphane Pirnaci] \n", - "7 5dc7ac7f359e2-10157143278136339 [Adil Bennani] \n", - "8 5dc7ac7f359e2-10157143278136339 [Hadrien De Cournon] \n", - "9 5dc7ac7f359e2-10157143278136339 [Marwa Larose] \n", - "\n", - " auteurs_referes comment_clean \\\n", - "0 [] C'est une blague mdr 🤣🤣🤣🤣🤣 \n", - "1 [] La seule question c'est de savoir s'il fera pl... \n", - "2 [] Romain Debrigode l info du jour qui fait plaise \n", - "3 ['Vasanth Toure'] 😍 \n", - "4 ['Pierre Crouzet'] Paris n'est pas prêt encore... \n", - "5 ['Vasanth Toure'] le prochain c’est Adrien Rabiot \n", - "6 [] Mdr \n", - "7 [] moi je propose mamadou sissoko \n", - "8 [] Louis Prt Corentin Corman Victor Mdv ah ouais? \n", - "9 [] Marier le foot à la mairie est génial \n", - "\n", - " ner_dict \\\n", - "0 {} \n", - "1 {} \n", - "2 {('Romain', 'PERSON'): 1, ('Debrigode', 'PERSO... \n", - "3 {} \n", - "4 {('Paris', 'LOCATION'): 1} \n", - "5 {('Adrien', 'PERSON'): 1, ('Rabiot', 'PERSON')... \n", - "6 {} \n", - "7 {} \n", - "8 {('Louis', 'PERSON'): 1, ('Prt', 'PERSON'): 1,... \n", - "9 {('Marier', 'PERSON'): 1} \n", - "\n", - " pos_dict \\\n", - "0 {('est', 'VERB'): 1, ('blague', 'NOUN'): 1, ('... \n", - "1 {('seule', 'ADJ'): 1, ('question', 'NOUN'): 1,... \n", - "2 {('Romain', 'PROPN'): 1, ('Debrigode', 'PROPN'... \n", - "3 {} \n", - "4 {('Paris', 'PROPN'): 1, ('est', 'VERB'): 1, ('... \n", - "5 {('prochain', 'ADJ'): 1, ('Adrien', 'PROPN'): ... 
\n", - "6 {} \n", - "7 {('propose', 'VERB'): 1, ('mamadou', 'NOUN'): ... \n", - "8 {('Louis', 'PROPN'): 1, ('Prt', 'PROPN'): 1, (... \n", - "9 {('Marier', 'VERB'): 1, ('foot', 'NOUN'): 1, (... \n", - "\n", - " emoji_dict \n", - "0 {':rolling_on_the_floor_laughing:': [5, 6, 7]} \n", - "1 {} \n", - "2 {} \n", - "3 {} \n", - "4 {} \n", - "5 {} \n", - "6 {} \n", - "7 {} \n", - "8 {} \n", - "9 {} " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "commentaires_df.head(10)" ] @@ -586,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -595,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -604,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -620,7 +126,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -629,7 +135,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -638,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -654,7 +160,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -664,37 +170,125 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.13" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "round(nb_comm_emoji/nb_comm,2)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Niveau de langage\n", + "## Nombre de jetons dans 
WordNet\n", + "\n", + "On utilise le POS tag identifié depuis Stanford POS Tagger, puis on le convertit en tag compatible avec WordNet. On recherche ensuite le mot lemmatisé dans WordNet en français, puis on filtre les résultats avec le POS. Ceci permet d'identifier tous les synsets réalistes pour les mots du commentaire." + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from nltk.corpus import wordnet as wn\n", + "from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer\n", + "lemmatizer = FrenchLefffLemmatizer()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Conversion du tag de Stanford POS vers Wordnet POS" + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "def wn_tag_from_ud(tag):\n", + " if tag=='ADJ':\n", + " return wn.ADJ\n", + " if tag=='NOUN':\n", + " return wn.NOUN\n", + " if tag=='VERB':\n", + " return wn.VERB\n", + " if tag=='ADV':\n", + " return wn.ADV\n", + " else:\n", + " return None" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Lemmatisation d'une liste de tokens en français" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def lem_fr(tokens):\n", + " list_tokens = []\n", + " for token in tokens:\n", + " wn_pos = wn_tag_from_ud(token[1])\n", + " if wn_pos is not None:\n", + " lem_token = lemmatizer.lemmatize(token[0],pos=wn_pos)\n", + " else:\n", + " lem_token = lemmatizer.lemmatize(token[0])\n", + " list_tokens.append((lem_token,token[1]))\n", + " return set(list_tokens)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "commentaires_df[\"pos_dict_lem\"] = commentaires_df.apply(lambda x: lem_fr(x[\"pos_dict\"]), axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"### Identification des synsets des tokens en français" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def synsets_fr(tokens):\n", + " list_synsets = []\n", + " for token in tokens:\n", + " wn_pos = wn_tag_from_ud(token[1])\n", + " if wn_pos is not None:\n", + " synset = wn.synsets(token[0], lang='fra', pos=wn_pos)\n", + " list_synsets.append(synset)\n", + " return list_synsets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "commentaires_df[\"synsets\"] = commentaires_df.apply(lambda x: synsets_fr(x[\"pos_dict_lem\"]), axis=1)" + ] }, { "cell_type": "code", diff --git a/NLP-TP3.bib b/NLP-TP3.bib index 189fcee..2c63a9c 100644 --- a/NLP-TP3.bib +++ b/NLP-TP3.bib @@ -17,7 +17,8 @@ publisher = {Lambert-Lucas}, author = {Halté, Pierre}, year = {2018}, - keywords = {Binettes (Informatique), Clavardage, Français (Langue) Analyse du discours, Interjections, Sémiotique et médias sociaux, Symbolisme phonique} + keywords = {Binettes (Informatique), Clavardage, Français (Langue) Analyse du discours, Interjections, Sémiotique et médias sociaux, Symbolisme phonique}, + annote = {Bibliographie : pages 209-216.} } @book{georgalou_discourse_2017, @@ -29,7 +30,8 @@ publisher = {Bloomsbury Academic, an imprint of Bloomsbury Publishing Plc}, author = {Georgalou, Mariza}, year = {2017}, - keywords = {Analyse du discours Aspect social, Analyse du discours Technologie, Facebook (Site Web) Aspect social, Réseaux sociaux (Internet) Aspect social} + keywords = {Analyse du discours Aspect social, Analyse du discours Technologie, Facebook (Site Web) Aspect social, Réseaux sociaux (Internet) Aspect social}, + annote = {Bibliographie : pages 281-305.} } @inproceedings{liebeskind_comment_2018, @@ -102,6 +104,17 @@ title = {Universal {Dependencies} 2.5}, copyright = {Licence Universal Dependencies v2.5}, url = {http://hdl.handle.net/11234/1-3105}, - author = 
{Collective}, + author = {Zeman, Daniel and Nivre, Joakim and Abrams, Mitchell and Aepli, Noëmi and Agić, Željko and Ahrenberg, Lars and Aleksandravičiūtė, Gabrielė and Antonsen, Lene and Aplonova, Katya and Aranzabe, Maria Jesus and Arutie, Gashaw and Asahara, Masayuki and Ateyah, Luma and Attia, Mohammed and Atutxa, Aitziber and Augustinus, Liesbeth and Badmaeva, Elena and Ballesteros, Miguel and Banerjee, Esha and Bank, Sebastian and Barbu Mititelu, Verginica and Basmov, Victoria and Batchelor, Colin and Bauer, John and Bellato, Sandra and Bengoetxea, Kepa and Berzak, Yevgeni and Bhat, Irshad Ahmad and Bhat, Riyaz Ahmad and Biagetti, Erica and Bick, Eckhard and Bielinskienė, Agnė and Blokland, Rogier and Bobicev, Victoria and Boizou, Loïc and Borges Völker, Emanuel and Börstell, Carl and Bosco, Cristina and Bouma, Gosse and Bowman, Sam and Boyd, Adriane and Brokaitė, Kristina and Burchardt, Aljoscha and Candito, Marie and Caron, Bernard and Caron, Gauthier and Cavalcanti, Tatiana and Cebiroğlu Eryiğit, Gülşen and Cecchini, Flavio Massimiliano and Celano, Giuseppe G. A. and Čéplö, Slavomír and Cetin, Savas and Chalub, Fabricio and Choi, Jinho and Cho, Yongseok and Chun, Jayeol and Cignarella, Alessandra T. 
and Cinková, Silvie and Collomb, Aurélie and Çöltekin, Çağrı and Connor, Miriam and Courtin, Marine and Davidson, Elizabeth and de Marneffe, Marie-Catherine and de Paiva, Valeria and de Souza, Elvis and Diaz de Ilarraza, Arantza and Dickerson, Carly and Dione, Bamba and Dirix, Peter and Dobrovoljc, Kaja and Dozat, Timothy and Droganova, Kira and Dwivedi, Puneet and Eckhoff, Hanne and Eli, Marhaba and Elkahky, Ali and Ephrem, Binyam and Erina, Olga and Erjavec, Tomaž and Etienne, Aline and Evelyn, Wograine and Farkas, Richárd and Fernandez Alcalde, Hector and Foster, Jennifer and Freitas, Cláudia and Fujita, Kazunori and Gajdošová, Katarína and Galbraith, Daniel and Garcia, Marcos and Gärdenfors, Moa and Garza, Sebastian and Gerdes, Kim and Ginter, Filip and Goenaga, Iakes and Gojenola, Koldo and Gökırmak, Memduh and Goldberg, Yoav and Gómez Guinovart, Xavier and González Saavedra, Berta and Griciūtė, Bernadeta and Grioni, Matias and Gr{\textbackslash}= uzītis, Normunds and Guillaume, Bruno and Guillot-Barbance, Céline and Habash, Nizar and Hajič, Jan and Hajič jr., Jan and Hämäläinen, Mika and Hà Mỹ, Linh and Han, Na-Rae and Harris, Kim and Haug, Dag and Heinecke, Johannes and Hennig, Felix and Hladká, Barbora and Hlaváčová, Jaroslava and Hociung, Florinel and Hohle, Petter and Hwang, Jena and Ikeda, Takumi and Ion, Radu and Irimia, Elena and Ishola, Ọlájídé and Jelínek, Tomáš and Johannsen, Anders and Jørgensen, Fredrik and Juutinen, Markus and Kaşıkara, Hüner and Kaasen, Andre and Kabaeva, Nadezhda and Kahane, Sylvain and Kanayama, Hiroshi and Kanerva, Jenna and Katz, Boris and Kayadelen, Tolga and Kenney, Jessica and Kettnerová, Václava and Kirchner, Jesse and Klementieva, Elena and Köhn, Arne and Kopacewicz, Kamil and Kotsyba, Natalia and Kovalevskaitė, Jolanta and Krek, Simon and Kwak, Sookyoung and Laippala, Veronika and Lambertino, Lorenzo and Lam, Lucia and Lando, Tatiana and Larasati, Septina Dian and Lavrentiev, Alexei and Lee, John and Lê 
H{\textbackslash}`ông, Phương and Lenci, Alessandro and Lertpradit, Saran and Leung, Herman and Li, Cheuk Ying and Li, Josie and Li, Keying and Lim, KyungTae and Liovina, Maria and Li, Yuan and Ljubešić, Nikola and Loginova, Olga and Lyashevskaya, Olga and Lynn, Teresa and Macketanz, Vivien and Makazhanov, Aibek and Mandl, Michael and Manning, Christopher and Manurung, Ruli and Mărănduc, Cătălina and Mareček, David and Marheinecke, Katrin and Martínez Alonso, Héctor and Martins, André and Mašek, Jan and Matsumoto, Yuji and McDonald, Ryan and McGuinness, Sarah and Mendonça, Gustavo and Miekka, Niko and Misirpashayeva, Margarita and Missilä, Anna and Mititelu, Cătălin and Mitrofan, Maria and Miyao, Yusuke and Montemagni, Simonetta and More, Amir and Moreno Romero, Laura and Mori, Keiko Sophie and Morioka, Tomohiko and Mori, Shinsuke and Moro, Shigeki and Mortensen, Bjartur and Moskalevskyi, Bohdan and Muischnek, Kadri and Munro, Robert and Murawaki, Yugo and Müürisep, Kaili and Nainwani, Pinkey and Navarro Horñiacek, Juan Ignacio and Nedoluzhko, Anna and Nešpore-Bērzkalne, Gunta and Nguy∼ên Thị, Lương and Nguy∼ên Thị Minh, Huy{\textbackslash}`ên and Nikaido, Yoshihiro and Nikolaev, Vitaly and Nitisaroj, Rattima and Nurmi, Hanna and Ojala, Stina and Ojha, Atul Kr. 
and Olúòkun, Adédayọ̀ and Omura, Mai and Osenova, Petya and Östling, Robert and Øvrelid, Lilja and Partanen, Niko and Pascual, Elena and Passarotti, Marco and Patejuk, Agnieszka and Paulino-Passos, Guilherme and Peljak-{\textbackslash}Lapińska, Angelika and Peng, Siyao and Perez, Cenel-Augusto and Perrier, Guy and Petrova, Daria and Petrov, Slav and Phelan, Jason and Piitulainen, Jussi and Pirinen, Tommi A and Pitler, Emily and Plank, Barbara and Poibeau, Thierry and Ponomareva, Larisa and Popel, Martin and Pretkalniņa, Lauma and Prévost, Sophie and Prokopidis, Prokopis and Przepiórkowski, Adam and Puolakainen, Tiina and Pyysalo, Sampo and Qi, Peng and Rääbis, Andriela and Rademaker, Alexandre and Ramasamy, Loganathan and Rama, Taraka and Ramisch, Carlos and Ravishankar, Vinit and Real, Livy and Reddy, Siva and Rehm, Georg and Riabov, Ivan and Rießler, Michael and Rimkutė, Erika and Rinaldi, Larissa and Rituma, Laura and Rocha, Luisa and Romanenko, Mykhailo and Rosa, Rudolf and Rovati, Davide and Roșca, Valentin and Rudina, Olga and Rueter, Jack and Sadde, Shoval and Sagot, Benoît and Saleh, Shadi and Salomoni, Alessio and Samardžić, Tanja and Samson, Stephanie and Sanguinetti, Manuela and Särg, Dage and Saulīte, Baiba and Sawanakunanon, Yanin and Schneider, Nathan and Schuster, Sebastian and Seddah, Djamé and Seeker, Wolfgang and Seraji, Mojgan and Shen, Mo and Shimada, Atsuko and Shirasu, Hiroyuki and Shohibussirri, Muh and Sichinava, Dmitry and Silveira, Aline and Silveira, Natalia and Simi, Maria and Simionescu, Radu and Simkó, Katalin and Šimková, Mária and Simov, Kiril and Smith, Aaron and Soares-Bastos, Isabela and Spadine, Carolyn and Stella, Antonio and Straka, Milan and Strnadová, Jana and Suhr, Alane and Sulubacak, Umut and Suzuki, Shingo and Szántó, Zsolt and Taji, Dima and Takahashi, Yuta and Tamburini, Fabio and Tanaka, Takaaki and Tellier, Isabelle and Thomas, Guillaume and Torga, Liisi and Trosterud, Trond and Trukhina, Anna and Tsarfaty, Reut and 
Tyers, Francis and Uematsu, Sumire and Urešová, Zdeňka and Uria, Larraitz and Uszkoreit, Hans and Utka, Andrius and Vajjala, Sowmya and van Niekerk, Daniel and van Noord, Gertjan and Varga, Viktor and Villemonte de la Clergerie, Eric and Vincze, Veronika and Wallin, Lars and Walsh, Abigail and Wang, Jing Xian and Washington, Jonathan North and Wendt, Maximilan and Williams, Seyi and Wirén, Mats and Wittern, Christian and Woldemariam, Tsegay and Wong, Tak-sum and Wróblewska, Alina and Yako, Mary and Yamazaki, Naoki and Yan, Chunxiao and Yasuoka, Koichi and Yavrumyan, Marat M. and Yu, Zhuoran and Žabokrtský, Zdeněk and Zeldes, Amir and Zhang, Manying and Zhu, Hanzhi}, + year = {2019}, + annote = {LINDAT/CLARIN digital library at the Institute of Formal and Applied Linguistics (ÚFAL), Faculty of Mathematics and Physics, Charles University} +} + +@misc{coulombe_french_2019, + title = {French {LEFFF} {Lemmatizer}}, + url = {https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer/commit/91c2f469bbd317213b8438072496eb1ed696a8e7}, + abstract = {A French Lemmatizer in Python based on the LEFFF (Lexique des Formes Fléchies du Français / Lexicon of French inflected forms) is a large-scale morphological and syntactic lexicon for French. 
A lemmatizer returns the lemma or more simply the dictionary entry of a word. In French, the lemmatization of a verb returns this verb to the infinitive and for the other words, the lemmatization returns this word to the masculine singular.}, + urldate = {2019-12-26}, + author = {Coulombe, Claude}, + month = jul, year = {2019} } \ No newline at end of file diff --git a/README.md b/README.md index 85556de..04d436c 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,13 @@ Installer Anaconda3 pip install newspaper3k pip install emoji +pip install git+https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git + +## Installation des dépendances de nltk + +import nltk +nltk.download('wordnet') +nltk.download('omw') ## Compilation du rapport diff --git a/Traitement commentaires.ipynb b/Traitement commentaires.ipynb index c3b3aa2..e8e533d 100644 --- a/Traitement commentaires.ipynb +++ b/Traitement commentaires.ipynb @@ -295,7 +295,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/rapport.md b/rapport.md index b506eb3..301df3a 100644 --- a/rapport.md +++ b/rapport.md @@ -197,6 +197,8 @@ Intertextualité ## Méthodologie et algorithmes +J'ai effectué la lemmatisation en français à l'aide du French LEFFF Lemmatizer de Claude Coulombe [@coulombe_french_2019], qui est compatible avec la syntaxe utilisée dans la bibliothèque NLTK et les étiquettes POS utilisées dans WordNet. + ## Quelques résultats # Conclusion