diff --git a/.gitignore b/.gitignore index 758b29e..f536467 100644 --- a/.gitignore +++ b/.gitignore @@ -123,3 +123,5 @@ dmypy.json *.Rproj *.pdf +# fichiers latex +*.tex diff --git a/Analyse_Articles.ipynb b/Analyse_Articles.ipynb index d24f248..0a5775a 100644 --- a/Analyse_Articles.ipynb +++ b/Analyse_Articles.ipynb @@ -1,8 +1,19 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Analyse des données pour le rapport\n", + "\n", + "## Lecture des fichiers de données et affichage d'un échantillon de données\n", + "\n", + "### Articles" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -11,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -22,16 +33,191 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mediapost_idtextner_dictpos_dict
0FIG5dc7ac7f359e2-10157143278136339L'ancien international de football Vikash Dhor...{('Vikash', 'PERSON'): 2, ('Dhorasoo', 'PERSON...{('ancien', 'ADJ'): 3, ('international', 'NOUN...
1FIG5dc7acd0d44b1-10157142962296339Les personnes qui iront manifester dimanche 10...{('10', 'NUMBER'): 2, ('La', 'ORGANIZATION'): ...{('personnes', 'NOUN'): 2, ('iront', 'VERB'): ...
2FIG5dc7adde8bd8e-10157142482251339Selon Jason Farago, la Joconde prend le musée ...{('Jason', 'PERSON'): 8, ('Farago', 'PERSON'):...{('Jason', 'PROPN'): 8, ('Farago', 'PROPN'): 8...
3FIG5dc7ab8df19a0-10157144491741339We're just checking that you want to follow a ...{}{('We', 'PROPN'): 1, ('just', 'PROPN'): 1, ('c...
4FIG5dc7ac188a6d6-10157143773291339Les défections se sont enchaînées, et peu de p...{('Jean-Luc', 'PERSON'): 3, ('Mélenchon', 'PER...{('défections', 'NOUN'): 2, ('enchaînées', 'VE...
5FIG5dc7ac51516dc-10157143472656339We're just checking that you want to follow a ...{}{('We', 'PROPN'): 1, ('just', 'PROPN'): 1, ('c...
6FIG5dc7ab9fe4530-10157144373586339FIGAROVOX/TRIBUNE - Les derniers chiffres offi...{('Claude', 'PERSON'): 2, ('Goasguen', 'PERSON...{('FIGAROVOX', 'PROPN'): 1, ('TRIBUNE', 'NOUN'...
7FIG5dc7ae3950eea-10157141592561339La DGSI est chef de file de la lutte antiterro...{('France', 'LOCATION'): 1, ('1200', 'DATE'): ...{('DGSI', 'PROPN'): 2, ('est', 'VERB'): 2, ('c...
8FIG5dc7ac9063012-10157143218116339Le voyage en Chine est devenu en ce début de X...{('Chine', 'ORGANIZATION'): 1, ('New', 'LOCATI...{('voyage', 'NOUN'): 3, ('Chine', 'PROPN'): 1,...
9FIG5dc7adf1bf8ff-10157142446816339Les nouvelles habitudes de consommation font s...{('Carrefour', 'ORGANIZATION'): 2, ('Auchan', ...{('nouvelles', 'NOUN'): 1, ('habitudes', 'ADJ'...
\n", + "
" + ], + "text/plain": [ + " media post_id \\\n", + "0 FIG 5dc7ac7f359e2-10157143278136339 \n", + "1 FIG 5dc7acd0d44b1-10157142962296339 \n", + "2 FIG 5dc7adde8bd8e-10157142482251339 \n", + "3 FIG 5dc7ab8df19a0-10157144491741339 \n", + "4 FIG 5dc7ac188a6d6-10157143773291339 \n", + "5 FIG 5dc7ac51516dc-10157143472656339 \n", + "6 FIG 5dc7ab9fe4530-10157144373586339 \n", + "7 FIG 5dc7ae3950eea-10157141592561339 \n", + "8 FIG 5dc7ac9063012-10157143218116339 \n", + "9 FIG 5dc7adf1bf8ff-10157142446816339 \n", + "\n", + " text \\\n", + "0 L'ancien international de football Vikash Dhor... \n", + "1 Les personnes qui iront manifester dimanche 10... \n", + "2 Selon Jason Farago, la Joconde prend le musée ... \n", + "3 We're just checking that you want to follow a ... \n", + "4 Les défections se sont enchaînées, et peu de p... \n", + "5 We're just checking that you want to follow a ... \n", + "6 FIGAROVOX/TRIBUNE - Les derniers chiffres offi... \n", + "7 La DGSI est chef de file de la lutte antiterro... \n", + "8 Le voyage en Chine est devenu en ce début de X... \n", + "9 Les nouvelles habitudes de consommation font s... \n", + "\n", + " ner_dict \\\n", + "0 {('Vikash', 'PERSON'): 2, ('Dhorasoo', 'PERSON... \n", + "1 {('10', 'NUMBER'): 2, ('La', 'ORGANIZATION'): ... \n", + "2 {('Jason', 'PERSON'): 8, ('Farago', 'PERSON'):... \n", + "3 {} \n", + "4 {('Jean-Luc', 'PERSON'): 3, ('Mélenchon', 'PER... \n", + "5 {} \n", + "6 {('Claude', 'PERSON'): 2, ('Goasguen', 'PERSON... \n", + "7 {('France', 'LOCATION'): 1, ('1200', 'DATE'): ... \n", + "8 {('Chine', 'ORGANIZATION'): 1, ('New', 'LOCATI... \n", + "9 {('Carrefour', 'ORGANIZATION'): 2, ('Auchan', ... \n", + "\n", + " pos_dict \n", + "0 {('ancien', 'ADJ'): 3, ('international', 'NOUN... \n", + "1 {('personnes', 'NOUN'): 2, ('iront', 'VERB'): ... \n", + "2 {('Jason', 'PROPN'): 8, ('Farago', 'PROPN'): 8... \n", + "3 {('We', 'PROPN'): 1, ('just', 'PROPN'): 1, ('c... \n", + "4 {('défections', 'NOUN'): 2, ('enchaînées', 'VE... 
\n", + "5 {('We', 'PROPN'): 1, ('just', 'PROPN'): 1, ('c... \n", + "6 {('FIGAROVOX', 'PROPN'): 1, ('TRIBUNE', 'NOUN'... \n", + "7 {('DGSI', 'PROPN'): 2, ('est', 'VERB'): 2, ('c... \n", + "8 {('voyage', 'NOUN'): 3, ('Chine', 'PROPN'): 1,... \n", + "9 {('nouvelles', 'NOUN'): 1, ('habitudes', 'ADJ'... " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "textes_articles_df.head(10)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Commentaires" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -42,7 +228,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -51,9 +237,335 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
comment_idnested_idnameiddatelikescommentmediapost_idlist_namesauteurs_referescomment_cleanner_dictpos_dictemoji_dict
01.00Ycf BullitID: 1000006158663132019-11-09 14:17:130C'est une blague mdr 🤣🤣🤣🤣🤣FIG5dc7ac7f359e2-10157143278136339[Ycf Bullit][]C'est une blague mdr 🤣🤣🤣🤣🤣{}{('est', 'VERB'): 1, ('blague', 'NOUN'): 1, ('...{':rolling_on_the_floor_laughing:': [5, 6, 7]}
12.00Steph AlcazarID: 1000011750772632019-11-09 14:17:340La seule question c'est de savoir s'il fera pl...FIG5dc7ac7f359e2-10157143278136339[Steph Alcazar][]La seule question c'est de savoir s'il fera pl...{}{('seule', 'ADJ'): 1, ('question', 'NOUN'): 1,...{}
23.00Töm MüstäineID: 13658794042019-11-09 14:17:510Romain Debrigode l info du jour qui fait plaiseFIG5dc7ac7f359e2-10157143278136339[Töm Müstäine][]Romain Debrigode l info du jour qui fait plaise{('Romain', 'PERSON'): 1, ('Debrigode', 'PERSO...{('Romain', 'PROPN'): 1, ('Debrigode', 'PROPN'...{}
34.00Pierre CrouzetID: 1000002702920072019-11-09 14:18:060Vasanth Toure 😍FIG5dc7ac7f359e2-10157143278136339[Pierre Crouzet, Vasanth Toure]['Vasanth Toure']😍{}{}{}
44.01Vasanth ToureID: 1000014946078012019-11-09 14:20:570Pierre Crouzet Paris n'est pas prêt encore...FIG5dc7ac7f359e2-10157143278136339[Pierre Crouzet, Vasanth Toure]['Pierre Crouzet']Paris n'est pas prêt encore...{('Paris', 'LOCATION'): 1}{('Paris', 'PROPN'): 1, ('est', 'VERB'): 1, ('...{}
54.02Pierre CrouzetID: 1000002702920072019-11-09 14:26:370Vasanth Toure le prochain c’est Adrien RabiotFIG5dc7ac7f359e2-10157143278136339[Pierre Crouzet, Vasanth Toure]['Vasanth Toure']le prochain c’est Adrien Rabiot{('Adrien', 'PERSON'): 1, ('Rabiot', 'PERSON')...{('prochain', 'ADJ'): 1, ('Adrien', 'PROPN'): ...{}
65.00Stéphane PirnaciID: 1000085413673022019-11-09 14:18:510MdrFIG5dc7ac7f359e2-10157143278136339[Stéphane Pirnaci][]Mdr{}{}{}
76.00Adil BennaniID: 1000064329172922019-11-09 14:19:030moi je propose mamadou sissokoFIG5dc7ac7f359e2-10157143278136339[Adil Bennani][]moi je propose mamadou sissoko{}{('propose', 'VERB'): 1, ('mamadou', 'NOUN'): ...{}
87.00Hadrien De CournonID: 11312905522019-11-09 14:19:090Louis Prt Corentin Corman Victor Mdv ah ouais?FIG5dc7ac7f359e2-10157143278136339[Hadrien De Cournon][]Louis Prt Corentin Corman Victor Mdv ah ouais?{('Louis', 'PERSON'): 1, ('Prt', 'PERSON'): 1,...{('Louis', 'PROPN'): 1, ('Prt', 'PROPN'): 1, (...{}
98.00Marwa LaroseID: 1000225775896112019-11-09 14:19:380Marier le foot à la mairie est génialFIG5dc7ac7f359e2-10157143278136339[Marwa Larose][]Marier le foot à la mairie est génial{('Marier', 'PERSON'): 1}{('Marier', 'VERB'): 1, ('foot', 'NOUN'): 1, (...{}
\n", + "
" + ], + "text/plain": [ + " comment_id nested_id name id \\\n", + "0 1.0 0 Ycf Bullit ID: 100000615866313 \n", + "1 2.0 0 Steph Alcazar ID: 100001175077263 \n", + "2 3.0 0 Töm Müstäine ID: 1365879404 \n", + "3 4.0 0 Pierre Crouzet ID: 100000270292007 \n", + "4 4.0 1 Vasanth Toure ID: 100001494607801 \n", + "5 4.0 2 Pierre Crouzet ID: 100000270292007 \n", + "6 5.0 0 Stéphane Pirnaci ID: 100008541367302 \n", + "7 6.0 0 Adil Bennani ID: 100006432917292 \n", + "8 7.0 0 Hadrien De Cournon ID: 1131290552 \n", + "9 8.0 0 Marwa Larose ID: 100022577589611 \n", + "\n", + " date likes \\\n", + "0 2019-11-09 14:17:13 0 \n", + "1 2019-11-09 14:17:34 0 \n", + "2 2019-11-09 14:17:51 0 \n", + "3 2019-11-09 14:18:06 0 \n", + "4 2019-11-09 14:20:57 0 \n", + "5 2019-11-09 14:26:37 0 \n", + "6 2019-11-09 14:18:51 0 \n", + "7 2019-11-09 14:19:03 0 \n", + "8 2019-11-09 14:19:09 0 \n", + "9 2019-11-09 14:19:38 0 \n", + "\n", + " comment media \\\n", + "0 C'est une blague mdr 🤣🤣🤣🤣🤣 FIG \n", + "1 La seule question c'est de savoir s'il fera pl... FIG \n", + "2 Romain Debrigode l info du jour qui fait plaise FIG \n", + "3 Vasanth Toure 😍 FIG \n", + "4 Pierre Crouzet Paris n'est pas prêt encore... FIG \n", + "5 Vasanth Toure le prochain c’est Adrien Rabiot FIG \n", + "6 Mdr FIG \n", + "7 moi je propose mamadou sissoko FIG \n", + "8 Louis Prt Corentin Corman Victor Mdv ah ouais? 
FIG \n", + "9 Marier le foot à la mairie est génial FIG \n", + "\n", + " post_id list_names \\\n", + "0 5dc7ac7f359e2-10157143278136339 [Ycf Bullit] \n", + "1 5dc7ac7f359e2-10157143278136339 [Steph Alcazar] \n", + "2 5dc7ac7f359e2-10157143278136339 [Töm Müstäine] \n", + "3 5dc7ac7f359e2-10157143278136339 [Pierre Crouzet, Vasanth Toure] \n", + "4 5dc7ac7f359e2-10157143278136339 [Pierre Crouzet, Vasanth Toure] \n", + "5 5dc7ac7f359e2-10157143278136339 [Pierre Crouzet, Vasanth Toure] \n", + "6 5dc7ac7f359e2-10157143278136339 [Stéphane Pirnaci] \n", + "7 5dc7ac7f359e2-10157143278136339 [Adil Bennani] \n", + "8 5dc7ac7f359e2-10157143278136339 [Hadrien De Cournon] \n", + "9 5dc7ac7f359e2-10157143278136339 [Marwa Larose] \n", + "\n", + " auteurs_referes comment_clean \\\n", + "0 [] C'est une blague mdr 🤣🤣🤣🤣🤣 \n", + "1 [] La seule question c'est de savoir s'il fera pl... \n", + "2 [] Romain Debrigode l info du jour qui fait plaise \n", + "3 ['Vasanth Toure'] 😍 \n", + "4 ['Pierre Crouzet'] Paris n'est pas prêt encore... \n", + "5 ['Vasanth Toure'] le prochain c’est Adrien Rabiot \n", + "6 [] Mdr \n", + "7 [] moi je propose mamadou sissoko \n", + "8 [] Louis Prt Corentin Corman Victor Mdv ah ouais? \n", + "9 [] Marier le foot à la mairie est génial \n", + "\n", + " ner_dict \\\n", + "0 {} \n", + "1 {} \n", + "2 {('Romain', 'PERSON'): 1, ('Debrigode', 'PERSO... \n", + "3 {} \n", + "4 {('Paris', 'LOCATION'): 1} \n", + "5 {('Adrien', 'PERSON'): 1, ('Rabiot', 'PERSON')... \n", + "6 {} \n", + "7 {} \n", + "8 {('Louis', 'PERSON'): 1, ('Prt', 'PERSON'): 1,... \n", + "9 {('Marier', 'PERSON'): 1} \n", + "\n", + " pos_dict \\\n", + "0 {('est', 'VERB'): 1, ('blague', 'NOUN'): 1, ('... \n", + "1 {('seule', 'ADJ'): 1, ('question', 'NOUN'): 1,... \n", + "2 {('Romain', 'PROPN'): 1, ('Debrigode', 'PROPN'... \n", + "3 {} \n", + "4 {('Paris', 'PROPN'): 1, ('est', 'VERB'): 1, ('... \n", + "5 {('prochain', 'ADJ'): 1, ('Adrien', 'PROPN'): ... 
\n", + "6 {} \n", + "7 {('propose', 'VERB'): 1, ('mamadou', 'NOUN'): ... \n", + "8 {('Louis', 'PROPN'): 1, ('Prt', 'PROPN'): 1, (... \n", + "9 {('Marier', 'VERB'): 1, ('foot', 'NOUN'): 1, (... \n", + "\n", + " emoji_dict \n", + "0 {':rolling_on_the_floor_laughing:': [5, 6, 7]} \n", + "1 {} \n", + "2 {} \n", + "3 {} \n", + "4 {} \n", + "5 {} \n", + "6 {} \n", + "7 {} \n", + "8 {} \n", + "9 {} " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "commentaires_df.head(10)" ] @@ -66,31 +578,108 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "textes_articles_df.groupby(\"media\").count()" + "### Nombre d'articles" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ - "commentaires_df.groupby(\"media\").count()" + "decompte_medias = textes_articles_df.groupby(\"media\").count()[[\"post_id\"]]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "decompte_medias.columns = [\"Nombre de publications\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "decompte_medias.to_latex(\"decompte_articles_medias.tex\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Nombre de commentaires total par média" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "decompte_commentaires = commentaires_df.groupby(\"media\").count()[[\"comment_id\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "decompte_commentaires.columns = [\"Nombre de commentaires\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + 
"decompte_commentaires.to_latex(\"decompte_comm_medias.tex\",)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Proportion de commentaires contenant des emojis" + ] + }, + { + "cell_type": "code", + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "nb_comm = commentaires_df[\"emoji_dict\"].count()\n", - "nb_comm" + "nb_comm_emoji = sum(commentaires_df[\"emoji_dict\"].apply(lambda x: len(x)) > 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.13" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "round(nb_comm_emoji/nb_comm,2)" ] }, { @@ -98,19 +687,14 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "nb_comm_emoji = sum(commentaires_df[\"emoji_dict\"].apply(lambda x: len(x)) == 1)\n", - "nb_comm_emoji" - ] + "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "nb_comm_emoji/nb_comm" - ] + "source": [] }, { "cell_type": "code", @@ -136,7 +720,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/Makefile b/Makefile index 32e611c..2623e8a 100644 --- a/Makefile +++ b/Makefile @@ -1,2 +1,2 @@ build: rapport.md - pandoc --filter=pandoc-citeproc rapport.md -o rapport.pdf + pandoc --filter=pandoc-citeproc -f markdown+raw_tex+latex_macros rapport.md -o rapport.pdf diff --git a/README.md b/README.md index 2f42a78..85556de 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,12 @@ # nlp_a2019_tp3 -Projet de fin de session \ No newline at end of file +## Installation des dépendances du projet + +Installer Anaconda3 + +pip install newspaper3k +pip install emoji + +## Compilation du rapport + +make \ No newline at end of file diff --git a/rapport.md b/rapport.md index 973f181..b506eb3 100644 --- 
a/rapport.md +++ b/rapport.md @@ -12,6 +12,12 @@ fontsize: 12pt geometry: margin=1in bibliography: NLP-TP3.bib csl: transactions-on-speech-and-language-processing.csl +fig_caption: yes +header-includes: | + \usepackage{float} + \usepackage{booktabs,siunitx} + \floatplacement{figure}{H} + --- \pagebreak @@ -137,9 +143,21 @@ Selon les observations de Liebeskind et al. [@liebeskind_comment_2018], les prin ## Description des corpus de textes -Nous analyserons les articles provenant des pages Facebook de trois médias écrits francophones : Le Figaro (France), Radio-Canada (Canada) et TVA Nouvelles (Canada). Pour chacun de ces médias, nous avons respectivement 25, 22 et 24 publications contenant un lien vers un article journalistique. +Nous analyserons les articles provenant des pages Facebook de trois médias écrits francophones : Le Figaro (FIG), Radio-Canada (RC) et TVA Nouvelles (TVA). Pour chacun de ces médias, nous avons un ensemble de publications Facebook contenant chacune un lien vers un article journalistique, ainsi qu'un corpus de commentaires extraits de ces publications. -Le premier corpus étudié est constitué du texte de chacun des articles qui sont liés dans les publications (l'utilisateur de Facebook devant cliquer sur le lien pour y accéder). Le titre de l'article n'est pas inclus dans ce corpus. Le second corpus est constitué d'un ensemble de commentaires publiés par des utilisateurs du réseau social et associés à chacune des publications précédentes. Il y a respectivement 7155, 2947 et 6262 commentaires pour chacun des trois médias écrits. +\begin{figure} +\centering +\caption{Décompte des articles par médias} +\input{decompte_articles_medias} +\end{figure} + +Le premier corpus étudié est constitué du texte de chacun des articles qui sont liés dans les publications (l'utilisateur de Facebook devant cliquer sur le lien pour y accéder). Le titre de l'article n'est pas inclus dans ce corpus. 
Le second corpus est constitué d'un ensemble de commentaires publiés par des utilisateurs du réseau social et associés à chacune des publications précédentes. + +\begin{figure} +\centering +\caption{Décompte des commentaires par médias} +\input{decompte_comm_medias} +\end{figure} Ces deux corpus ont été créés à l'aide des données de commentaires extraites depuis l'application en ligne exportcomments.com @noauthor_exportcomments.com_2019 dans des fichiers XLSX. Les fichiers ont par la suite été utilisés par les programmes Python suivants : @@ -174,7 +192,8 @@ Expertise et prise de position Intertextualité -- Réponse à un autre commentaire +- Mention de l'auteur d'un autre commentaire +- Présence de pronoms à la deuxième personne ## Méthodologie et algorithmes diff --git a/textes_articles.ipynb b/textes_articles.ipynb index 677ac6e..d36595e 100644 --- a/textes_articles.ipynb +++ b/textes_articles.ipynb @@ -2,9 +2,22 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'newspaper'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mparsing_functions\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m 
\u001b[0mtime\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/nlp_a2019_tp3/parsing_functions.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0murllib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparse\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0munquote\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mnewspaper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_comments\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'newspaper'" + ] + } + ], "source": [ "import parsing_functions as pf\n", "import re\n", @@ -76,7 +89,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.4" } }, "nbformat": 4,