diff --git a/.gitignore b/.gitignore index 084bba9..758b29e 100644 --- a/.gitignore +++ b/.gitignore @@ -118,3 +118,8 @@ dmypy.json # Pyre type checker .pyre/ +.Rproj.user +.Rhistory +*.Rproj +*.pdf + diff --git a/Analyse_Articles.ipynb b/Analyse_Articles.ipynb index 352f5e5..acf2dbf 100644 --- a/Analyse_Articles.ipynb +++ b/Analyse_Articles.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -22,11 +22,547 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 True\n", + "1 True\n", + "2 True\n", + "3 True\n", + "4 True\n", + "Name: media, dtype: bool" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "textes_articles_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "textes_articles_df" + "f_comm = open(\"pickle/commentaires_df.pickle\",\"rb\")\n", + "commentaires_df = pickle.load(f_comm)\n", + "f_comm.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "commentaires_df = commentaires_df[commentaires_df[\"media\"]!='CNN']" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
comment_idnested_idnameiddatelikescommentmediapost_idlist_namesauteurs_referescomment_cleanner_dictpos_dictemoji_dict
01.00Ycf BullitID: 1000006158663132019-11-09 14:17:130C'est une blague mdr 🤣🤣🤣🤣🤣FIG5dc7ac7f359e2-10157143278136339[Ycf Bullit][]C'est une blague mdr 🤣🤣🤣🤣🤣{}{('est', 'VERB'): 1, ('blague', 'NOUN'): 1, ('...{':rolling_on_the_floor_laughing:': [5, 6, 7]}
12.00Steph AlcazarID: 1000011750772632019-11-09 14:17:340La seule question c'est de savoir s'il fera pl...FIG5dc7ac7f359e2-10157143278136339[Steph Alcazar][]La seule question c'est de savoir s'il fera pl...{}{('seule', 'ADJ'): 1, ('question', 'NOUN'): 1,...{}
23.00Töm MüstäineID: 13658794042019-11-09 14:17:510Romain Debrigode l info du jour qui fait plaiseFIG5dc7ac7f359e2-10157143278136339[Töm Müstäine][]Romain Debrigode l info du jour qui fait plaise{('Romain', 'PERSON'): 1, ('Debrigode', 'PERSO...{('Romain', 'PROPN'): 1, ('Debrigode', 'PROPN'...{}
34.00Pierre CrouzetID: 1000002702920072019-11-09 14:18:060Vasanth Toure 😍FIG5dc7ac7f359e2-10157143278136339[Pierre Crouzet, Vasanth Toure]['Vasanth Toure']😍{}{}{}
44.01Vasanth ToureID: 1000014946078012019-11-09 14:20:570Pierre Crouzet Paris n'est pas prêt encore...FIG5dc7ac7f359e2-10157143278136339[Pierre Crouzet, Vasanth Toure]['Pierre Crouzet']Paris n'est pas prêt encore...{('Paris', 'LOCATION'): 1}{('Paris', 'PROPN'): 1, ('est', 'VERB'): 1, ('...{}
\n", + "
" + ], + "text/plain": [ + " comment_id nested_id name id \\\n", + "0 1.0 0 Ycf Bullit ID: 100000615866313 \n", + "1 2.0 0 Steph Alcazar ID: 100001175077263 \n", + "2 3.0 0 Töm Müstäine ID: 1365879404 \n", + "3 4.0 0 Pierre Crouzet ID: 100000270292007 \n", + "4 4.0 1 Vasanth Toure ID: 100001494607801 \n", + "\n", + " date likes \\\n", + "0 2019-11-09 14:17:13 0 \n", + "1 2019-11-09 14:17:34 0 \n", + "2 2019-11-09 14:17:51 0 \n", + "3 2019-11-09 14:18:06 0 \n", + "4 2019-11-09 14:20:57 0 \n", + "\n", + " comment media \\\n", + "0 C'est une blague mdr 🤣🤣🤣🤣🤣 FIG \n", + "1 La seule question c'est de savoir s'il fera pl... FIG \n", + "2 Romain Debrigode l info du jour qui fait plaise FIG \n", + "3 Vasanth Toure 😍 FIG \n", + "4 Pierre Crouzet Paris n'est pas prêt encore... FIG \n", + "\n", + " post_id list_names \\\n", + "0 5dc7ac7f359e2-10157143278136339 [Ycf Bullit] \n", + "1 5dc7ac7f359e2-10157143278136339 [Steph Alcazar] \n", + "2 5dc7ac7f359e2-10157143278136339 [Töm Müstäine] \n", + "3 5dc7ac7f359e2-10157143278136339 [Pierre Crouzet, Vasanth Toure] \n", + "4 5dc7ac7f359e2-10157143278136339 [Pierre Crouzet, Vasanth Toure] \n", + "\n", + " auteurs_referes comment_clean \\\n", + "0 [] C'est une blague mdr 🤣🤣🤣🤣🤣 \n", + "1 [] La seule question c'est de savoir s'il fera pl... \n", + "2 [] Romain Debrigode l info du jour qui fait plaise \n", + "3 ['Vasanth Toure'] 😍 \n", + "4 ['Pierre Crouzet'] Paris n'est pas prêt encore... \n", + "\n", + " ner_dict \\\n", + "0 {} \n", + "1 {} \n", + "2 {('Romain', 'PERSON'): 1, ('Debrigode', 'PERSO... \n", + "3 {} \n", + "4 {('Paris', 'LOCATION'): 1} \n", + "\n", + " pos_dict \\\n", + "0 {('est', 'VERB'): 1, ('blague', 'NOUN'): 1, ('... \n", + "1 {('seule', 'ADJ'): 1, ('question', 'NOUN'): 1,... \n", + "2 {('Romain', 'PROPN'): 1, ('Debrigode', 'PROPN'... \n", + "3 {} \n", + "4 {('Paris', 'PROPN'): 1, ('est', 'VERB'): 1, ('... 
\n", + "\n", + " emoji_dict \n", + "0 {':rolling_on_the_floor_laughing:': [5, 6, 7]} \n", + "1 {} \n", + "2 {} \n", + "3 {} \n", + "4 {} " + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commentaires_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Description des corpus" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
post_idtextner_dictpos_dict
media
FIG25252525
RC22222222
TVA24242424
\n", + "
" + ], + "text/plain": [ + " post_id text ner_dict pos_dict\n", + "media \n", + "FIG 25 25 25 25\n", + "RC 22 22 22 22\n", + "TVA 24 24 24 24" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "textes_articles_df.groupby(\"media\").count()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
comment_idnested_idnameiddatelikescommentpost_idlist_namesauteurs_referescomment_cleanner_dictpos_dictemoji_dict
media
FIG71557155715571557155715570317155715571557155715571557155
RC39473947394739473947394739053947394739473947394739473947
TVA62626262626262626262626261606262626262626262626262626262
\n", + "
" + ], + "text/plain": [ + " comment_id nested_id name id date likes comment post_id \\\n", + "media \n", + "FIG 7155 7155 7155 7155 7155 7155 7031 7155 \n", + "RC 3947 3947 3947 3947 3947 3947 3905 3947 \n", + "TVA 6262 6262 6262 6262 6262 6262 6160 6262 \n", + "\n", + " list_names auteurs_referes comment_clean ner_dict pos_dict \\\n", + "media \n", + "FIG 7155 7155 7155 7155 7155 \n", + "RC 3947 3947 3947 3947 3947 \n", + "TVA 6262 6262 6262 6262 6262 \n", + "\n", + " emoji_dict \n", + "media \n", + "FIG 7155 \n", + "RC 3947 \n", + "TVA 6262 " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commentaires_df.groupby(\"media\").count()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "17364" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nb_comm = commentaires_df[\"emoji_dict\"].count()\n", + "nb_comm" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2204" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nb_comm_emoji = sum(commentaires_df[\"emoji_dict\"].apply(lambda x: len(x)) == 1)\n", + "nb_comm_emoji" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.12692927896797973" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nb_comm_emoji/nb_comm" ] }, { diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..32e611c --- /dev/null +++ b/Makefile @@ -0,0 +1,2 @@ +build: rapport.md + pandoc --filter=pandoc-citeproc rapport.md -o rapport.pdf diff --git a/NLP-TP3.bib b/NLP-TP3.bib new file mode 100644 index 0000000..bf7b6b6 --- /dev/null +++ b/NLP-TP3.bib @@ -0,0 
+1,82 @@ + +@inproceedings{schultes_leave_2013, + title = {Leave a {Comment}! {An} {In}-{Depth} {Analysis} of {User} {Comments} on {YouTube}}, + abstract = {User comments are the most popular but also extremely controversial form of communication on YouTube. Their public image is very poor; users generally expect that most comments will be of little value or even in thorough- ly bad taste. Nevertheless, heaps of comments continue to be posted every day. We propose an explanation for this contradiction in user attitudes and behaviour based on a new comment classification approach which captures salient aspects of YouTube comments. We show that, based on our new classification, we are able to perform very fast lightweight semantic video analysis. In addition, our results indicate that users' video perceptions (Likes and Dislikes) are indeed in- fluenced by the dispersion of valuable and inferior comments.}, + booktitle = {Wirtschaftsinformatik}, + author = {Schultes, Peter and Dorner, Verena and Lehner, Franz}, + year = {2013}, + keywords = {Video content analysis} +} + +@book{halte_les_2018, + address = {Limoges}, + title = {Les émoticônes et des interjections dans le tchat}, + isbn = {9782359352399 paperback}, + url = {http://ariane.ulaval.ca/cgi-bin/recherche.cgi?qu=a2767912}, + language = {Français}, + publisher = {Lambert-Lucas}, + author = {Halté, Pierre}, + year = {2018}, + keywords = {Binettes (Informatique), Clavardage, Français (Langue) Analyse du discours, Interjections, Sémiotique et médias sociaux, Symbolisme phonique} +} + +@book{georgalou_discourse_2017, + address = {London}, + title = {Discourse and identity on {Facebook}}, + isbn = {9781474289122 hardback alkaline paper}, + url = {http://ariane.ulaval.ca/cgi-bin/recherche.cgi?qu=a2650955}, + language = {Anglais}, + publisher = {Bloomsbury Academic, an imprint of Bloomsbury Publishing Plc}, + author = {Georgalou, Mariza}, + year = {2017}, + keywords = {Analyse du discours Aspect social, Analyse du 
discours Technologie, Facebook (Site Web) Aspect social, Réseaux sociaux (Internet) Aspect social} +} + +@inproceedings{liebeskind_comment_2018, + address = {Cham}, + title = {Comment {Relevance} {Classification} in {Facebook}}, + isbn = {978-3-319-77116-8}, + abstract = {Social posts and their comments are rich and interesting social data. In this study, we aim to classify comments as relevant or irrelevant to the content of their posts. Since the comments in social media are usually short, their bag-of-words (BoW) representations are highly sparse. We investigate four semantic vector representations for the relevance classification task. We investigate different types of large unlabeled data for learning the distributional representations. We also empirically demonstrate that expanding the input of the task to include the post text does not improve the classification performance over using only the comment text. We show that representing the comment in the post space is a cheap and good representation for comment relevance classification.}, + booktitle = {Computational {Linguistics} and {Intelligent} {Text} {Processing}}, + publisher = {Springer International Publishing}, + author = {Liebeskind, Chaya and Liebeskind, Shmuel and HaCohen-Kerner, Yaakov}, + editor = {Gelbukh, Alexander}, + year = {2018}, + pages = {241--254} +} + +@misc{noauthor_exportcomments.com_2019, + title = {exportcomments.com}, + url = {https://exportcomments.com/}, + month = nov, + year = {2019} +} + +@misc{ou-yang_newspaper3k:_2019, + title = {Newspaper3k: {Article} scraping \& curation}, + url = {https://github.com/codelucas/newspaper/}, + author = {Ou-Yang, Lucas}, + year = {2019} +} + +@inproceedings{mckinney_data_2010, + title = {Data {Structures} for {Statistical} {Computing} in {Python}}, + booktitle = {Proceedings of the 9th {Python} in {Science} {Conference}}, + author = {McKinney, Wes}, + editor = {Walt, Stéfan van der and Millman, Jarrod}, + year = {2010}, + pages = {51 -- 56} +} 
+ +@incollection{baxter_discourse-analytic_2010, + title = {Discourse-analytic approaches to text and talk}, + isbn = {978-0-8264-8993-7}, + abstract = {This chapter explores the different ways in which discourse-analytic approaches reveal the ‘meaningfulness’ of text and talk. It reviews four diverse approaches to discourse analysis of particular value for current research in linguistics: Conversation Analysis (CA), Discourse Analysis (DA), Critical Discourse Analysis (CDA) and Feminist Post-structuralist Discourse Analysis (FPDA). Each approach is examined in terms of its background, motivation, key features, and possible strengths and limitations in relation to the field of linguistics. A key way to schematize discourse-analytic methodology is in terms of its relationship between microanalytical approaches, which examine the finer detail of linguistic interactions in transcripts, and macroanalytical approaches, which consider how broader social processes work through language (Heller, 2001). 
This chapter assesses whether there is a strength in a discourse-analytic approach that aligns itself exclusively with either a micro- or macrostrategy, or whether, as Heller suggests, the field needs to fi nd a way of ‘undoing’ the micro–macro dichotomy in order to produce richer, more complex insights within linguistic research.}, + language = {English}, + booktitle = {Research {Methods} in {Linguistics}}, + publisher = {Continuum}, + author = {Baxter, Judith A.}, + editor = {Litosseliti, Lia}, + year = {2010}, + pages = {117--137} +} \ No newline at end of file diff --git a/Traitement Articles.ipynb b/Traitement Articles.ipynb index aa9fba8..4d01720 100644 --- a/Traitement Articles.ipynb +++ b/Traitement Articles.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -47,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 163, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -66,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -75,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": 165, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -85,7 +85,7 @@ }, { "cell_type": "code", - "execution_count": 167, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ diff --git a/commentaires_reseaux_sociaux.mm b/commentaires_reseaux_sociaux.mm index 43cf601..ce736a9 100644 --- a/commentaires_reseaux_sociaux.mm +++ 
b/commentaires_reseaux_sociaux.mm @@ -63,7 +63,7 @@ - + @@ -257,5 +257,43 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/rapport.md b/rapport.md new file mode 100644 index 0000000..724e17a --- /dev/null +++ b/rapport.md @@ -0,0 +1,113 @@ +--- +title: IFT7022 - TP 3 - Commentaires Facebook en lien avec la presse écrite. +subtitle: Revue de littérature et quantification de la pertinence. +author: François Pelletier +date: 16 décembre 2019 +output: + pdf_document: + citation_package: natbib + number_sections: yes + toc: yes +documentclass: "article" +fontsize: 11pt +geometry: margin=1in +bibliography: NLP-TP3.bib +csl: transactions-on-speech-and-language-processing.csl +--- + +\pagebreak + +# Introduction + +# Description des corpus de textes + +Nous analyserons les articles provenant des pages Facebook de trois médias écrits francophones: Le Figaro (France), Radio-Canada (Canada) et TVA Nouvelles (Canada). Pour chacun de ces médias, nous avons respectivement 25, 22 et 24 publications contenant un lien vers un article journalistique. + +Le premier corpus étudié est constitué du texte de chacun des articles qui sont liés dans les publications (l'utilisateur de Facebook devant cliquer sur le lien pour y accéder). Le titre de l'article n'est pas inclus dans ce corpus. + +Le second corpus est constitué d'un ensemble de commentaires publiés par des utilisateurs du réseau social et associés à chacune des publications précédentes. Il y a respectivement 7155, 2947 et 6262 commentaires pour chacun des trois médias écrits. + +Ces deux corpus ont été créées à l'aide des données de commentaires extraites depuis l'application en ligne exportcomments.com @noauthor_exportcomments.com_2019 dans des fichiers XLSX. Les fichiers ont par la suite été utilisés par les programmes Python suivants: + +- `commentaires.ipynb` pour extraire les commentaires depuis les fichiers téléchargés à l'aide de Pandas @mckinney_data_2010. 
+- `textes_articles.ipynb` pour extraire les textes depuis les URL disponibles dans les fichiers, par récupération de données (*web scraping*), en utilisant la librairie Python `newspaper` @ou-yang_newspaper3k:_2019. + +\pagebreak + +# Attributs linguistiques des commentaires sur les réseaux sociaux + +Les commentaires extraits constituent une nouvelle forme de discours, complètement différente des textes formatés et normalisés provenant du domaine journalistique. Il est donc nécessaire de s'y attarder plus longuement avant de poursuivre nos analyses. + +## Analyse du discours + +Selon Baxter [@baxter_discourse-analytic_2010], l'analyse du discours, qui est principalement dérivée de la sociologie, se découpe en quatre composantes principales (p.11): + +- La variabilité du langage (adaptation à l'audience et au contexte) +- La nature du langage (descriptif, narratif, expressif ou humoristique) +- Le répertoire (vocabulaire, grammaire, figures de style) +- Approches macro et micro-analytiques (contextes sociopolitique et psychologie) + +Comme le sens propre de chacun des commentaires est influencé par ces éléments, il sera pertinent de pouvoir les représenter sous forme d'attributs dans un modèle de classification de la pertinence par rapport à l'article en référence. Sinon, le modèle pourrait être biaisé, par exemple, en favorisant les commentaires qui ont un vocabulaire soutenu, davantage descriptifs et sur un ton professionnel, c'est-à-dire similaire au style journalistique. Toutefois, ce dernier pourrait ne pas être davantage en lien avec le contenu de l'article qu'un commentaire humoristique avec un niveau grammatical faible. + +## Sémiotique + +Selon Liebeskind [@liebeskind_comment_2018], les commentaires sur les réseaux sociaux présentent de nouvelles caractéristiques sémiotiques et linguistiques. 
En fait, on parle ici de sémiotique, car le discours n'est plus seulement signifié par des mots, mais aussi par des abréviations, des émojis, des onomatopées, des répétitions de caractères (en particulier la ponctuation). On ajoute une dimension linguistique, car en plus d'un jargon spécifique à ce type de communication, les commentaires sont souvent écrits dans un niveau de langage passant du soutenu au vulgaire dans le même fil de conversation. + +### Emojis et interjections + +Les émojis et les interjections sont une composante essentielle des commentaires retrouvés sur les réseaux sociaux. Plus d'un commentaire sur huit contient un émoji. + +Halté [@halte_les_2018] a étudié en détail le rôle des émoticônes (telles que `:-)`) et des interjections (telles que le fameux *lol*), ainsi que leur normalisation inspirée des caractères japonais nommés pour l'occasion `emojis` (néologisme qui relie l'anglais *emotion* et la racine japonaise *-ji* représentant la notion de symbole). Il précise d'ailleurs que des tests de substitution ou de suppression permettent d'identifier le rôle modalisateur de ces expressions (une sorte de multiplicateur de la polarité ou valence du texte). La portée d'une émoticône, tout comme la portée d'une négation, peut être déterminée en effectuant une analyse syntaxique par relations ou par constituants. Mais, en règle générale, l'auteur remarque que la portée s'étend toujours sur les éléments qui précèdent l'émoticône, ce qui peut parfois limiter la recherche des fragments de la phrase qui en sont affectés (lorsqu'ils ne sont pas à la fin du commentaire). 
+ +### Majuscules et répétitions + +Georgalou [@georgalou_discourse_2017] + +### Ponctuations + +### Impact sur la classification des parties du discours + +La présence de ces nouveaux attributs fait de sorte qu'il ne sera plus possible d'utiliser avec autant de fiabilité les classificateurs de parties du discours (*part of speech*) aussi efficacement, car ils n'ont pas été conçus pour tenir compte de la présence de ces nouveaux éléments dans les phrases. Une approche retenue dans cette analyse pour augmenter la qualité de l'étiquetage est de séparer les émojis du contenu des phrases et de les considérer séparément, tout en conservant un marqueur de leur position dans les phrases. Comme des modèles ne tiennent généralement pas compte de la casse ni de la présence de ponctuation à l'extérieur de la phrase, il n'est pas nécessaire d'apporter d'autres modifications ici. + +## Qualité des commentaires + +### Syntaxique + +### Lexicale + +\pagebreak + +# Entités et parties du discours + +## Lieu et temps + +## Expertise + +## Positionnement + +\pagebreak + +# Relations entre les commentaires + +## Intertextualité + +## Interdiscursivité + +## Multimodalité + +\pagebreak + +# Représentation vectorielle + +\pagebreak + +# Classification de la pertinence + +\pagebreak + +# Conclusion + +\pagebreak + +# Références + diff --git a/transactions-on-speech-and-language-processing.csl b/transactions-on-speech-and-language-processing.csl new file mode 100644 index 0000000..9d0567b --- /dev/null +++ b/transactions-on-speech-and-language-processing.csl @@ -0,0 +1,18 @@ + +