add beginning of the report
This commit is contained in:
parent c60ee17705
commit 9787250ec0
8 changed files with 807 additions and 13 deletions
5  .gitignore  vendored
@@ -118,3 +118,8 @@ dmypy.json
# Pyre type checker
.pyre/

.Rproj.user
.Rhistory
*.Rproj
*.pdf
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -11,7 +11,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -22,11 +22,547 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 True\n",
"1 True\n",
"2 True\n",
"3 True\n",
"4 True\n",
"Name: media, dtype: bool"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"textes_articles_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"textes_articles_df"
"f_comm = open(\"pickle/commentaires_df.pickle\",\"rb\")\n",
"commentaires_df = pickle.load(f_comm)\n",
"f_comm.close()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"commentaires_df = commentaires_df[commentaires_df[\"media\"]!='CNN']"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" comment_id nested_id name id \\\n",
"0 1.0 0 Ycf Bullit ID: 100000615866313 \n",
"1 2.0 0 Steph Alcazar ID: 100001175077263 \n",
"2 3.0 0 Töm Müstäine ID: 1365879404 \n",
"3 4.0 0 Pierre Crouzet ID: 100000270292007 \n",
"4 4.0 1 Vasanth Toure ID: 100001494607801 \n",
"\n",
" date likes \\\n",
"0 2019-11-09 14:17:13 0 \n",
"1 2019-11-09 14:17:34 0 \n",
"2 2019-11-09 14:17:51 0 \n",
"3 2019-11-09 14:18:06 0 \n",
"4 2019-11-09 14:20:57 0 \n",
"\n",
" comment media \\\n",
"0 C'est une blague mdr 🤣🤣🤣🤣🤣 FIG \n",
"1 La seule question c'est de savoir s'il fera pl... FIG \n",
"2 Romain Debrigode l info du jour qui fait plaise FIG \n",
"3 Vasanth Toure 😍 FIG \n",
"4 Pierre Crouzet Paris n'est pas prêt encore... FIG \n",
"\n",
" post_id list_names \\\n",
"0 5dc7ac7f359e2-10157143278136339 [Ycf Bullit] \n",
"1 5dc7ac7f359e2-10157143278136339 [Steph Alcazar] \n",
"2 5dc7ac7f359e2-10157143278136339 [Töm Müstäine] \n",
"3 5dc7ac7f359e2-10157143278136339 [Pierre Crouzet, Vasanth Toure] \n",
"4 5dc7ac7f359e2-10157143278136339 [Pierre Crouzet, Vasanth Toure] \n",
"\n",
" auteurs_referes comment_clean \\\n",
"0 [] C'est une blague mdr 🤣🤣🤣🤣🤣 \n",
"1 [] La seule question c'est de savoir s'il fera pl... \n",
"2 [] Romain Debrigode l info du jour qui fait plaise \n",
"3 ['Vasanth Toure'] 😍 \n",
"4 ['Pierre Crouzet'] Paris n'est pas prêt encore... \n",
"\n",
" ner_dict \\\n",
"0 {} \n",
"1 {} \n",
"2 {('Romain', 'PERSON'): 1, ('Debrigode', 'PERSO... \n",
"3 {} \n",
"4 {('Paris', 'LOCATION'): 1} \n",
"\n",
" pos_dict \\\n",
"0 {('est', 'VERB'): 1, ('blague', 'NOUN'): 1, ('... \n",
"1 {('seule', 'ADJ'): 1, ('question', 'NOUN'): 1,... \n",
"2 {('Romain', 'PROPN'): 1, ('Debrigode', 'PROPN'... \n",
"3 {} \n",
"4 {('Paris', 'PROPN'): 1, ('est', 'VERB'): 1, ('... \n",
"\n",
" emoji_dict \n",
"0 {':rolling_on_the_floor_laughing:': [5, 6, 7]} \n",
"1 {} \n",
"2 {} \n",
"3 {} \n",
"4 {} "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"commentaires_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Description of the corpora"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" post_id text ner_dict pos_dict\n",
"media \n",
"FIG 25 25 25 25\n",
"RC 22 22 22 22\n",
"TVA 24 24 24 24"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"textes_articles_df.groupby(\"media\").count()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" comment_id nested_id name id date likes comment post_id \\\n",
"media \n",
"FIG 7155 7155 7155 7155 7155 7155 7031 7155 \n",
"RC 3947 3947 3947 3947 3947 3947 3905 3947 \n",
"TVA 6262 6262 6262 6262 6262 6262 6160 6262 \n",
"\n",
" list_names auteurs_referes comment_clean ner_dict pos_dict \\\n",
"media \n",
"FIG 7155 7155 7155 7155 7155 \n",
"RC 3947 3947 3947 3947 3947 \n",
"TVA 6262 6262 6262 6262 6262 \n",
"\n",
" emoji_dict \n",
"media \n",
"FIG 7155 \n",
"RC 3947 \n",
"TVA 6262 "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"commentaires_df.groupby(\"media\").count()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"17364"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nb_comm = commentaires_df[\"emoji_dict\"].count()\n",
"nb_comm"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2204"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nb_comm_emoji = sum(commentaires_df[\"emoji_dict\"].apply(lambda x: len(x)) == 1)\n",
"nb_comm_emoji"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.12692927896797973"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nb_comm_emoji/nb_comm"
]
},
{
2  Makefile  Normal file
@@ -0,0 +1,2 @@
build: rapport.md
	pandoc --filter=pandoc-citeproc rapport.md -o rapport.pdf
82  NLP-TP3.bib  Normal file
@@ -0,0 +1,82 @@

@inproceedings{schultes_leave_2013,
title = {Leave a {Comment}! {An} {In}-{Depth} {Analysis} of {User} {Comments} on {YouTube}},
abstract = {User comments are the most popular but also extremely controversial form of communication on YouTube. Their public image is very poor; users generally expect that most comments will be of little value or even in thoroughly bad taste. Nevertheless, heaps of comments continue to be posted every day. We propose an explanation for this contradiction in user attitudes and behaviour based on a new comment classification approach which captures salient aspects of YouTube comments. We show that, based on our new classification, we are able to perform very fast lightweight semantic video analysis. In addition, our results indicate that users' video perceptions (Likes and Dislikes) are indeed influenced by the dispersion of valuable and inferior comments.},
booktitle = {Wirtschaftsinformatik},
author = {Schultes, Peter and Dorner, Verena and Lehner, Franz},
year = {2013},
keywords = {Video content analysis}
}

@book{halte_les_2018,
address = {Limoges},
title = {Les émoticônes et des interjections dans le tchat},
isbn = {9782359352399 paperback},
url = {http://ariane.ulaval.ca/cgi-bin/recherche.cgi?qu=a2767912},
language = {Français},
publisher = {Lambert-Lucas},
author = {Halté, Pierre},
year = {2018},
keywords = {Binettes (Informatique), Clavardage, Français (Langue) Analyse du discours, Interjections, Sémiotique et médias sociaux, Symbolisme phonique}
}

@book{georgalou_discourse_2017,
address = {London},
title = {Discourse and identity on {Facebook}},
isbn = {9781474289122 hardback alkaline paper},
url = {http://ariane.ulaval.ca/cgi-bin/recherche.cgi?qu=a2650955},
language = {Anglais},
publisher = {Bloomsbury Academic, an imprint of Bloomsbury Publishing Plc},
author = {Georgalou, Mariza},
year = {2017},
keywords = {Analyse du discours Aspect social, Analyse du discours Technologie, Facebook (Site Web) Aspect social, Réseaux sociaux (Internet) Aspect social}
}

@inproceedings{liebeskind_comment_2018,
address = {Cham},
title = {Comment {Relevance} {Classification} in {Facebook}},
isbn = {978-3-319-77116-8},
abstract = {Social posts and their comments are rich and interesting social data. In this study, we aim to classify comments as relevant or irrelevant to the content of their posts. Since the comments in social media are usually short, their bag-of-words (BoW) representations are highly sparse. We investigate four semantic vector representations for the relevance classification task. We investigate different types of large unlabeled data for learning the distributional representations. We also empirically demonstrate that expanding the input of the task to include the post text does not improve the classification performance over using only the comment text. We show that representing the comment in the post space is a cheap and good representation for comment relevance classification.},
booktitle = {Computational {Linguistics} and {Intelligent} {Text} {Processing}},
publisher = {Springer International Publishing},
author = {Liebeskind, Chaya and Liebeskind, Shmuel and HaCohen-Kerner, Yaakov},
editor = {Gelbukh, Alexander},
year = {2018},
pages = {241--254}
}

@misc{noauthor_exportcomments.com_2019,
title = {exportcomments.com},
url = {https://exportcomments.com/},
month = nov,
year = {2019}
}

@misc{ou-yang_newspaper3k:_2019,
title = {Newspaper3k: {Article} scraping \& curation},
url = {https://github.com/codelucas/newspaper/},
author = {Ou-Yang, Lucas},
year = {2019}
}

@inproceedings{mckinney_data_2010,
title = {Data {Structures} for {Statistical} {Computing} in {Python}},
booktitle = {Proceedings of the 9th {Python} in {Science} {Conference}},
author = {McKinney, Wes},
editor = {Walt, Stéfan van der and Millman, Jarrod},
year = {2010},
pages = {51 -- 56}
}

@incollection{baxter_discourse-analytic_2010,
title = {Discourse-analytic approaches to text and talk},
isbn = {978-0-8264-8993-7},
abstract = {This chapter explores the different ways in which discourse-analytic approaches reveal the ‘meaningfulness’ of text and talk. It reviews four diverse approaches to discourse analysis of particular value for current research in linguistics: Conversation Analysis (CA), Discourse Analysis (DA), Critical Discourse Analysis (CDA) and Feminist Post-structuralist Discourse Analysis (FPDA). Each approach is examined in terms of its background, motivation, key features, and possible strengths and limitations in relation to the field of linguistics. A key way to schematize discourse-analytic methodology is in terms of its relationship between microanalytical approaches, which examine the finer detail of linguistic interactions in transcripts, and macroanalytical approaches, which consider how broader social processes work through language (Heller, 2001). This chapter assesses whether there is a strength in a discourse-analytic approach that aligns itself exclusively with either a micro- or macrostrategy, or whether, as Heller suggests, the field needs to find a way of ‘undoing’ the micro–macro dichotomy in order to produce richer, more complex insights within linguistic research.},
language = {English},
booktitle = {Research {Methods} in {Linguistics}},
publisher = {Continuum},
author = {Baxter, Judith A.},
editor = {Litosseliti, Lia},
year = {2010},
pages = {117--137}
}
@@ -19,7 +19,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -28,7 +28,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -37,7 +37,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -47,7 +47,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -57,7 +57,7 @@
},
{
"cell_type": "code",
"execution_count": 163,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -66,7 +66,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -75,7 +75,7 @@
},
{
"cell_type": "code",
"execution_count": 165,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -85,7 +85,7 @@
},
{
"cell_type": "code",
"execution_count": 167,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -63,7 +63,7 @@
</stylenode>
</map_styles>
</hook>
<hook NAME="AutomaticEdgeColor" COUNTER="6" RULE="ON_BRANCH_CREATION"/>
<hook NAME="AutomaticEdgeColor" COUNTER="7" RULE="ON_BRANCH_CREATION"/>
<node TEXT="Schultes" POSITION="left" ID="ID_1889975585" CREATED="1573955304008" MODIFIED="1573955306512">
<edge COLOR="#00ff00"/>
<node TEXT="YouTube classification" ID="ID_960774191" CREATED="1573954869370" MODIFIED="1573955309655">
@@ -257,5 +257,43 @@
</node>
</node>
</node>
<node TEXT="Liebeskind - Comment relevance" POSITION="right" ID="ID_459551622" CREATED="1576549993953" MODIFIED="1576550001728">
<edge COLOR="#00007c"/>
<node TEXT="Linguistic attributes" FOLDED="true" ID="ID_1357799857" CREATED="1576550002709" MODIFIED="1576550036378">
<node TEXT="Abbreviations" ID="ID_1817531479" CREATED="1576550015732" MODIFIED="1576550021276"/>
<node TEXT="Emojis" ID="ID_1973703164" CREATED="1576550021670" MODIFIED="1576550023922"/>
<node TEXT="Onomatopoeia" ID="ID_360315962" CREATED="1576550024857" MODIFIED="1576550028567"/>
<node TEXT="Repeated characters" ID="ID_1251269639" CREATED="1576550043313" MODIFIED="1576550050759"/>
<node TEXT="Jargon/vulgar words" ID="ID_856373733" CREATED="1576550055819" MODIFIED="1576550063652"/>
</node>
<node TEXT="Vector representation" ID="ID_314620017" CREATED="1576550137691" MODIFIED="1576550151050">
<node TEXT="Very sparse" ID="ID_1413528711" CREATED="1576550151061" MODIFIED="1576550153561"/>
<node TEXT="Dimensionality reduction" ID="ID_1692921709" CREATED="1576550153928" MODIFIED="1576550355226">
<node TEXT="LSA" ID="ID_19890854" CREATED="1576550356027" MODIFIED="1576550357759"/>
<node TEXT="LDA" ID="ID_1303319269" CREATED="1576550358194" MODIFIED="1576550359395"/>
<node TEXT="RP (Johnson Lindenstrauss)" ID="ID_915849384" CREATED="1576550361325" MODIFIED="1576550382511"/>
<node TEXT="Word embeddings" ID="ID_126383761" CREATED="1576550382936" MODIFIED="1576550388492"/>
</node>
<node TEXT="Unsupervised approach" ID="ID_925106378" CREATED="1576550202218" MODIFIED="1576550207474">
<node TEXT="A single language model shared by post and comment" ID="ID_286501849" CREATED="1576550463452" MODIFIED="1576550574551"/>
<node TEXT="Two separate language models" ID="ID_575232992" CREATED="1576550574882" MODIFIED="1576550581829"/>
</node>
<node TEXT="Short, dense vectors (a few hundred dimensions)" ID="ID_1513850178" CREATED="1576550467908" MODIFIED="1576550501753">
<node TEXT="Learned without supervision from a large corpus" ID="ID_494554263" CREATED="1576550515290" MODIFIED="1576550527480"/>
</node>
</node>
<node TEXT="Classification problem" ID="ID_230161465" CREATED="1576550316368" MODIFIED="1576550325580"/>
<node TEXT="Sources of error" ID="ID_1807623398" CREATED="1576550614388" MODIFIED="1576550620184">
<node TEXT="Greetings and vulgar words" ID="ID_1397948632" CREATED="1576550620195" MODIFIED="1576550630390"/>
<node TEXT="Latent/implicit context" ID="ID_127722742" CREATED="1576550630861" MODIFIED="1576550646635"/>
<node TEXT="Sarcasm" ID="ID_463544414" CREATED="1576550647657" MODIFIED="1576550858846"/>
</node>
<node TEXT="Types of comments classified as irrelevant" ID="ID_511556244" CREATED="1576550859750" MODIFIED="1576550893739">
<node TEXT="Implicit reference" ID="ID_1528238540" CREATED="1576550864053" MODIFIED="1576550872172"/>
<node TEXT="Greetings" ID="ID_1829732133" CREATED="1576550873672" MODIFIED="1576550884728"/>
<node TEXT="Expression of agreement" ID="ID_1787160278" CREATED="1576550899353" MODIFIED="1576550913600"/>
<node TEXT="Reference to the posted media" ID="ID_375827810" CREATED="1576550925702" MODIFIED="1576550934220"/>
</node>
</node>
</node>
</map>
113  rapport.md  Normal file
@@ -0,0 +1,113 @@
---
title: IFT7022 - TP 3 - Facebook comments related to the written press.
subtitle: Literature review and quantification of relevance.
author: François Pelletier
date: December 16, 2019
output:
  pdf_document:
    citation_package: natbib
    number_sections: yes
    toc: yes
documentclass: "article"
fontsize: 11pt
geometry: margin=1in
bibliography: NLP-TP3.bib
csl: transactions-on-speech-and-language-processing.csl
---

\pagebreak

# Introduction

# Description of the text corpora

We analyze the articles published on the Facebook pages of three French-language written media outlets: Le Figaro (France), Radio-Canada (Canada) and TVA Nouvelles (Canada). For each of these outlets we have, respectively, 25, 22 and 24 posts containing a link to a journalistic article.

The first corpus consists of the text of each article linked in these posts (the Facebook user has to click the link to reach it). The article titles are not included in this corpus.

The second corpus consists of the comments posted by users of the social network on each of these posts. There are respectively 7155, 3947 and 6262 comments for the three media outlets.

Both corpora were built from the comment data exported from the online application exportcomments.com @noauthor_exportcomments.com_2019 as XLSX files. These files were then processed by the following Python programs (a minimal sketch of both steps follows the list):

- `commentaires.ipynb`, which extracts the comments from the downloaded files using Pandas @mckinney_data_2010.
- `textes_articles.ipynb`, which retrieves the article texts from the URLs contained in the files through web scraping, using the Python library `newspaper` @ou-yang_newspaper3k:_2019.

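To make the pipeline concrete, here is a minimal sketch of these two steps. The file path and the `url` column name are hypothetical placeholders; the actual extraction code lives in `commentaires.ipynb` and `textes_articles.ipynb`.

```python
# Sketch only: placeholder paths and column names, not the notebooks' exact code.
import pandas as pd
from newspaper import Article

# Step 1: load one exportcomments.com XLSX export into a dataframe.
comments_df = pd.read_excel("exports/figaro_post.xlsx")  # hypothetical path

# Step 2: scrape the text of a linked article with newspaper3k.
def fetch_article_text(url: str) -> str:
    """Download and parse one article, returning its main body text."""
    article = Article(url, language="fr")
    article.download()
    article.parse()
    return article.text

# Hypothetical 'url' column holding the article links found in the export.
article_texts = [fetch_article_text(u) for u in comments_df["url"].dropna().unique()]
```
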
\pagebreak

# Linguistic attributes of comments on social networks

The extracted comments are a new form of discourse, completely different from the formatted and standardized texts produced by the journalistic world. It is therefore necessary to examine them more closely before continuing our analyses.

## Discourse analysis

According to Baxter [@baxter_discourse-analytic_2010], discourse analysis, which derives mainly from sociology, breaks down into four main components (p. 11):

- Language variability (adaptation to the audience and the context)
- The nature of the language (descriptive, narrative, expressive or humorous)
- The repertoire (vocabulary, grammar, figures of speech)
- Macro- and micro-analytical approaches (sociopolitical context and psychology)

Since the meaning of each comment is shaped by these elements, it will be useful to represent them as attributes in a model that classifies relevance with respect to the referenced article. Otherwise the model could be biased, for instance by favouring comments written in a formal register, more descriptive and with a professional tone, that is, close to the journalistic style. Such a comment is not necessarily more related to the content of the article than a humorous comment with weak grammar; a possible way of combining both kinds of features is sketched below.

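The following sketch shows one way these style attributes could sit alongside lexical features in a relevance classifier. The column names, the style features and the choice of model are assumptions made for illustration; no such classifier is trained at this stage of the report.

```python
# Illustrative only: assumed column names and an assumed model choice.
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

features = ColumnTransformer([
    # Lexical content of the cleaned comment text.
    ("text", TfidfVectorizer(), "comment_clean"),
    # Hypothetical style attributes (register, tone, repetition flags, ...).
    ("style", "passthrough", ["all_caps_words", "has_repeated_chars"]),
])

relevance_clf = Pipeline([
    ("features", features),
    ("model", LogisticRegression(max_iter=1000)),
])
# relevance_clf.fit(comments_df, labels) would require manually labelled relevance data.
```
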
## Semiotics

According to Liebeskind [@liebeskind_comment_2018], comments on social networks display new semiotic and linguistic characteristics. We speak of semiotics here because the discourse is no longer carried by words alone, but also by abbreviations, emojis, onomatopoeia and repeated characters (punctuation in particular). A linguistic dimension is added as well: beyond a jargon specific to this type of communication, comments in a single thread often swing between formal and vulgar registers. A few simple indicators of these surface attributes are illustrated after this paragraph.

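The heuristics below flag some of these attributes on a raw comment. The abbreviation list and the regular expressions are illustrative assumptions, not the feature set used later in the project.

```python
# Illustrative heuristics; the abbreviation list is an assumed, non-exhaustive sample.
import re

ABBREVIATIONS = {"mdr", "lol", "ptdr", "svp"}

def surface_features(comment: str) -> dict:
    """Flag a few semiotic and register-related attributes of a comment."""
    tokens = [tok.strip(".,!?") for tok in comment.lower().split()]
    return {
        "has_abbreviation": any(tok in ABBREVIATIONS for tok in tokens),
        "has_repeated_chars": bool(re.search(r"(.)\1{2,}", comment)),  # "loool", "!!!", emoji runs
        "has_repeated_punct": bool(re.search(r"[!?.]{2,}", comment)),
        "all_caps_words": sum(1 for tok in comment.split() if tok.isupper() and len(tok) > 2),
    }

print(surface_features("C'est une blague mdr 🤣🤣🤣🤣🤣"))
# {'has_abbreviation': True, 'has_repeated_chars': True, 'has_repeated_punct': False, 'all_caps_words': 0}
```
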
### Emojis and interjections

Emojis and interjections are an essential component of the comments found on social networks: more than one comment in eight contains an emoji.

Halté [@halte_les_2018] studied in detail the role of emoticons (such as `:-)`) and interjections (such as the famous *lol*), as well as their standardization into *emojis*, inspired by Japanese characters (from *e*, picture, and *moji*, character). He notes that substitution and deletion tests reveal the modalizing role of these expressions (a kind of multiplier of the polarity, or valence, of the text). The scope of an emoticon, like the scope of a negation, can be determined through dependency or constituency parsing. As a general rule, however, the author observes that the scope extends over the elements that precede the emoticon, which can limit the search for the affected sentence fragments when the emoticon is not at the end of the comment. A naive version of this heuristic follows.

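Under the assumption that an emoji's scope covers the preceding segment of the comment, a minimal heuristic could look like this (purely illustrative, not the method used by Halté):

```python
# Assumed heuristic: the scope of an emoji is the run of tokens before it,
# stopped at the previous sentence-final punctuation.
import re

def emoji_scope(tokens, emoji_position):
    """Return the tokens preceding the emoji, stopping at punctuation."""
    scope = []
    for tok in reversed(tokens[:emoji_position]):
        if re.fullmatch(r"[.!?;]+", tok):
            break
        scope.insert(0, tok)
    return scope

tokens = "C'est une blague mdr 🤣".split()
print(emoji_scope(tokens, 4))  # ["C'est", 'une', 'blague', 'mdr']
```
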
### Capital letters and repetitions

Georgalou [@georgalou_discourse_2017]

### Punctuation

### Impact on part-of-speech tagging

The presence of these new attributes means that part-of-speech taggers can no longer be used as reliably, since they were not designed to handle such elements inside sentences. The approach taken in this analysis to improve tagging quality is to separate the emojis from the sentence content and to process them on their own, while keeping a marker of their position in the sentence, as in the sketch below. Since the models generally take no account of case or of punctuation outside the sentence, no further modifications are needed here.

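A minimal sketch of this separation step is given below. It assumes a simple whitespace tokenization and the third-party `emoji` package; the exact implementation in the notebooks may differ.

```python
# Sketch only: whitespace tokenization and the `emoji` package are assumptions.
from collections import defaultdict
import emoji

def split_emojis(comment):
    """Return the comment without emojis plus {emoji name: [token positions]}."""
    kept_tokens = []
    positions = defaultdict(list)
    for i, tok in enumerate(comment.split()):
        demojized = emoji.demojize(tok)
        if demojized != tok:
            positions[demojized].append(i)   # emoji token: keep only its position
        else:
            kept_tokens.append(tok)          # ordinary token: keep it for tagging
    return " ".join(kept_tokens), dict(positions)

clean, emoji_dict = split_emojis("C'est une blague mdr 🤣 🤣")
# clean      -> "C'est une blague mdr"
# emoji_dict -> {':rolling_on_the_floor_laughing:': [4, 5]}
```
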
## Comment quality

### Syntactic

### Lexical

\pagebreak

# Entities and parts of speech

## Place and time

## Expertise

## Positioning

\pagebreak

# Relationships between comments

## Intertextuality

## Interdiscursivity

## Multimodality

\pagebreak

# Vector representation

\pagebreak

# Relevance classification

\pagebreak

# Conclusion

\pagebreak

# References
18  transactions-on-speech-and-language-processing.csl  Normal file
@@ -0,0 +1,18 @@
<?xml version="1.0" encoding="utf-8"?>
<style xmlns="http://purl.org/net/xbiblio/csl" version="1.0" default-locale="en-US">
  <!-- Association for Computing Machinery (ACM), generated from "acm" metadata at https://github.com/citation-style-language/journals -->
  <info>
    <title>ACM Transactions on Speech and Language Processing</title>
    <title-short>TSLP</title-short>
    <id>http://www.zotero.org/styles/transactions-on-speech-and-language-processing</id>
    <link href="http://www.zotero.org/styles/transactions-on-speech-and-language-processing" rel="self"/>
    <link href="http://www.zotero.org/styles/association-for-computing-machinery" rel="independent-parent"/>
    <link href="https://www.acm.org/publications/authors/reference-formatting" rel="documentation"/>
    <category citation-format="numeric"/>
    <category field="engineering"/>
    <issn>1550-4875</issn>
    <eissn>1550-4883</eissn>
    <updated>2017-07-09T19:14:54+00:00</updated>
    <rights license="http://creativecommons.org/licenses/by-sa/3.0/">This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 License</rights>
  </info>
</style>