From a95f89a69dc34f5a76ebc4f3b6a5dcf9ecc2c998 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Pelletier?= Date: Sun, 15 Dec 2019 20:31:38 -0500 Subject: [PATCH] commit initial --- .gitignore | 4 + Commentaires sur les réseaux sociaux.mm | 261 +++ Traitement Articles.ipynb | 188 ++ Traitement commentaires.ipynb | 2036 ++++++++++++++++++++++ commentaires.ipynb | 88 + commentaires_reseaux_sociaux.mm | 261 +++ parsing_functions.py | 57 + textes_articles.ipynb | 113 ++ 8 files changed, 3008 insertions(+) create mode 100644 Commentaires sur les réseaux sociaux.mm create mode 100644 Traitement Articles.ipynb create mode 100644 Traitement commentaires.ipynb create mode 100644 commentaires.ipynb create mode 100644 commentaires_reseaux_sociaux.mm create mode 100644 parsing_functions.py create mode 100644 textes_articles.ipynb diff --git a/.gitignore b/.gitignore index e61bca2..084bba9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +data/ +pickle/ +refined_data/ + # ---> Python # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/Commentaires sur les réseaux sociaux.mm b/Commentaires sur les réseaux sociaux.mm new file mode 100644 index 0000000..43cf601 --- /dev/null +++ b/Commentaires sur les réseaux sociaux.mm @@ -0,0 +1,261 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Traitement Articles.ipynb b/Traitement Articles.ipynb new file mode 100644 index 0000000..7518896 --- /dev/null +++ b/Traitement Articles.ipynb @@ -0,0 +1,188 @@ +{ + "cells": [ + { + 
"cell_type": "code", + "execution_count": 153, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from nltk.corpus import stopwords\n", + "from nltk.tokenize import toktok, sent_tokenize\n", + "from nltk.parse import CoreNLPParser\n", + "import re\n", + "import pickle" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "tok = toktok.ToktokTokenizer()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')\n", + "#https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "textes_articles_df = pd.read_csv(\"refined_data/textes_articles_df.csv\")\n", + "textes_articles_df = textes_articles_df[textes_articles_df[\"text\"].notnull() & (textes_articles_df[\"media\"]!='CNN')]" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "metadata": {}, + "outputs": [], + "source": [ + "del textes_articles_df['Unnamed: 0']" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# Prétraitement\n", + "def pretraitement(article):\n", + " # tokeniser par phrases\n", + " article_sentences = sent_tokenize(article)\n", + " article_ner_tokens = []\n", + " article_pos_tokens = []\n", + " compteur_phrase = 0\n", + " for sentence in article_sentences:\n", + " # Tokeniser\n", + " sentence_tokens = tok.tokenize(sentence)\n", + " # Assembler les entités nommées et colocations\n", + " sentence_ner = ner_tagger.tag(sentence_tokens)\n", + " ner_tokens = [ner_token for ner_token in sentence_ner if 
ner_token[1] != 'O']\n", + " # Supprimer les classes fermées avec un POS\n", + " sentence_pos = pos_tagger.tag(sentence_tokens)\n", + " pos_tokens = [pos_token for pos_token in sentence_pos if pos_token[1] in ['ADJ','ADV','INTJ','NOUN','PROPN','VERB']]\n", + " # Ajouter à la liste de phrases tokenisées\n", + " article_ner_tokens.append(ner_tokens)\n", + " article_pos_tokens.append(pos_tokens)\n", + " compteur_phrase += 1\n", + " return article_ner_tokens, article_pos_tokens" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "article_pretraite = [pretraitement(x) for x in list(textes_articles_df[\"text\"])]" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [], + "source": [ + "def aggreger_ner_tags(article):\n", + " dict_named_entity = {}\n", + " for sentence in article[0]:\n", + " for entity in sentence:\n", + " dict_named_entity[entity] = dict_named_entity.get(entity,0) + 1\n", + " return dict_named_entity" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": {}, + "outputs": [], + "source": [ + "def aggreger_pos_tags(article):\n", + " dict_pos = {}\n", + " for sentence in article[1]:\n", + " for pos in sentence:\n", + " dict_pos[pos] = dict_pos.get(pos,0) + 1\n", + " return dict_pos" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": {}, + "outputs": [], + "source": [ + "textes_articles_df['ner_dict']=[aggreger_ner_tags(article) for article in article_pretraite]\n", + "textes_articles_df['pos_dict']=[aggreger_pos_tags(article) for article in article_pretraite]" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": {}, + "outputs": [], + "source": [ + "f = open(\"pickle/textes_articles_df.pickle\",\"wb\")\n", + "pickle.dump(textes_articles_df,f)\n", + "f.close()" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Traitement commentaires.ipynb b/Traitement commentaires.ipynb new file mode 100644 index 0000000..76f6579 --- /dev/null +++ b/Traitement commentaires.ipynb @@ -0,0 +1,2036 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [], + "source": [ + "# Aller chercher les synsets\n", + "# Variantes morphologiques\n", + "# Enlever les noms des autres commenteux\n", + "# Traiter les émoticones" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from nltk.corpus import stopwords\n", + "from nltk.tokenize import TweetTokenizer\n", + "from nltk.parse import CoreNLPParser\n", + "import re\n", + "import pickle\n", + "import emoji" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tokenisation des commentaires\n", + "\n", + "Utilisation du TweetTokenizer, car il est davantage adapté au contenu des utilisateurs sur les médias sociaux" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "# Création de l'objet Tokenizer\n", + "tok = TweetTokenizer(preserve_case=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "commentaires_df = pd.read_csv(\"refined_data/commentaires_df.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0comment_idnested_idnameiddatelikescommentmediapost_id
001.00Ycf BullitID: 1000006158663132019-11-09 14:17:130C'est une blague mdr 🤣🤣🤣🤣🤣FIG5dc7ac7f359e2-10157143278136339
112.00Steph AlcazarID: 1000011750772632019-11-09 14:17:340La seule question c'est de savoir s'il fera pl...FIG5dc7ac7f359e2-10157143278136339
223.00Töm MüstäineID: 13658794042019-11-09 14:17:510Romain Debrigode l info du jour qui fait plaiseFIG5dc7ac7f359e2-10157143278136339
334.00Pierre CrouzetID: 1000002702920072019-11-09 14:18:060Vasanth Toure 😍FIG5dc7ac7f359e2-10157143278136339
444.01Vasanth ToureID: 1000014946078012019-11-09 14:20:570Pierre Crouzet Paris n'est pas prêt encore...FIG5dc7ac7f359e2-10157143278136339
554.02Pierre CrouzetID: 1000002702920072019-11-09 14:26:370Vasanth Toure le prochain c’est Adrien RabiotFIG5dc7ac7f359e2-10157143278136339
665.00Stéphane PirnaciID: 1000085413673022019-11-09 14:18:510MdrFIG5dc7ac7f359e2-10157143278136339
776.00Adil BennaniID: 1000064329172922019-11-09 14:19:030moi je propose mamadou sissokoFIG5dc7ac7f359e2-10157143278136339
887.00Hadrien De CournonID: 11312905522019-11-09 14:19:090Louis Prt Corentin Corman Victor Mdv ah ouais?FIG5dc7ac7f359e2-10157143278136339
998.00Marwa LaroseID: 1000225775896112019-11-09 14:19:380Marier le foot à la mairie est génialFIG5dc7ac7f359e2-10157143278136339
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 comment_id nested_id name id \\\n", + "0 0 1.0 0 Ycf Bullit ID: 100000615866313 \n", + "1 1 2.0 0 Steph Alcazar ID: 100001175077263 \n", + "2 2 3.0 0 Töm Müstäine ID: 1365879404 \n", + "3 3 4.0 0 Pierre Crouzet ID: 100000270292007 \n", + "4 4 4.0 1 Vasanth Toure ID: 100001494607801 \n", + "5 5 4.0 2 Pierre Crouzet ID: 100000270292007 \n", + "6 6 5.0 0 Stéphane Pirnaci ID: 100008541367302 \n", + "7 7 6.0 0 Adil Bennani ID: 100006432917292 \n", + "8 8 7.0 0 Hadrien De Cournon ID: 1131290552 \n", + "9 9 8.0 0 Marwa Larose ID: 100022577589611 \n", + "\n", + " date likes \\\n", + "0 2019-11-09 14:17:13 0 \n", + "1 2019-11-09 14:17:34 0 \n", + "2 2019-11-09 14:17:51 0 \n", + "3 2019-11-09 14:18:06 0 \n", + "4 2019-11-09 14:20:57 0 \n", + "5 2019-11-09 14:26:37 0 \n", + "6 2019-11-09 14:18:51 0 \n", + "7 2019-11-09 14:19:03 0 \n", + "8 2019-11-09 14:19:09 0 \n", + "9 2019-11-09 14:19:38 0 \n", + "\n", + " comment media \\\n", + "0 C'est une blague mdr 🤣🤣🤣🤣🤣 FIG \n", + "1 La seule question c'est de savoir s'il fera pl... FIG \n", + "2 Romain Debrigode l info du jour qui fait plaise FIG \n", + "3 Vasanth Toure 😍 FIG \n", + "4 Pierre Crouzet Paris n'est pas prêt encore... FIG \n", + "5 Vasanth Toure le prochain c’est Adrien Rabiot FIG \n", + "6 Mdr FIG \n", + "7 moi je propose mamadou sissoko FIG \n", + "8 Louis Prt Corentin Corman Victor Mdv ah ouais? 
FIG \n", + "9 Marier le foot à la mairie est génial FIG \n", + "\n", + " post_id \n", + "0 5dc7ac7f359e2-10157143278136339 \n", + "1 5dc7ac7f359e2-10157143278136339 \n", + "2 5dc7ac7f359e2-10157143278136339 \n", + "3 5dc7ac7f359e2-10157143278136339 \n", + "4 5dc7ac7f359e2-10157143278136339 \n", + "5 5dc7ac7f359e2-10157143278136339 \n", + "6 5dc7ac7f359e2-10157143278136339 \n", + "7 5dc7ac7f359e2-10157143278136339 \n", + "8 5dc7ac7f359e2-10157143278136339 \n", + "9 5dc7ac7f359e2-10157143278136339 " + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commentaires_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "#suppression de la première colonne qui ne sert à rien\n", + "del commentaires_df['Unnamed: 0']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Noms des auteurs\n", + "\n", + "Extraction du nom des auteurs pour chaque commentaire et ses sous-commentaires" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "metadata": {}, + "outputs": [], + "source": [ + "names_df = pd.DataFrame(commentaires_df.groupby(['post_id','comment_id'])['name'], columns=['post_comment','list_names'])" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "metadata": {}, + "outputs": [], + "source": [ + "names_df['list_names'] = names_df.apply(lambda x: list(set(x['list_names'])), axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": {}, + "outputs": [], + "source": [ + "names_df['post_id'] = names_df.apply(lambda x: x['post_comment'][0], axis=1)\n", + "names_df['comment_id'] = names_df.apply(lambda x: x['post_comment'][1], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": {}, + "outputs": [], + "source": [ + "del names_df['post_comment']" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": {}, + "outputs": [ 
+ { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
list_namespost_idcomment_id
0[Musta Pha]5dc645a8cab5a-28954070838376131.0
1[Pierre-Luc Thibaudeau]5dc645a8cab5a-28954070838376132.0
2[Brandon Fontana]5dc645a8cab5a-28954070838376133.0
3[Maxime Tremblay]5dc645a8cab5a-28954070838376134.0
4[Zahir Idir]5dc645a8cab5a-28954070838376135.0
5[Zahir Idir]5dc645a8cab5a-28954070838376136.0
6[Mohammed Elmorshedy]5dc645a8cab5a-28954070838376137.0
7[Ahmed Dades]5dc645a8cab5a-28954070838376138.0
8[Martin Chevalier]5dc645a8cab5a-28954070838376139.0
9[Maxime Tremblay, Aliza Attias, Étienne Gagné,...5dc645a8cab5a-289540708383761310.0
\n", + "
def _usable_names(names):
    """Return the non-empty string entries of *names*, ordered and unique.

    A bare string is treated as a single name. Anything that cannot be
    iterated (``None``, a float NaN, ...) yields an empty list, which replaces
    the blanket ``except`` the callers used to rely on.
    """
    if isinstance(names, str):
        names = [names]
    try:
        candidates = list(names)
    except TypeError:
        return []
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(
        name for name in candidates if isinstance(name, str) and name
    ))


def list_auteurs_referes(comment, names):
    """List the author names that appear verbatim in a comment.

    Args:
        comment: Text of the comment. A non-string value is treated as a
            comment mentioning nobody.
        names: Author names to look for (names of the participants of the
            comment thread in the calling notebook).

    Returns:
        The names found in *comment* as a list without duplicates, in the
        order they appear in *names*. Empty and non-string names are ignored
        (an empty string would otherwise match every comment).
    """
    if not isinstance(comment, str):
        return []
    return [name for name in _usable_names(names) if name in comment]


def remove_names(comment, names):
    """Remove every occurrence of the given author names from a comment.

    Args:
        comment: Text of the comment. A non-string value is returned
            unchanged.
        names: Author names to strip from the text.

    Returns:
        *comment* with each name replaced by the empty string; surrounding
        whitespace is left as is.
    """
    if not isinstance(comment, str):
        return comment
    # Longest names first: removing "Jean" before "Jean Dupont" would leave
    # " Dupont" behind and make the result depend on the order of *names*.
    for name in sorted(_usable_names(names), key=len, reverse=True):
        comment = comment.replace(name, '')
    return comment
{}, + "source": [ + "## Nettoyage des commentaires et traitement des émoticones" + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "metadata": {}, + "outputs": [], + "source": [ + "commentaires_df_names['auteurs_referes'] = commentaires_df_names.apply(lambda x: str(list_auteurs_referes(x['comment'],x['list_names'])), axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 []\n", + "1 []\n", + "2 []\n", + "3 ['Vasanth Toure']\n", + "4 ['Pierre Crouzet']\n", + "5 ['Vasanth Toure']\n", + "6 []\n", + "7 []\n", + "8 []\n", + "9 []\n", + "10 []\n", + "11 []\n", + "12 []\n", + "13 []\n", + "14 []\n", + "15 []\n", + "16 []\n", + "17 []\n", + "18 ['Pierre Trichet']\n", + "19 []\n", + "20 []\n", + "21 []\n", + "22 ['Yann Gilles']\n", + "23 []\n", + "24 []\n", + "25 ['Yann Gilles']\n", + "26 ['Gen Lys']\n", + "27 []\n", + "28 ['Jackie Petit']\n", + "29 []\n", + " ... \n", + "37884 []\n", + "37885 []\n", + "37886 []\n", + "37887 ['Luc Pellerin']\n", + "37888 []\n", + "37889 []\n", + "37890 []\n", + "37891 []\n", + "37892 []\n", + "37893 []\n", + "37894 []\n", + "37895 []\n", + "37896 []\n", + "37897 []\n", + "37898 []\n", + "37899 []\n", + "37900 []\n", + "37901 []\n", + "37902 []\n", + "37903 []\n", + "37904 []\n", + "37905 []\n", + "37906 []\n", + "37907 []\n", + "37908 []\n", + "37909 []\n", + "37910 []\n", + "37911 []\n", + "37912 []\n", + "37913 []\n", + "Name: auteurs_referes, Length: 37914, dtype: object" + ] + }, + "execution_count": 163, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commentaires_df_names['auteurs_referes']" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "metadata": {}, + "outputs": [], + "source": [ + "commentaires_df_names['comment_clean'] = commentaires_df_names.apply(lambda x: str(remove_names(x['comment'],x['list_names'])), axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + 
"metadata": {}, + "outputs": [], + "source": [ + "commentaires_df_names['comment_clean_tok'] = commentaires_df_names.apply(lambda x: tok.tokenize(x['comment_clean']), axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": {}, + "outputs": [], + "source": [ + "commentaires_df_names['comment_tok_demojize'] = commentaires_df_names.apply(lambda x: [emoji.demojize(token) for token in x['comment_clean_tok']], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": {}, + "outputs": [], + "source": [ + "del commentaires_df_names['comment_clean']\n", + "del commentaires_df_names['comment_clean_tok']" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
comment_idnested_idnameiddatelikescommentmediapost_idlist_namesauteurs_referescomment_tok_demojize
01.00Ycf BullitID: 1000006158663132019-11-09 14:17:130C'est une blague mdr 🤣🤣🤣🤣🤣FIG5dc7ac7f359e2-10157143278136339[Ycf Bullit][][C'est, une, blague, mdr, :rolling_on_the_floo...
12.00Steph AlcazarID: 1000011750772632019-11-09 14:17:340La seule question c'est de savoir s'il fera pl...FIG5dc7ac7f359e2-10157143278136339[Steph Alcazar][][La, seule, question, c'est, de, savoir, s'il,...
23.00Töm MüstäineID: 13658794042019-11-09 14:17:510Romain Debrigode l info du jour qui fait plaiseFIG5dc7ac7f359e2-10157143278136339[Töm Müstäine][][Romain, Debrigode, l, info, du, jour, qui, fa...
34.00Pierre CrouzetID: 1000002702920072019-11-09 14:18:060Vasanth Toure 😍FIG5dc7ac7f359e2-10157143278136339[Vasanth Toure, Pierre Crouzet]['Vasanth Toure'][:smiling_face_with_heart-eyes:]
44.01Vasanth ToureID: 1000014946078012019-11-09 14:20:570Pierre Crouzet Paris n'est pas prêt encore...FIG5dc7ac7f359e2-10157143278136339[Vasanth Toure, Pierre Crouzet]['Pierre Crouzet'][Paris, n'est, pas, prêt, encore, ...]
54.02Pierre CrouzetID: 1000002702920072019-11-09 14:26:370Vasanth Toure le prochain c’est Adrien RabiotFIG5dc7ac7f359e2-10157143278136339[Vasanth Toure, Pierre Crouzet]['Vasanth Toure'][le, prochain, c, ’, est, Adrien, Rabiot]
65.00Stéphane PirnaciID: 1000085413673022019-11-09 14:18:510MdrFIG5dc7ac7f359e2-10157143278136339[Stéphane Pirnaci][][Mdr]
76.00Adil BennaniID: 1000064329172922019-11-09 14:19:030moi je propose mamadou sissokoFIG5dc7ac7f359e2-10157143278136339[Adil Bennani][][moi, je, propose, mamadou, sissoko]
87.00Hadrien De CournonID: 11312905522019-11-09 14:19:090Louis Prt Corentin Corman Victor Mdv ah ouais?FIG5dc7ac7f359e2-10157143278136339[Hadrien De Cournon][][Louis, Prt, Corentin, Corman, Victor, Mdv, ah...
98.00Marwa LaroseID: 1000225775896112019-11-09 14:19:380Marier le foot à la mairie est génialFIG5dc7ac7f359e2-10157143278136339[Marwa Larose][][Marier, le, foot, à, la, mairie, est, génial]
109.00Luca SpadaID: 1000024373451502019-11-09 14:19:520Benoît ZivanovicFIG5dc7ac7f359e2-10157143278136339[Luca Spada][][Benoît, Zivanovic]
1110.00Louis ReyID: 11528040212019-11-09 14:20:000Eugénie Rey avec Simonet !!!FIG5dc7ac7f359e2-10157143278136339[Louis Rey][][Eugénie, Rey, avec, Simonet, !, !, !]
1211.00Mariam Aurelie KonéID: 1000010687953522019-11-09 14:20:020Moi aussi je candidate ras le bol la place est...FIG5dc7ac7f359e2-10157143278136339[Mariam Aurelie Koné][][Moi, aussi, je, candidate, ras, le, bol, la, ...
1312.00Cedric CmnID: 1000367647373282019-11-09 14:20:145Ah bah vu qu’il a déclaré y’a pas longtemps qu...FIG5dc7ac7f359e2-10157143278136339[Cedric Cmn][][Ah, bah, vu, qu, ’, il, a, déclaré, y, ’, a, ...
1413.00Olivia FuentesID: 13442778802019-11-09 14:21:080Catheline Lr Victoire Bailly Hannah Jenn ce me...FIG5dc7ac7f359e2-10157143278136339[Olivia Fuentes][][Catheline, Lr, Victoire, Bailly, Hannah, Jenn...
1514.00Marie MadeleineID: 1000114690227902019-11-09 14:21:151🤮🤮👎👎FIG5dc7ac7f359e2-10157143278136339[Marie Madeleine][][:face_vomiting:, :face_vomiting:, :thumbs_dow...
1615.00Yohann LévêqueID: 12390559892019-11-09 14:21:272CharlesDuquesne, il a pris trop de ballon sur ...FIG5dc7ac7f359e2-10157143278136339[Yohann Lévêque][][CharlesDuquesne, ,, il, a, pris, trop, de, ba...
1716.00Pierre TrichetID: 13246012882019-11-09 14:21:501Martin tu sais pour qui tu vas voter ?FIG5dc7ac7f359e2-10157143278136339[Martin Trichet, Pierre Trichet][][Martin, tu, sais, pour, qui, tu, vas, voter, ?]
1816.01Martin TrichetID: 12101546852019-11-09 14:33:190Pierre Trichet j'avais déjà vu, mon bulletin e...FIG5dc7ac7f359e2-10157143278136339[Martin Trichet, Pierre Trichet]['Pierre Trichet'][j'avais, déjà, vu, ,, mon, bulletin, est, prê...
1917.00Moumou SoussiID: 16078398642019-11-09 14:22:040Il va jouer le loto avec l'argent de la mairi...FIG5dc7ac7f359e2-10157143278136339[Moumou Soussi][][Il, va, jouer, le, loto, avec, l'argent, de, ...
2018.00Alexandre KhadirID: 12690446642019-11-09 14:22:1728En espérant qu’il fasse une meilleure carrière 🤣FIG5dc7ac7f359e2-10157143278136339[Jean-Marie Robini, Adelaide AF, Gen Lys, Phil...[][En, espérant, qu, ’, il, fasse, une, meilleur...
2118.01Yann GillesID: 1000040915901402019-11-09 14:38:148En clubChampion de France en 2003 et en 2004 a...FIG5dc7ac7f359e2-10157143278136339[Jean-Marie Robini, Adelaide AF, Gen Lys, Phil...[][En, clubChampion, de, France, en, 2003, et, e...
2218.02Claudine Laurent GirardID: 12114361112019-11-09 14:46:022Yann Gilles C'est pas pour ça qu'il seras un b...FIG5dc7ac7f359e2-10157143278136339[Jean-Marie Robini, Adelaide AF, Gen Lys, Phil...['Yann Gilles'][C'est, pas, pour, ça, qu'il, seras, un, bon, ...
2318.03Gen LysID: 1000098465835532019-11-09 14:49:200Mais où sont les compètences d'un footballeur....FIG5dc7ac7f359e2-10157143278136339[Jean-Marie Robini, Adelaide AF, Gen Lys, Phil...[][Mais, où, sont, les, compètences, d'un, footb...
2418.04Sylviane VaudevireID: 1000009087681412019-11-09 14:49:281son palmarès n'a rien à voir avec les qualités...FIG5dc7ac7f359e2-10157143278136339[Jean-Marie Robini, Adelaide AF, Gen Lys, Phil...[][son, palmarès, n'a, rien, à, voir, avec, les,...
2518.05Rosette DelionID: 1000112837377772019-11-09 14:54:131Yann Gilles C'est vrai que comme cireur de ban...FIG5dc7ac7f359e2-10157143278136339[Jean-Marie Robini, Adelaide AF, Gen Lys, Phil...['Yann Gilles'][C'est, vrai, que, comme, cireur, de, bancs, i...
2618.06Jackie PetitID: 1000002350014862019-11-09 15:12:010Gen Lys c' est un type extraordinaire... lisez...FIG5dc7ac7f359e2-10157143278136339[Jean-Marie Robini, Adelaide AF, Gen Lys, Phil...['Gen Lys'][c, ', est, un, type, extraordinaire, ..., lis...
2718.07Alexandre KhadirID: 12690446642019-11-09 15:21:451Jacques Poulain d’accord Jacques 🙂 mais je me ...FIG5dc7ac7f359e2-10157143278136339[Jean-Marie Robini, Adelaide AF, Gen Lys, Phil...[][Jacques, Poulain, d, ’, accord, Jacques, :sli...
2818.08Gerard BrunetID: 1000221365073262019-11-09 15:29:260Jackie Petit 😂😂😂😂😂😂😂😂😂FIG5dc7ac7f359e2-10157143278136339[Jean-Marie Robini, Adelaide AF, Gen Lys, Phil...['Jackie Petit'][:face_with_tears_of_joy:, :face_with_tears_of...
2918.09Gerard BrunetID: 1000221365073262019-11-09 15:29:490Jacques Poulain 😂😂😂😂😂😂😂😂😂😂FIG5dc7ac7f359e2-10157143278136339[Jean-Marie Robini, Adelaide AF, Gen Lys, Phil...[][Jacques, Poulain, :face_with_tears_of_joy:, :...
.......................................
37884168.00Lise CurcumaID: 7618296652019-11-09 02:37:201Elle adore provoquer ,c'est son trouble d'oppo...RC5dc64663dc269-2894961790548809[Lise Curcuma][][Elle, adore, provoquer, ,, c'est, son, troubl...
37885169.00Denis TremblayID: 1000061128086512019-11-09 02:42:510Quand on s'accroche à ses années étudiantes......RC5dc64663dc269-2894961790548809[Denis Tremblay][][Quand, on, s'accroche, à, ses, années, étudia...
37886170.00Luc PellerinID: 11728703652019-11-09 02:50:361Dire qu'avant le gros bon sens régnait....faut...RC5dc64663dc269-2894961790548809[Luc Pellerin, Martin Plourde][][Dire, qu'avant, le, gros, bon, sens, régnait,...
37887170.01Martin PlourdeID: 10217680602019-11-09 03:03:540Luc Pellerin faut sauver les apparences qu'ils...RC5dc64663dc269-2894961790548809[Luc Pellerin, Martin Plourde]['Luc Pellerin'][faut, sauver, les, apparences, qu'ils, disent...
37888171.00Philippe PhilippeID: 1000043952436772019-11-09 02:52:250Au lieu de décrire ce qui est acceptable....il...RC5dc64663dc269-2894961790548809[Philippe Philippe][][Au, lieu, de, décrire, ce, qui, est, acceptab...
37889172.00Monique C. LevasseurID: 1000015097061682019-11-09 02:53:160Toujours pour ce faire remarquer et parler d’e...RC5dc64663dc269-2894961790548809[Monique C. Levasseur][][Toujours, pour, ce, faire, remarquer, et, par...
37890173.00Loulou Boudreault-ethierID: 1000077367193642019-11-09 02:53:440Je crois plutôt que l'habit cravate donne u...RC5dc64663dc269-2894961790548809[Loulou Boudreault-ethier][][Je, crois, plutôt, que, l'habit, cravate, don...
37891174.00Johanne RichardID: 1000044693149002019-11-09 02:55:560Malheureusement avoir de la classe n’est pas d...RC5dc64663dc269-2894961790548809[Johanne Richard][][Malheureusement, avoir, de, la, classe, n, ’,...
37892175.00Lynda HébertID: 1000032381174432019-11-09 03:00:060Ce n'est pas l'étiquette vestimentaire qui ani...RC5dc64663dc269-2894961790548809[Lynda Hébert][][Ce, n'est, pas, l'étiquette, vestimentaire, q...
37893176.00Gaston PelletierID: 1000170205662092019-11-09 03:00:080123 soldats marchent. 122 au pas et c’est la 1...RC5dc64663dc269-2894961790548809[Gaston Pelletier][][123, soldats, marchent, ., 122, au, pas, et, ...
37894177.00Norm GilbertID: 7281061182019-11-09 03:01:010Au Québec on parle vraiment des vrais dossiers...RC5dc64663dc269-2894961790548809[Norm Gilbert][][Au, Québec, on, parle, vraiment, des, vrais, ...
37895178.00France Martel PatrieID: 1000008452243632019-11-09 03:01:220La je peut dire que ta raisonRC5dc64663dc269-2894961790548809[France Martel Patrie][][La, je, peut, dire, que, ta, raison]
37896179.00Susan RookeID: 1000073477655642019-11-09 03:08:300Elle m’énerve cette femme qui manque grandemen...RC5dc64663dc269-2894961790548809[Susan Rooke][][Elle, m, ’, énerve, cette, femme, qui, manque...
37897180.00Francois BerubeID: 1000036892353782019-11-09 03:09:060On doit choisir nos députés pour ce qu'il on d...RC5dc64663dc269-2894961790548809[Francois Berube][][On, doit, choisir, nos, députés, pour, ce, qu...
37898181.00Jonathan BrissonID: 5120535862019-11-09 03:29:350Tse quand le commis chez IGA est mieux habillé...RC5dc64663dc269-2894961790548809[Jonathan Brisson][][Tse, quand, le, commis, chez, IGA, est, mieux...
37899182.00Karlos BellID: 1000046353988832019-11-09 03:29:380Il y a un minimum d'éthique, elle manque de sa...RC5dc64663dc269-2894961790548809[Karlos Bell][][Il, y, a, un, minimum, d'éthique, ,, elle, ma...
37900183.00Yanick DesrosiersID: 1000012380924612019-11-09 03:30:080Débat inutile! Elle a été élu pour ces idées!RC5dc64663dc269-2894961790548809[Yanick Desrosiers][][Débat, inutile, !, Elle, a, été, élu, pour, c...
37901184.00Marielle CormierID: 10047831812019-11-09 03:37:220Il y a un code vestimentaire à avoir au parlementRC5dc64663dc269-2894961790548809[Marielle Cormier][][Il, y, a, un, code, vestimentaire, à, avoir, ...
37902185.00Marie-Claire DoraisID: 1000005209967532019-11-09 03:43:080je crois qu'il y a des sujets plus intéressant...RC5dc64663dc269-2894961790548809[Marie-Claire Dorais][][je, crois, qu'il, y, a, des, sujets, plus, in...
37903186.00Yanvon DubucID: 1000039136871992019-11-09 03:51:120👹👿👺👹👿RC5dc64663dc269-2894961790548809[Yanvon Dubuc][][:ogre:, :angry_face_with_horns:, :goblin:, :o...
37904187.00Ahmed TaalbiID: 1000011529700152019-11-09 03:52:110Ça lui va très bien en plus!!!RC5dc64663dc269-2894961790548809[Ahmed Taalbi][][Ça, lui, va, très, bien, en, plus, !, !, !]
37905188.00Mathilde SimardID: 1000020928160152019-11-09 04:09:160Daryann Lacombe👀RC5dc64663dc269-2894961790548809[Mathilde Simard][][Daryann, Lacombe, :eyes:]
37906189.00Max PetersonID: 5990733372019-11-09 04:16:230son toupet yai tro kourRC5dc64663dc269-2894961790548809[Max Peterson][][son, toupet, yai, tro, kour]
37907190.00Philip TristramID: 10721290522019-11-09 04:34:450Une enfant d'école dans une cours de grand !RC5dc64663dc269-2894961790548809[Philip Tristram][][Une, enfant, d'école, dans, une, cours, de, g...
37908191.00Josée CaronID: 1000032692827802019-11-09 04:36:250La tempête dans un verre d'eau. Comment attire...RC5dc64663dc269-2894961790548809[Josée Caron][][La, tempête, dans, un, verre, d'eau, ., Comme...
37909192.00Etienne BordeleauID: 5220726112019-11-09 04:42:000Je perds mon temps à commenter...blablablablab...RC5dc64663dc269-2894961790548809[Etienne Bordeleau][][Je, perds, mon, temps, à, commenter, ..., bla...
37910193.00Josée MathieuID: 10722845562019-11-09 04:42:110Bravo, Madame.RC5dc64663dc269-2894961790548809[Josée Mathieu][][Bravo, ,, Madame, .]
37911194.00Madeleine BlaisID: 1000006803300602019-11-09 04:47:200Quand c’est ton vêtement qui attire l’attentio...RC5dc64663dc269-2894961790548809[Madeleine Blais][][Quand, c, ’, est, ton, vêtement, qui, attire,...
37912195.00Geneviève RaymondID: 7215684362019-11-09 04:49:130Elle pourrait exprimer sa créativité dans ses ...RC5dc64663dc269-2894961790548809[Geneviève Raymond][][Elle, pourrait, exprimer, sa, créativité, dan...
37913196.00Madeleine BlaisID: 1000006803300602019-11-09 04:54:020« Ton vêtement parle si fort que je n’entend p...RC5dc64663dc269-2894961790548809[Madeleine Blais][][«, Ton, vêtement, parle, si, fort, que, je, n...
\n", + "

37914 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " comment_id nested_id name id \\\n", + "0 1.0 0 Ycf Bullit ID: 100000615866313 \n", + "1 2.0 0 Steph Alcazar ID: 100001175077263 \n", + "2 3.0 0 Töm Müstäine ID: 1365879404 \n", + "3 4.0 0 Pierre Crouzet ID: 100000270292007 \n", + "4 4.0 1 Vasanth Toure ID: 100001494607801 \n", + "5 4.0 2 Pierre Crouzet ID: 100000270292007 \n", + "6 5.0 0 Stéphane Pirnaci ID: 100008541367302 \n", + "7 6.0 0 Adil Bennani ID: 100006432917292 \n", + "8 7.0 0 Hadrien De Cournon ID: 1131290552 \n", + "9 8.0 0 Marwa Larose ID: 100022577589611 \n", + "10 9.0 0 Luca Spada ID: 100002437345150 \n", + "11 10.0 0 Louis Rey ID: 1152804021 \n", + "12 11.0 0 Mariam Aurelie Koné ID: 100001068795352 \n", + "13 12.0 0 Cedric Cmn ID: 100036764737328 \n", + "14 13.0 0 Olivia Fuentes ID: 1344277880 \n", + "15 14.0 0 Marie Madeleine ID: 100011469022790 \n", + "16 15.0 0 Yohann Lévêque ID: 1239055989 \n", + "17 16.0 0 Pierre Trichet ID: 1324601288 \n", + "18 16.0 1 Martin Trichet ID: 1210154685 \n", + "19 17.0 0 Moumou Soussi ID: 1607839864 \n", + "20 18.0 0 Alexandre Khadir ID: 1269044664 \n", + "21 18.0 1 Yann Gilles ID: 100004091590140 \n", + "22 18.0 2 Claudine Laurent Girard ID: 1211436111 \n", + "23 18.0 3 Gen Lys ID: 100009846583553 \n", + "24 18.0 4 Sylviane Vaudevire ID: 100000908768141 \n", + "25 18.0 5 Rosette Delion ID: 100011283737777 \n", + "26 18.0 6 Jackie Petit ID: 100000235001486 \n", + "27 18.0 7 Alexandre Khadir ID: 1269044664 \n", + "28 18.0 8 Gerard Brunet ID: 100022136507326 \n", + "29 18.0 9 Gerard Brunet ID: 100022136507326 \n", + "... ... ... ... ... \n", + "37884 168.0 0 Lise Curcuma ID: 761829665 \n", + "37885 169.0 0 Denis Tremblay ID: 100006112808651 \n", + "37886 170.0 0 Luc Pellerin ID: 1172870365 \n", + "37887 170.0 1 Martin Plourde ID: 1021768060 \n", + "37888 171.0 0 Philippe Philippe ID: 100004395243677 \n", + "37889 172.0 0 Monique C. 
Levasseur ID: 100001509706168 \n", + "37890 173.0 0 Loulou Boudreault-ethier ID: 100007736719364 \n", + "37891 174.0 0 Johanne Richard ID: 100004469314900 \n", + "37892 175.0 0 Lynda Hébert ID: 100003238117443 \n", + "37893 176.0 0 Gaston Pelletier ID: 100017020566209 \n", + "37894 177.0 0 Norm Gilbert ID: 728106118 \n", + "37895 178.0 0 France Martel Patrie ID: 100000845224363 \n", + "37896 179.0 0 Susan Rooke ID: 100007347765564 \n", + "37897 180.0 0 Francois Berube ID: 100003689235378 \n", + "37898 181.0 0 Jonathan Brisson ID: 512053586 \n", + "37899 182.0 0 Karlos Bell ID: 100004635398883 \n", + "37900 183.0 0 Yanick Desrosiers ID: 100001238092461 \n", + "37901 184.0 0 Marielle Cormier ID: 1004783181 \n", + "37902 185.0 0 Marie-Claire Dorais ID: 100000520996753 \n", + "37903 186.0 0 Yanvon Dubuc ID: 100003913687199 \n", + "37904 187.0 0 Ahmed Taalbi ID: 100001152970015 \n", + "37905 188.0 0 Mathilde Simard ID: 100002092816015 \n", + "37906 189.0 0 Max Peterson ID: 599073337 \n", + "37907 190.0 0 Philip Tristram ID: 1072129052 \n", + "37908 191.0 0 Josée Caron ID: 100003269282780 \n", + "37909 192.0 0 Etienne Bordeleau ID: 522072611 \n", + "37910 193.0 0 Josée Mathieu ID: 1072284556 \n", + "37911 194.0 0 Madeleine Blais ID: 100000680330060 \n", + "37912 195.0 0 Geneviève Raymond ID: 721568436 \n", + "37913 196.0 0 Madeleine Blais ID: 100000680330060 \n", + "\n", + " date likes \\\n", + "0 2019-11-09 14:17:13 0 \n", + "1 2019-11-09 14:17:34 0 \n", + "2 2019-11-09 14:17:51 0 \n", + "3 2019-11-09 14:18:06 0 \n", + "4 2019-11-09 14:20:57 0 \n", + "5 2019-11-09 14:26:37 0 \n", + "6 2019-11-09 14:18:51 0 \n", + "7 2019-11-09 14:19:03 0 \n", + "8 2019-11-09 14:19:09 0 \n", + "9 2019-11-09 14:19:38 0 \n", + "10 2019-11-09 14:19:52 0 \n", + "11 2019-11-09 14:20:00 0 \n", + "12 2019-11-09 14:20:02 0 \n", + "13 2019-11-09 14:20:14 5 \n", + "14 2019-11-09 14:21:08 0 \n", + "15 2019-11-09 14:21:15 1 \n", + "16 2019-11-09 14:21:27 2 \n", + "17 2019-11-09 14:21:50 1 \n", + "18 
2019-11-09 14:33:19 0 \n", + "19 2019-11-09 14:22:04 0 \n", + "20 2019-11-09 14:22:17 28 \n", + "21 2019-11-09 14:38:14 8 \n", + "22 2019-11-09 14:46:02 2 \n", + "23 2019-11-09 14:49:20 0 \n", + "24 2019-11-09 14:49:28 1 \n", + "25 2019-11-09 14:54:13 1 \n", + "26 2019-11-09 15:12:01 0 \n", + "27 2019-11-09 15:21:45 1 \n", + "28 2019-11-09 15:29:26 0 \n", + "29 2019-11-09 15:29:49 0 \n", + "... ... ... \n", + "37884 2019-11-09 02:37:20 1 \n", + "37885 2019-11-09 02:42:51 0 \n", + "37886 2019-11-09 02:50:36 1 \n", + "37887 2019-11-09 03:03:54 0 \n", + "37888 2019-11-09 02:52:25 0 \n", + "37889 2019-11-09 02:53:16 0 \n", + "37890 2019-11-09 02:53:44 0 \n", + "37891 2019-11-09 02:55:56 0 \n", + "37892 2019-11-09 03:00:06 0 \n", + "37893 2019-11-09 03:00:08 0 \n", + "37894 2019-11-09 03:01:01 0 \n", + "37895 2019-11-09 03:01:22 0 \n", + "37896 2019-11-09 03:08:30 0 \n", + "37897 2019-11-09 03:09:06 0 \n", + "37898 2019-11-09 03:29:35 0 \n", + "37899 2019-11-09 03:29:38 0 \n", + "37900 2019-11-09 03:30:08 0 \n", + "37901 2019-11-09 03:37:22 0 \n", + "37902 2019-11-09 03:43:08 0 \n", + "37903 2019-11-09 03:51:12 0 \n", + "37904 2019-11-09 03:52:11 0 \n", + "37905 2019-11-09 04:09:16 0 \n", + "37906 2019-11-09 04:16:23 0 \n", + "37907 2019-11-09 04:34:45 0 \n", + "37908 2019-11-09 04:36:25 0 \n", + "37909 2019-11-09 04:42:00 0 \n", + "37910 2019-11-09 04:42:11 0 \n", + "37911 2019-11-09 04:47:20 0 \n", + "37912 2019-11-09 04:49:13 0 \n", + "37913 2019-11-09 04:54:02 0 \n", + "\n", + " comment media \\\n", + "0 C'est une blague mdr 🤣🤣🤣🤣🤣 FIG \n", + "1 La seule question c'est de savoir s'il fera pl... FIG \n", + "2 Romain Debrigode l info du jour qui fait plaise FIG \n", + "3 Vasanth Toure 😍 FIG \n", + "4 Pierre Crouzet Paris n'est pas prêt encore... FIG \n", + "5 Vasanth Toure le prochain c’est Adrien Rabiot FIG \n", + "6 Mdr FIG \n", + "7 moi je propose mamadou sissoko FIG \n", + "8 Louis Prt Corentin Corman Victor Mdv ah ouais? 
FIG \n", + "9 Marier le foot à la mairie est génial FIG \n", + "10 Benoît Zivanovic FIG \n", + "11 Eugénie Rey avec Simonet !!! FIG \n", + "12 Moi aussi je candidate ras le bol la place est... FIG \n", + "13 Ah bah vu qu’il a déclaré y’a pas longtemps qu... FIG \n", + "14 Catheline Lr Victoire Bailly Hannah Jenn ce me... FIG \n", + "15 🤮🤮👎👎 FIG \n", + "16 CharlesDuquesne, il a pris trop de ballon sur ... FIG \n", + "17 Martin tu sais pour qui tu vas voter ? FIG \n", + "18 Pierre Trichet j'avais déjà vu, mon bulletin e... FIG \n", + "19 Il va jouer le loto avec l'argent de la mairi... FIG \n", + "20 En espérant qu’il fasse une meilleure carrière 🤣 FIG \n", + "21 En clubChampion de France en 2003 et en 2004 a... FIG \n", + "22 Yann Gilles C'est pas pour ça qu'il seras un b... FIG \n", + "23 Mais où sont les compètences d'un footballeur.... FIG \n", + "24 son palmarès n'a rien à voir avec les qualités... FIG \n", + "25 Yann Gilles C'est vrai que comme cireur de ban... FIG \n", + "26 Gen Lys c' est un type extraordinaire... lisez... FIG \n", + "27 Jacques Poulain d’accord Jacques 🙂 mais je me ... FIG \n", + "28 Jackie Petit 😂😂😂😂😂😂😂😂😂 FIG \n", + "29 Jacques Poulain 😂😂😂😂😂😂😂😂😂😂 FIG \n", + "... ... ... \n", + "37884 Elle adore provoquer ,c'est son trouble d'oppo... RC \n", + "37885 Quand on s'accroche à ses années étudiantes...... RC \n", + "37886 Dire qu'avant le gros bon sens régnait....faut... RC \n", + "37887 Luc Pellerin faut sauver les apparences qu'ils... RC \n", + "37888 Au lieu de décrire ce qui est acceptable....il... RC \n", + "37889 Toujours pour ce faire remarquer et parler d’e... RC \n", + "37890 Je crois plutôt que l'habit cravate donne u... RC \n", + "37891 Malheureusement avoir de la classe n’est pas d... RC \n", + "37892 Ce n'est pas l'étiquette vestimentaire qui ani... RC \n", + "37893 123 soldats marchent. 122 au pas et c’est la 1... RC \n", + "37894 Au Québec on parle vraiment des vrais dossiers... 
RC \n", + "37895 La je peut dire que ta raison RC \n", + "37896 Elle m’énerve cette femme qui manque grandemen... RC \n", + "37897 On doit choisir nos députés pour ce qu'il on d... RC \n", + "37898 Tse quand le commis chez IGA est mieux habillé... RC \n", + "37899 Il y a un minimum d'éthique, elle manque de sa... RC \n", + "37900 Débat inutile! Elle a été élu pour ces idées! RC \n", + "37901 Il y a un code vestimentaire à avoir au parlement RC \n", + "37902 je crois qu'il y a des sujets plus intéressant... RC \n", + "37903 👹👿👺👹👿 RC \n", + "37904 Ça lui va très bien en plus!!! RC \n", + "37905 Daryann Lacombe👀 RC \n", + "37906 son toupet yai tro kour RC \n", + "37907 Une enfant d'école dans une cours de grand ! RC \n", + "37908 La tempête dans un verre d'eau. Comment attire... RC \n", + "37909 Je perds mon temps à commenter...blablablablab... RC \n", + "37910 Bravo, Madame. RC \n", + "37911 Quand c’est ton vêtement qui attire l’attentio... RC \n", + "37912 Elle pourrait exprimer sa créativité dans ses ... RC \n", + "37913 « Ton vêtement parle si fort que je n’entend p... 
RC \n", + "\n", + " post_id \\\n", + "0 5dc7ac7f359e2-10157143278136339 \n", + "1 5dc7ac7f359e2-10157143278136339 \n", + "2 5dc7ac7f359e2-10157143278136339 \n", + "3 5dc7ac7f359e2-10157143278136339 \n", + "4 5dc7ac7f359e2-10157143278136339 \n", + "5 5dc7ac7f359e2-10157143278136339 \n", + "6 5dc7ac7f359e2-10157143278136339 \n", + "7 5dc7ac7f359e2-10157143278136339 \n", + "8 5dc7ac7f359e2-10157143278136339 \n", + "9 5dc7ac7f359e2-10157143278136339 \n", + "10 5dc7ac7f359e2-10157143278136339 \n", + "11 5dc7ac7f359e2-10157143278136339 \n", + "12 5dc7ac7f359e2-10157143278136339 \n", + "13 5dc7ac7f359e2-10157143278136339 \n", + "14 5dc7ac7f359e2-10157143278136339 \n", + "15 5dc7ac7f359e2-10157143278136339 \n", + "16 5dc7ac7f359e2-10157143278136339 \n", + "17 5dc7ac7f359e2-10157143278136339 \n", + "18 5dc7ac7f359e2-10157143278136339 \n", + "19 5dc7ac7f359e2-10157143278136339 \n", + "20 5dc7ac7f359e2-10157143278136339 \n", + "21 5dc7ac7f359e2-10157143278136339 \n", + "22 5dc7ac7f359e2-10157143278136339 \n", + "23 5dc7ac7f359e2-10157143278136339 \n", + "24 5dc7ac7f359e2-10157143278136339 \n", + "25 5dc7ac7f359e2-10157143278136339 \n", + "26 5dc7ac7f359e2-10157143278136339 \n", + "27 5dc7ac7f359e2-10157143278136339 \n", + "28 5dc7ac7f359e2-10157143278136339 \n", + "29 5dc7ac7f359e2-10157143278136339 \n", + "... ... 
\n", + "37884 5dc64663dc269-2894961790548809 \n", + "37885 5dc64663dc269-2894961790548809 \n", + "37886 5dc64663dc269-2894961790548809 \n", + "37887 5dc64663dc269-2894961790548809 \n", + "37888 5dc64663dc269-2894961790548809 \n", + "37889 5dc64663dc269-2894961790548809 \n", + "37890 5dc64663dc269-2894961790548809 \n", + "37891 5dc64663dc269-2894961790548809 \n", + "37892 5dc64663dc269-2894961790548809 \n", + "37893 5dc64663dc269-2894961790548809 \n", + "37894 5dc64663dc269-2894961790548809 \n", + "37895 5dc64663dc269-2894961790548809 \n", + "37896 5dc64663dc269-2894961790548809 \n", + "37897 5dc64663dc269-2894961790548809 \n", + "37898 5dc64663dc269-2894961790548809 \n", + "37899 5dc64663dc269-2894961790548809 \n", + "37900 5dc64663dc269-2894961790548809 \n", + "37901 5dc64663dc269-2894961790548809 \n", + "37902 5dc64663dc269-2894961790548809 \n", + "37903 5dc64663dc269-2894961790548809 \n", + "37904 5dc64663dc269-2894961790548809 \n", + "37905 5dc64663dc269-2894961790548809 \n", + "37906 5dc64663dc269-2894961790548809 \n", + "37907 5dc64663dc269-2894961790548809 \n", + "37908 5dc64663dc269-2894961790548809 \n", + "37909 5dc64663dc269-2894961790548809 \n", + "37910 5dc64663dc269-2894961790548809 \n", + "37911 5dc64663dc269-2894961790548809 \n", + "37912 5dc64663dc269-2894961790548809 \n", + "37913 5dc64663dc269-2894961790548809 \n", + "\n", + " list_names auteurs_referes \\\n", + "0 [Ycf Bullit] [] \n", + "1 [Steph Alcazar] [] \n", + "2 [Töm Müstäine] [] \n", + "3 [Vasanth Toure, Pierre Crouzet] ['Vasanth Toure'] \n", + "4 [Vasanth Toure, Pierre Crouzet] ['Pierre Crouzet'] \n", + "5 [Vasanth Toure, Pierre Crouzet] ['Vasanth Toure'] \n", + "6 [Stéphane Pirnaci] [] \n", + "7 [Adil Bennani] [] \n", + "8 [Hadrien De Cournon] [] \n", + "9 [Marwa Larose] [] \n", + "10 [Luca Spada] [] \n", + "11 [Louis Rey] [] \n", + "12 [Mariam Aurelie Koné] [] \n", + "13 [Cedric Cmn] [] \n", + "14 [Olivia Fuentes] [] \n", + "15 [Marie Madeleine] [] \n", + "16 [Yohann Lévêque] [] \n", + 
"17 [Martin Trichet, Pierre Trichet] [] \n", + "18 [Martin Trichet, Pierre Trichet] ['Pierre Trichet'] \n", + "19 [Moumou Soussi] [] \n", + "20 [Jean-Marie Robini, Adelaide AF, Gen Lys, Phil... [] \n", + "21 [Jean-Marie Robini, Adelaide AF, Gen Lys, Phil... [] \n", + "22 [Jean-Marie Robini, Adelaide AF, Gen Lys, Phil... ['Yann Gilles'] \n", + "23 [Jean-Marie Robini, Adelaide AF, Gen Lys, Phil... [] \n", + "24 [Jean-Marie Robini, Adelaide AF, Gen Lys, Phil... [] \n", + "25 [Jean-Marie Robini, Adelaide AF, Gen Lys, Phil... ['Yann Gilles'] \n", + "26 [Jean-Marie Robini, Adelaide AF, Gen Lys, Phil... ['Gen Lys'] \n", + "27 [Jean-Marie Robini, Adelaide AF, Gen Lys, Phil... [] \n", + "28 [Jean-Marie Robini, Adelaide AF, Gen Lys, Phil... ['Jackie Petit'] \n", + "29 [Jean-Marie Robini, Adelaide AF, Gen Lys, Phil... [] \n", + "... ... ... \n", + "37884 [Lise Curcuma] [] \n", + "37885 [Denis Tremblay] [] \n", + "37886 [Luc Pellerin, Martin Plourde] [] \n", + "37887 [Luc Pellerin, Martin Plourde] ['Luc Pellerin'] \n", + "37888 [Philippe Philippe] [] \n", + "37889 [Monique C. 
Levasseur] [] \n", + "37890 [Loulou Boudreault-ethier] [] \n", + "37891 [Johanne Richard] [] \n", + "37892 [Lynda Hébert] [] \n", + "37893 [Gaston Pelletier] [] \n", + "37894 [Norm Gilbert] [] \n", + "37895 [France Martel Patrie] [] \n", + "37896 [Susan Rooke] [] \n", + "37897 [Francois Berube] [] \n", + "37898 [Jonathan Brisson] [] \n", + "37899 [Karlos Bell] [] \n", + "37900 [Yanick Desrosiers] [] \n", + "37901 [Marielle Cormier] [] \n", + "37902 [Marie-Claire Dorais] [] \n", + "37903 [Yanvon Dubuc] [] \n", + "37904 [Ahmed Taalbi] [] \n", + "37905 [Mathilde Simard] [] \n", + "37906 [Max Peterson] [] \n", + "37907 [Philip Tristram] [] \n", + "37908 [Josée Caron] [] \n", + "37909 [Etienne Bordeleau] [] \n", + "37910 [Josée Mathieu] [] \n", + "37911 [Madeleine Blais] [] \n", + "37912 [Geneviève Raymond] [] \n", + "37913 [Madeleine Blais] [] \n", + "\n", + " comment_tok_demojize \n", + "0 [C'est, une, blague, mdr, :rolling_on_the_floo... \n", + "1 [La, seule, question, c'est, de, savoir, s'il,... \n", + "2 [Romain, Debrigode, l, info, du, jour, qui, fa... \n", + "3 [:smiling_face_with_heart-eyes:] \n", + "4 [Paris, n'est, pas, prêt, encore, ...] \n", + "5 [le, prochain, c, ’, est, Adrien, Rabiot] \n", + "6 [Mdr] \n", + "7 [moi, je, propose, mamadou, sissoko] \n", + "8 [Louis, Prt, Corentin, Corman, Victor, Mdv, ah... \n", + "9 [Marier, le, foot, à, la, mairie, est, génial] \n", + "10 [Benoît, Zivanovic] \n", + "11 [Eugénie, Rey, avec, Simonet, !, !, !] \n", + "12 [Moi, aussi, je, candidate, ras, le, bol, la, ... \n", + "13 [Ah, bah, vu, qu, ’, il, a, déclaré, y, ’, a, ... \n", + "14 [Catheline, Lr, Victoire, Bailly, Hannah, Jenn... \n", + "15 [:face_vomiting:, :face_vomiting:, :thumbs_dow... \n", + "16 [CharlesDuquesne, ,, il, a, pris, trop, de, ba... \n", + "17 [Martin, tu, sais, pour, qui, tu, vas, voter, ?] \n", + "18 [j'avais, déjà, vu, ,, mon, bulletin, est, prê... \n", + "19 [Il, va, jouer, le, loto, avec, l'argent, de, ... 
\n", + "20 [En, espérant, qu, ’, il, fasse, une, meilleur... \n", + "21 [En, clubChampion, de, France, en, 2003, et, e... \n", + "22 [C'est, pas, pour, ça, qu'il, seras, un, bon, ... \n", + "23 [Mais, où, sont, les, compètences, d'un, footb... \n", + "24 [son, palmarès, n'a, rien, à, voir, avec, les,... \n", + "25 [C'est, vrai, que, comme, cireur, de, bancs, i... \n", + "26 [c, ', est, un, type, extraordinaire, ..., lis... \n", + "27 [Jacques, Poulain, d, ’, accord, Jacques, :sli... \n", + "28 [:face_with_tears_of_joy:, :face_with_tears_of... \n", + "29 [Jacques, Poulain, :face_with_tears_of_joy:, :... \n", + "... ... \n", + "37884 [Elle, adore, provoquer, ,, c'est, son, troubl... \n", + "37885 [Quand, on, s'accroche, à, ses, années, étudia... \n", + "37886 [Dire, qu'avant, le, gros, bon, sens, régnait,... \n", + "37887 [faut, sauver, les, apparences, qu'ils, disent... \n", + "37888 [Au, lieu, de, décrire, ce, qui, est, acceptab... \n", + "37889 [Toujours, pour, ce, faire, remarquer, et, par... \n", + "37890 [Je, crois, plutôt, que, l'habit, cravate, don... \n", + "37891 [Malheureusement, avoir, de, la, classe, n, ’,... \n", + "37892 [Ce, n'est, pas, l'étiquette, vestimentaire, q... \n", + "37893 [123, soldats, marchent, ., 122, au, pas, et, ... \n", + "37894 [Au, Québec, on, parle, vraiment, des, vrais, ... \n", + "37895 [La, je, peut, dire, que, ta, raison] \n", + "37896 [Elle, m, ’, énerve, cette, femme, qui, manque... \n", + "37897 [On, doit, choisir, nos, députés, pour, ce, qu... \n", + "37898 [Tse, quand, le, commis, chez, IGA, est, mieux... \n", + "37899 [Il, y, a, un, minimum, d'éthique, ,, elle, ma... \n", + "37900 [Débat, inutile, !, Elle, a, été, élu, pour, c... \n", + "37901 [Il, y, a, un, code, vestimentaire, à, avoir, ... \n", + "37902 [je, crois, qu'il, y, a, des, sujets, plus, in... \n", + "37903 [:ogre:, :angry_face_with_horns:, :goblin:, :o... \n", + "37904 [Ça, lui, va, très, bien, en, plus, !, !, !] 
\n", + "37905 [Daryann, Lacombe, :eyes:] \n", + "37906 [son, toupet, yai, tro, kour] \n", + "37907 [Une, enfant, d'école, dans, une, cours, de, g... \n", + "37908 [La, tempête, dans, un, verre, d'eau, ., Comme... \n", + "37909 [Je, perds, mon, temps, à, commenter, ..., bla... \n", + "37910 [Bravo, ,, Madame, .] \n", + "37911 [Quand, c, ’, est, ton, vêtement, qui, attire,... \n", + "37912 [Elle, pourrait, exprimer, sa, créativité, dan... \n", + "37913 [«, Ton, vêtement, parle, si, fort, que, je, n... \n", + "\n", + "[37914 rows x 12 columns]" + ] + }, + "execution_count": 168, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commentaires_df_names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/commentaires.ipynb b/commentaires.ipynb new file mode 100644 index 0000000..bd63692 --- /dev/null +++ b/commentaires.ipynb @@ -0,0 +1,88 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import parsing_functions as pf\n", + "import re\n", + "import pandas as pd\n", + "import time" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "listOfFiles = pf.getListOfFiles(\"data\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "commentaires = []\n", + "\n", + "for xlpath in listOfFiles:\n", + " comments_df = []\n", + " media, post_id = 
re.match(r\"data/([A-Z]+)/comments([0-9a-z\\-]+)\\.xlsx\",xlpath).groups()\n", + " comments_df = pf.get_comments(xlpath)\n", + " comments_df['media']=media\n", + " comments_df['post_id']=post_id\n", + " commentaires.append(comments_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "commentaires_df = pd.concat(commentaires, ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "commentaires_df.to_csv(\"refined_data/commentaires_df.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/commentaires_reseaux_sociaux.mm b/commentaires_reseaux_sociaux.mm new file mode 100644 index 0000000..43cf601 --- /dev/null +++ b/commentaires_reseaux_sociaux.mm @@ -0,0 +1,261 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/parsing_functions.py b/parsing_functions.py new file mode 100644 index 0000000..e149e5c --- /dev/null +++ b/parsing_functions.py @@ -0,0 +1,57 @@ 
+import os +import re +import pandas as pd +import requests +from urllib.parse import unquote +import newspaper + +def get_comments(file_path): + df = pd.read_excel(file_path, skiprows=5, + names=['comment_id', + 'nested_id', + 'name', + 'id', + 'date', + 'likes', + 'comment', + 'source']) + df["comment_id"] = df["comment_id"].mask(pd.isnull,df["nested_id"].mask(pd.isnull,"0-0").apply(lambda x: re.match(r"([0-9]+)\-([0-9]+)",x).group(1))) + df["nested_id"] = df["nested_id"].mask(pd.isnull,"0-0").apply(lambda x: re.match(r"([0-9]+)\-([0-9]+)",x).group(2)) + del df["source"] + return df + +def get_text_article(file_path): + url = pd.read_excel(file_path,skiprows=1,nrows=1,header=None,names=['source','url']+['']*6)['url'][0] + request_url = requests.get(url) + html_content = str(request_url.content) + link_urls = re.findall(r'http[s]?://l\.facebook(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',html_content) + u_link_urls = [unquote(unquote(u.replace("https://l.facebook.com/l.php?u=",""))) for u in link_urls] + article = newspaper.Article('') + try: + url_article = [re.search("(.*)&.*",u).group(1) for u in u_link_urls][-3] + request_url_article = requests.get(url_article) + article.set_html(request_url_article.content) + except: + article.set_html(html_content) + try: + article.parse() + text_article = article.text.replace("\n\n"," ") + except: + text_article = "" + return text_article + +def getListOfFiles(dirName): + # create a list of file and sub directories + # names in the given directory + listOfFile = os.listdir(dirName) + allFiles = list() + # Iterate over all the entries + for entry in listOfFile: + # Create full path + fullPath = os.path.join(dirName, entry) + # If entry is a directory then get the list of files in this directory + if os.path.isdir(fullPath): + allFiles = allFiles + getListOfFiles(fullPath) + else: + allFiles.append(fullPath) + return allFiles \ No newline at end of file diff --git a/textes_articles.ipynb 
b/textes_articles.ipynb new file mode 100644 index 0000000..cd1c3c1 --- /dev/null +++ b/textes_articles.ipynb @@ -0,0 +1,113 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import parsing_functions as pf\n", + "import re\n", + "import pandas as pd\n", + "import time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "textes_articles = []\n", + "for xlpath in listOfFiles:\n", + " time.sleep(3)\n", + " media, post_id = re.match(r\"data/([A-Z]+)/comments([0-9a-z\\-]+)\\.xlsx\",xlpath).groups()\n", + " textes_articles.append([media,post_id,pf.get_text_article(xlpath)])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'textes_articles' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtextes_articles_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtextes_articles\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'media'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'post_id'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'text'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mNameError\u001b[0m: name 'textes_articles' is not defined" + ] + } + ], + "source": [ + "textes_articles_df = pd.DataFrame(textes_articles, columns=['media','post_id','text'])" + ] + }, + { + "cell_type": 
"code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'textes_articles_df' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtextes_articles_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"textes_articles_df.csv\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mNameError\u001b[0m: name 'textes_articles_df' is not defined" + ] + } + ], + "source": [ + "textes_articles_df.to_csv(\"textes_articles_df.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "textes_articles_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}