{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO: fetch the synsets\n",
    "# TODO: morphological variants\n",
    "# TODO: remove the names of the other commenters\n",
    "# TODO: handle emoticons"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import TweetTokenizer\n",
    "from nltk.parse import CoreNLPParser\n",
    "import re\n",
    "import pickle\n",
    "import emoji\n",
    "import pretraitement as pr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokenisation des commentaires\n",
    "\n",
    "Utilisation du TweetTokenizer, car il est davantage adapté au contenu des utilisateurs sur les médias sociaux"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the tokenizer; preserve_case=True leaves the original casing of the tokens untouched\n",
    "tok = TweetTokenizer(preserve_case=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Part-of-speech tagger; expects a CoreNLP server listening on localhost:9000\n",
    "pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Named-entity tagger; same CoreNLP server as the POS tagger\n",
    "ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "commentaires_df = pd.read_csv(\"refined_data/commentaires_df.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "commentaires_df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the unnamed first column, which is not used.\n",
    "# errors='ignore' keeps this cell re-runnable: a second execution no longer raises KeyError.\n",
    "commentaires_df = commentaires_df.drop(columns=['Unnamed: 0'], errors='ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Noms des auteurs\n",
    "\n",
    "Extraction du nom des auteurs pour chaque commentaire et ses sous-commentaires"
   ]
  },
  {
   "cell_type":
"code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per (post_id, comment_id) group: the group key and the 'name' values of that group\n",
    "names_df = pd.DataFrame(commentaires_df.groupby(['post_id','comment_id'])['name'], columns=['post_comment','list_names'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# De-duplicate the names of each group; dict.fromkeys keeps first-seen order,\n",
    "# so the result is the same on every run (set() ordering is not)\n",
    "names_df['list_names'] = names_df.apply(lambda x: list(dict.fromkeys(x['list_names'])), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the (post_id, comment_id) group key into two separate columns\n",
    "names_df['post_id'] = names_df.apply(lambda x: x['post_comment'][0], axis=1)\n",
    "names_df['comment_id'] = names_df.apply(lambda x: x['post_comment'][1], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# errors='ignore' keeps this cell re-runnable once the column is already gone\n",
    "names_df = names_df.drop(columns=['post_comment'], errors='ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "names_df.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Traitement du nom des auteurs dans les textes des commentaires"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach to every comment the list of author names of its thread (explicit join keys)\n",
    "commentaires_df_names = commentaires_df.merge(names_df, on=['post_id', 'comment_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _noms_valides(names):\n",
    "    \"\"\"Return the distinct, non-empty string entries of `names`, in first-seen order.\n",
    "\n",
    "    Non-string entries (e.g. NaN for a missing author name) and empty strings\n",
    "    are skipped; a non-iterable `names` yields an empty list.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        valides = [nom for nom in names if isinstance(nom, str) and nom]\n",
    "    except TypeError:  # `names` is not iterable (None, NaN, ...)\n",
    "        return []\n",
    "    return list(dict.fromkeys(valides))\n",
    "\n",
    "\n",
    "def list_auteurs_referes(comment,names):\n",
    "    \"\"\"List the names from `names` that occur as substrings of `comment`.\n",
    "\n",
    "    Args:\n",
    "        comment: text of the comment; anything that is not a `str`\n",
    "            (e.g. NaN for an empty comment) gives an empty list.\n",
    "        names: iterable of candidate author names.\n",
    "\n",
    "    Returns:\n",
    "        The matching names, without duplicates, in the order in which they\n",
    "        appear in `names`.\n",
    "    \"\"\"\n",
    "    if not isinstance(comment, str):\n",
    "        return []\n",
    "    return [nom for nom in _noms_valides(names) if nom in comment]\n",
    "\n",
    "\n",
    "def remove_names(comment,names):\n",
    "    \"\"\"Remove every occurrence of the given names from `comment`.\n",
    "\n",
    "    Longer names are removed first, so that a name contained in another one\n",
    "    ('Jean' in 'Jean Tremblay') cannot leave a fragment of the longer name behind.\n",
    "\n",
    "    Args:\n",
    "        comment: text of the comment.\n",
    "        names: iterable of author names to strip out.\n",
    "\n",
    "    Returns:\n",
    "        The comment with the names removed, or `comment` itself, unchanged,\n",
    "        when it is not a `str`.\n",
    "    \"\"\"\n",
    "    if not isinstance(comment, str):\n",
    "        return comment\n",
    "    for nom in sorted(_noms_valides(names), key=len, reverse=True):\n",
    "        comment = comment.replace(nom, '')\n",
    "    return comment"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Nettoyage des commentaires et traitement des émoticones"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
"commentaires_df_names['auteurs_referes'] = commentaires_df_names.apply(lambda x: str(list_auteurs_referes(x['comment'],x['list_names'])), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "commentaires_df_names['comment_clean'] = commentaires_df_names.apply(lambda x: str(remove_names(x['comment'],x['list_names'])), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "commentaires_df_names.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the project preprocessing on every cleaned comment.\n",
    "# The running index is printed so progress can be followed.\n",
    "commentaires_pretraite = []\n",
    "for compteur, x in enumerate(commentaires_df_names[\"comment_clean\"]):\n",
    "    print(f\"{compteur}: {x}\")\n",
    "    commentaires_pretraite.append(pr.pretraitement(x, tok, ner_tagger, pos_tagger))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show only a sample: displaying the full list would flood the notebook output\n",
    "commentaires_pretraite[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One aggregate per comment, in the same row order as commentaires_df_names\n",
    "commentaires_df_names['ner_dict']=[pr.aggreger_ner_tags(article) for article in commentaires_pretraite]\n",
    "commentaires_df_names['pos_dict']=[pr.aggreger_pos_tags(article) for article in commentaires_pretraite]\n",
    "commentaires_df_names['emoji_dict']=[pr.aggreger_emoji(article) for article in commentaires_pretraite]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The context manager closes the file even if pickle.dump raises\n",
    "with open(\"pickle/commentaires_df.pickle\", \"wb\") as f:\n",
    "    pickle.dump(commentaires_df_names, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer":
"ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }