303 lines
7 KiB
Text
303 lines
7 KiB
Text
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Fetch the synsets\n",
"# Morphological variants\n",
"# Remove the names of the other commenters\n",
"# Handle emoticons"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"from nltk.corpus import stopwords\n",
|
|
"from nltk.tokenize import TweetTokenizer\n",
|
|
"from nltk.parse import CoreNLPParser\n",
|
|
"import re\n",
|
|
"import pickle\n",
|
|
"import emoji\n",
|
|
"import pretraitement as pr"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Tokenisation des commentaires\n",
|
|
"\n",
|
|
"Utilisation du TweetTokenizer, car il est davantage adapté au contenu des utilisateurs sur les médias sociaux"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Création de l'objet Tokenizer\n",
|
|
"tok = TweetTokenizer(preserve_case=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"commentaires_df = pd.read_csv(\"refined_data/commentaires_df.csv\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"commentaires_df.head(10)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Drop the unused first column ('Unnamed: 0'). errors='ignore' keeps this cell\n",
"# re-runnable: a plain `del` raises KeyError once the column is already gone.\n",
"commentaires_df = commentaires_df.drop(columns=['Unnamed: 0'], errors='ignore')"
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Noms des auteurs\n",
|
|
"\n",
|
|
"Extraction du nom des auteurs pour chaque commentaire et ses sous-commentaires"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"names_df = pd.DataFrame(commentaires_df.groupby(['post_id','comment_id'])['name'], columns=['post_comment','list_names'])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Collapse each group's names into a list without duplicates\n",
"names_df['list_names'] = names_df['list_names'].map(lambda noms: list(set(noms)))"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Split the two-part key stored in 'post_comment' into separate columns\n",
"cles = names_df['post_comment']\n",
"names_df['post_id'] = cles.map(lambda cle: cle[0])\n",
"names_df['comment_id'] = cles.map(lambda cle: cle[1])"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Drop the 'post_comment' key column. errors='ignore' keeps this cell re-runnable:\n",
"# a plain `del` raises KeyError once the column is already gone.\n",
"names_df = names_df.drop(columns=['post_comment'], errors='ignore')"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"names_df.head(10)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Traitement du nom des auteurs dans les textes des commentaires"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"commentaires_df_names = commentaires_df.merge(names_df)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"def _valid_names(names):\n",
"    \"\"\"Return the non-empty string entries of `names` as a list.\n",
"\n",
"    Non-string entries (e.g. NaN or None) and empty strings are dropped.\n",
"    If `names` is not iterable, an empty list is returned.\n",
"    \"\"\"\n",
"    try:\n",
"        return [name for name in names if isinstance(name, str) and name]\n",
"    except TypeError:\n",
"        return []\n",
"\n",
"\n",
"def list_auteurs_referes(comment, names):\n",
"    \"\"\"Return the names from `names` that occur as substrings of `comment`.\n",
"\n",
"    Args:\n",
"        comment: Text to search. A non-string value (e.g. NaN) yields an empty list.\n",
"        names: Iterable of candidate names; non-string or empty entries are ignored.\n",
"\n",
"    Returns:\n",
"        A sorted list of the distinct names found. Sorting makes the result\n",
"        identical from one run to the next, unlike the order of a plain set.\n",
"    \"\"\"\n",
"    if not isinstance(comment, str):\n",
"        return []\n",
"    return sorted({name for name in _valid_names(names) if name in comment})\n",
"\n",
"\n",
"def remove_names(comment, names):\n",
"    \"\"\"Return `comment` with every occurrence of each name in `names` removed.\n",
"\n",
"    Args:\n",
"        comment: Text to clean. A non-string value (e.g. NaN) is returned unchanged.\n",
"        names: Iterable of names to strip; non-string or empty entries are ignored.\n",
"\n",
"    Returns:\n",
"        The comment with the names replaced by the empty string.\n",
"    \"\"\"\n",
"    if not isinstance(comment, str):\n",
"        return comment\n",
"    # Remove the longest names first: otherwise a short name contained in a longer\n",
"    # one (\"Jean\" inside \"Jean Tremblay\") is stripped first and leaves a fragment.\n",
"    for name in sorted(_valid_names(names), key=len, reverse=True):\n",
"        comment = comment.replace(name, '')\n",
"    return comment"
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Nettoyage des commentaires et traitement des émoticones"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# For each row, store (as a string) the names from 'list_names' found in the comment\n",
"commentaires_df_names['auteurs_referes'] = [\n",
"    str(list_auteurs_referes(texte, noms))\n",
"    for texte, noms in zip(commentaires_df_names['comment'], commentaires_df_names['list_names'])\n",
"]"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# For each row, store the comment text with the names from 'list_names' stripped out\n",
"commentaires_df_names['comment_clean'] = [\n",
"    str(remove_names(texte, noms))\n",
"    for texte, noms in zip(commentaires_df_names['comment'], commentaires_df_names['list_names'])\n",
"]"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"commentaires_df_names.head(10)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Preprocess each cleaned comment, printing its position and text as progress output.\n",
"# enumerate() supplies the position instead of a hand-maintained counter.\n",
"commentaires_pretraite = []\n",
"for compteur, texte in enumerate(commentaires_df_names[\"comment_clean\"]):\n",
"    print(str(compteur) + \": \" + texte)\n",
"    commentaires_pretraite.append(pr.pretraitement(texte, tok, ner_tagger, pos_tagger))"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"commentaires_pretraite"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Build one column per aggregation, each computed over the preprocessed comments\n",
"agregateurs = (\n",
"    ('ner_dict', pr.aggreger_ner_tags),\n",
"    ('pos_dict', pr.aggreger_pos_tags),\n",
"    ('emoji_dict', pr.aggreger_emoji),\n",
")\n",
"for colonne, agreger in agregateurs:\n",
"    commentaires_df_names[colonne] = [agreger(pretraite) for pretraite in commentaires_pretraite]"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Persist the enriched DataFrame. The context manager closes the file even if\n",
"# pickling raises, which the explicit open()/close() pair did not guarantee.\n",
"with open(\"pickle/commentaires_df.pickle\", \"wb\") as f:\n",
"    pickle.dump(commentaires_df_names, f)"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|