{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import toktok, sent_tokenize\n",
    "from nltk.parse import CoreNLPParser\n",
    "import re\n",
    "import pickle\n",
    "import emoji\n",
    "import pretraitement as pr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "tok = toktok.ToktokTokenizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')\n",
    "#https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "textes_articles_df = pd.read_csv(\"refined_data/textes_articles_df.csv\")\n",
    "textes_articles_df = textes_articles_df[textes_articles_df[\"text\"].notnull() & (textes_articles_df[\"media\"]!='CNN')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [],
   "source": [
    "del textes_articles_df['Unnamed: 0']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "article_pretraite = [pr.pretraitement(x,tok,ner_tagger,pos_tagger) for x in list(textes_articles_df[\"text\"])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [],
   "source": [
    "textes_articles_df['ner_dict']=[pr.aggreger_ner_tags(article) for article in article_pretraite]\n",
    "textes_articles_df['pos_dict']=[pr.aggreger_pos_tags(article) for article in article_pretraite]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {},
   "outputs": [],
   "source": [
    "f = open(\"pickle/textes_articles_df.pickle\",\"wb\")\n",
    "pickle.dump(textes_articles_df,f)\n",
    "f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}