nlp_a2019_tp3/Traitement Articles.ipynb

127 lines
2.8 KiB
Text
Raw Normal View History

2019-12-16 01:31:38 +00:00
{
"cells": [
{
"cell_type": "code",
2019-12-16 23:25:47 +00:00
"execution_count": null,
2019-12-16 01:31:38 +00:00
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import toktok, sent_tokenize\n",
"from nltk.parse import CoreNLPParser\n",
"import re\n",
2019-12-16 23:25:47 +00:00
"import pickle\n",
"import emoji\n",
"import pretraitement as pr"
2019-12-16 01:31:38 +00:00
]
},
{
"cell_type": "code",
2019-12-19 05:25:23 +00:00
"execution_count": null,
2019-12-16 01:31:38 +00:00
"metadata": {},
"outputs": [],
"source": [
"# Single tokenizer instance, handed to pr.pretraitement for every article.\n",
"tok = toktok.ToktokTokenizer()"
]
},
{
"cell_type": "code",
2019-12-19 05:25:23 +00:00
"execution_count": null,
2019-12-16 01:31:38 +00:00
"metadata": {},
"outputs": [],
"source": [
"# POS tagger client for the CoreNLP server expected at localhost:9000.\n",
"pos_tagger = CoreNLPParser(\n",
"    tagtype='pos',\n",
"    url='http://localhost:9000',\n",
")"
]
},
{
"cell_type": "code",
2019-12-19 05:25:23 +00:00
"execution_count": null,
2019-12-16 01:31:38 +00:00
"metadata": {},
"outputs": [],
"source": [
"# NER tagger client for the CoreNLP server expected at localhost:9000; reference:\n",
"# https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK\n",
"ner_tagger = CoreNLPParser(\n",
"    tagtype='ner',\n",
"    url='http://localhost:9000',\n",
")"
]
},
{
"cell_type": "code",
2019-12-19 05:25:23 +00:00
"execution_count": null,
2019-12-16 01:31:38 +00:00
"metadata": {},
"outputs": [],
"source": [
"# Load the article table, then keep only rows that have a text and whose media is not CNN.\n",
"textes_articles_df = pd.read_csv(\"refined_data/textes_articles_df.csv\")\n",
"textes_articles_df = textes_articles_df[\n",
"    textes_articles_df[\"text\"].notna() & textes_articles_df[\"media\"].ne('CNN')\n",
"]"
]
},
{
"cell_type": "code",
2019-12-19 05:25:23 +00:00
"execution_count": null,
2019-12-16 01:31:38 +00:00
"metadata": {},
"outputs": [],
"source": [
"# 'Unnamed: 0' is presumably the index column written by an earlier to_csv; it is not used below.\n",
"# errors='ignore' keeps this cell re-runnable: a second run no longer raises KeyError.\n",
"textes_articles_df = textes_articles_df.drop(columns=['Unnamed: 0'], errors='ignore')"
]
},
{
"cell_type": "code",
2019-12-19 05:25:23 +00:00
"execution_count": null,
2019-12-16 01:31:38 +00:00
"metadata": {},
"outputs": [],
"source": [
2019-12-16 23:25:47 +00:00
"# Run the project's preprocessing on every article text, reusing the tokenizer and both taggers.\n",
"article_pretraite = [\n",
"    pr.pretraitement(texte, tok, ner_tagger, pos_tagger)\n",
"    for texte in textes_articles_df[\"text\"]\n",
"]"
2019-12-16 01:31:38 +00:00
]
},
{
"cell_type": "code",
2019-12-19 05:25:23 +00:00
"execution_count": null,
2019-12-16 01:31:38 +00:00
"metadata": {},
"outputs": [],
"source": [
2019-12-16 23:25:47 +00:00
"# Store, for each preprocessed article, the outputs of aggreger_ner_tags and aggreger_pos_tags as new columns.\n",
"textes_articles_df['ner_dict'] = list(map(pr.aggreger_ner_tags, article_pretraite))\n",
"textes_articles_df['pos_dict'] = list(map(pr.aggreger_pos_tags, article_pretraite))"
2019-12-16 01:31:38 +00:00
]
},
{
"cell_type": "code",
2019-12-19 05:25:23 +00:00
"execution_count": null,
2019-12-16 01:31:38 +00:00
"metadata": {},
"outputs": [],
"source": [
"# Persist the enriched DataFrame; the with-block closes the file even if pickle.dump raises.\n",
"with open(\"pickle/textes_articles_df.pickle\", \"wb\") as f:\n",
"    pickle.dump(textes_articles_df, f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}