189 lines
4.8 KiB
Text
189 lines
4.8 KiB
Text
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 153,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import pandas as pd\n",
|
||
|
"import numpy as np\n",
|
||
|
"from nltk.corpus import stopwords\n",
|
||
|
"from nltk.tokenize import toktok, sent_tokenize\n",
|
||
|
"from nltk.parse import CoreNLPParser\n",
|
||
|
"import re\n",
|
||
|
"import pickle"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 2,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
# Word-level tokenizer shared by the preprocessing step.
tok = toktok.ToktokTokenizer()
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 3,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
# Part-of-speech tagger served by the CoreNLP server at the URL below.
pos_tagger = CoreNLPParser(
    url='http://localhost:9000',
    tagtype='pos',
)
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 4,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
# Named-entity tagger served by the same CoreNLP server.
# Reference: https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK
ner_tagger = CoreNLPParser(
    url='http://localhost:9000',
    tagtype='ner',
)
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 6,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
# Load the articles, keep only rows that have a text and whose media is not 'CNN'.
# The explicit .copy() makes the filtered frame independent of the frame that was
# read, so adding columns to it later does not trigger SettingWithCopyWarning.
textes_articles_df = (
    pd.read_csv("refined_data/textes_articles_df.csv")
    .loc[lambda df: df["text"].notnull() & (df["media"] != 'CNN')]
    .copy()
)
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 163,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
# Remove the index column written by a previous to_csv call.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
textes_articles_df = textes_articles_df.drop(columns=['Unnamed: 0'], errors='ignore')
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 19,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
# Preprocessing
def pretraitement(article):
    """Tag every sentence of an article with the NER and POS taggers.

    The article is split into sentences with ``sent_tokenize``; each sentence
    is tokenized with ``tok`` and the same tokens are sent to ``ner_tagger``
    and ``pos_tagger``.

    Parameters
    ----------
    article : str
        Text of one article.

    Returns
    -------
    tuple
        ``(article_ner_tokens, article_pos_tokens)``. Both are lists with one
        entry per sentence, in sentence order:

        * ``article_ner_tokens[i]`` holds the items returned by ``ner_tagger``
          whose element ``[1]`` is not ``'O'``;
        * ``article_pos_tokens[i]`` holds the items returned by ``pos_tagger``
          whose element ``[1]`` is one of ADJ, ADV, INTJ, NOUN, PROPN, VERB.
    """
    # Tags to keep; built once per call rather than once per token.
    # NOTE(review): these look like Universal POS labels - confirm that the
    # model loaded by the CoreNLP server emits this tag set.
    kept_pos_tags = frozenset(('ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB'))

    article_ner_tokens = []
    article_pos_tokens = []
    # Split into sentences
    for sentence in sent_tokenize(article):
        # Tokenize
        sentence_tokens = tok.tokenize(sentence)
        # Keep only the tokens that carry a named-entity label
        sentence_ner = ner_tagger.tag(sentence_tokens)
        ner_tokens = [ner_token for ner_token in sentence_ner if ner_token[1] != 'O']
        # Drop closed-class words based on their POS tag
        sentence_pos = pos_tagger.tag(sentence_tokens)
        pos_tokens = [pos_token for pos_token in sentence_pos if pos_token[1] in kept_pos_tags]
        # Append to the per-sentence result lists
        article_ner_tokens.append(ner_tokens)
        article_pos_tokens.append(pos_tokens)
    return article_ner_tokens, article_pos_tokens
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 21,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
# Run the preprocessing on every article text, in row order.
article_pretraite = list(map(pretraitement, list(textes_articles_df["text"])))
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 131,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
def aggreger_ner_tags(article):
    """Count the occurrences of each named-entity item in a preprocessed article.

    Parameters
    ----------
    article : sequence
        ``article[0]`` must be an iterable of sentences, each sentence being an
        iterable of hashable items.

    Returns
    -------
    dict
        Maps each distinct item to the number of times it appears across all
        sentences. Keys are in order of first appearance.
    """
    from collections import Counter
    from itertools import chain

    # Flatten the per-sentence lists and let Counter do the tallying;
    # converting back to dict keeps the return type a plain dict.
    return dict(Counter(chain.from_iterable(article[0])))
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 132,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
def aggreger_pos_tags(article):
    """Count the occurrences of each POS-tagged item in a preprocessed article.

    Parameters
    ----------
    article : sequence
        ``article[1]`` must be an iterable of sentences, each sentence being an
        iterable of hashable items.

    Returns
    -------
    dict
        Maps each distinct item to the number of times it appears across all
        sentences. Keys are in order of first appearance.
    """
    from collections import Counter
    from itertools import chain

    # Flatten the per-sentence lists and let Counter do the tallying;
    # converting back to dict keeps the return type a plain dict.
    return dict(Counter(chain.from_iterable(article[1])))
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": []
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 165,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
# Attach the per-article tag counts as two new columns.
textes_articles_df['ner_dict'] = list(map(aggreger_ner_tags, article_pretraite))
textes_articles_df['pos_dict'] = list(map(aggreger_pos_tags, article_pretraite))
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 167,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
# Persist the enriched DataFrame. The context manager closes the file even when
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
with open("pickle/textes_articles_df.pickle", "wb") as f:
    pickle.dump(textes_articles_df, f)
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": []
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.7.3"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 4
|
||
|
}
|