# Preprocessing pipeline: sentence-tokenize each article, keep its named
# entities and open-class (content-word) POS tokens, aggregate per-article
# counts, and pickle the enriched DataFrame.
import collections
import pickle
import re

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.parse import CoreNLPParser
from nltk.tokenize import sent_tokenize, toktok

tok = toktok.ToktokTokenizer()

# Both taggers talk to a local Stanford CoreNLP server, which must be
# started beforehand:
# https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK
pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')

# Load the articles; drop rows without text and exclude CNN.
textes_articles_df = pd.read_csv("refined_data/textes_articles_df.csv")
textes_articles_df = textes_articles_df[
    textes_articles_df["text"].notnull()
    & (textes_articles_df["media"] != 'CNN')
]
# 'Unnamed: 0' is a stale index column written by an earlier to_csv()
# without index=False; drop it explicitly.
textes_articles_df = textes_articles_df.drop(columns=['Unnamed: 0'])

# Open-class tags to keep. The original filter used only the Universal
# Dependencies tagset (ADJ, NOUN, ...), but CoreNLP's default English model
# emits Penn Treebank tags (NN, JJ, VB, ...), in which case the UD list
# matches nothing. Accept both tagsets so the filter works either way.
# NOTE(review): confirm which model the CoreNLP server actually runs.
_UNIVERSAL_OPEN_CLASS = {'ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB'}
_PTB_OPEN_CLASS_PREFIXES = ('JJ', 'RB', 'NN', 'VB', 'UH')


def _est_classe_ouverte(tag):
    """Return True if `tag` denotes an open (content) word class."""
    return tag in _UNIVERSAL_OPEN_CLASS or tag.startswith(_PTB_OPEN_CLASS_PREFIXES)


def pretraitement(article):
    """Tokenize and tag one article, sentence by sentence.

    Parameters
    ----------
    article : str
        Raw article text.

    Returns
    -------
    tuple(list, list)
        Two parallel per-sentence lists of (token, tag) pairs:
        first the named-entity tokens (NER tag != 'O'), then the
        open-class POS tokens (closed classes filtered out).
    """
    article_ner_tokens = []
    article_pos_tokens = []
    for sentence in sent_tokenize(article):
        sentence_tokens = tok.tokenize(sentence)
        # Keep only tokens that belong to a named entity ('O' = outside).
        ner_tokens = [t for t in ner_tagger.tag(sentence_tokens) if t[1] != 'O']
        # Drop closed-class words (determiners, pronouns, ...) via POS.
        pos_tokens = [t for t in pos_tagger.tag(sentence_tokens)
                      if _est_classe_ouverte(t[1])]
        article_ner_tokens.append(ner_tokens)
        article_pos_tokens.append(pos_tokens)
    return article_ner_tokens, article_pos_tokens


article_pretraite = [pretraitement(x) for x in list(textes_articles_df["text"])]


def aggreger_ner_tags(article):
    """Count occurrences of each (token, ner_tag) pair across all sentences.

    `article` is a (ner_sentences, pos_sentences) pair as returned by
    `pretraitement`; returns a plain dict mapping pair -> count.
    """
    return dict(collections.Counter(
        entity for sentence in article[0] for entity in sentence))


def aggreger_pos_tags(article):
    """Count occurrences of each (token, pos_tag) pair across all sentences.

    `article` is a (ner_sentences, pos_sentences) pair as returned by
    `pretraitement`; returns a plain dict mapping pair -> count.
    """
    return dict(collections.Counter(
        pos for sentence in article[1] for pos in sentence))


textes_articles_df['ner_dict'] = [aggreger_ner_tags(a) for a in article_pretraite]
textes_articles_df['pos_dict'] = [aggreger_pos_tags(a) for a in article_pretraite]

# Context manager guarantees the handle is closed even if dump() raises.
with open("pickle/textes_articles_df.pickle", "wb") as f:
    pickle.dump(textes_articles_df, f)
"mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }