{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Article preprocessing: NER and POS tags\n",
    "\n",
    "Loads the article texts, passes each one through `pr.pretraitement` together with a Toktok tokenizer and two CoreNLP taggers, stores the outputs of `pr.aggreger_ner_tags` / `pr.aggreger_pos_tags` as the columns `ner_dict` / `pos_dict`, and pickles the resulting DataFrame.\n",
    "\n",
    "**Prerequisite:** the taggers are configured to talk to a Stanford CoreNLP server at `CORENLP_URL` (see the configuration cell)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import re\n",
    "from pathlib import Path\n",
    "\n",
    "import emoji\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.parse import CoreNLPParser\n",
    "from nltk.tokenize import sent_tokenize, toktok\n",
    "\n",
    "import pretraitement as pr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: every value a reader might want to change lives here.\n",
    "CORENLP_URL = 'http://localhost:9000'\n",
    "DATA_PATH = Path(\"refined_data/textes_articles_df.csv\")\n",
    "PICKLE_PATH = Path(\"pickle/textes_articles_df.pickle\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokenizer and taggers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tok = toktok.ToktokTokenizer()\n",
    "\n",
    "# Both taggers point at the same CoreNLP endpoint and differ only in tagtype.\n",
    "# https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK\n",
    "pos_tagger = CoreNLPParser(url=CORENLP_URL, tagtype='pos')\n",
    "ner_tagger = CoreNLPParser(url=CORENLP_URL, tagtype='ner')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and filter the articles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load, filter and drop the stray 'Unnamed: 0' column in a single chain so that\n",
    "# re-running this cell always rebuilds the frame from the CSV (a standalone\n",
    "# `del` cell raised KeyError on its second run). `.drop` also returns a fresh\n",
    "# frame, so adding columns later cannot trigger SettingWithCopyWarning.\n",
    "textes_articles_df = (\n",
    "    pd.read_csv(DATA_PATH)\n",
    "    # Keep rows that have a text and whose media is not 'CNN'.\n",
    "    .loc[lambda df: df[\"text\"].notnull() & (df[\"media\"] != 'CNN')]\n",
    "    .drop(columns=['Unnamed: 0'])\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preprocess and tag"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One pr.pretraitement call per article text, in row order.\n",
    "article_pretraite = [\n",
    "    pr.pretraitement(texte, tok, ner_tagger, pos_tagger)\n",
    "    for texte in textes_articles_df[\"text\"]\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `.assign` builds a new frame instead of mutating the filtered one in place;\n",
    "# the lists are in row order, so they line up with the frame positionally.\n",
    "textes_articles_df = textes_articles_df.assign(\n",
    "    ner_dict=[pr.aggreger_ner_tags(article) for article in article_pretraite],\n",
    "    pos_dict=[pr.aggreger_pos_tags(article) for article in article_pretraite],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make sure the target directory exists before writing.\n",
    "PICKLE_PATH.parent.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# The context manager closes the file even if pickling raises.\n",
    "with PICKLE_PATH.open(\"wb\") as pickle_file:\n",
    "    pickle.dump(textes_articles_df, pickle_file)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}