import urllib.request
import re
from urllib.error import URLError, HTTPError, ContentTooShortError
from bs4 import BeautifulSoup
from pprint import pprint
from functools import reduce
import operator

# download.py is executed in this namespace (rather than imported) so that the
# download() function it defines can see the urllib names imported above.
with open("download.py") as f:
    code = compile(f.read(), "download.py", 'exec')
    exec(code)

# Site root, prepended to the relative hrefs found in the listing table.
myPrefix = 'https://www.tresor.gouv.qc.ca'
# One list of project links per listing page, filled by the loop below.
liens_dict_cum = []
# One dict of detail-page fields per project, filled by the scraping loop.
infos_dict = []


def get_liens(page_num):
    """Return the projects listed on one page of the dashboard search results.

    The listing URL is built with ``currentPage`` set to ``page_num`` and
    fetched with ``download()``. Every ``<tr>`` of the first ``<tbody>`` that
    contains at least one ``<a>`` yields a dict with:

    * ``'lien'``  -- ``myPrefix`` + the href of the first link,
    * ``'titre'`` -- the text of the first link,
    * ``'etat'``  -- the text of the second link, or ``None`` if the row
      only has one link.

    Each dict is printed as it is collected.

    Returns an empty list when the page could not be downloaded or contains
    no ``<tbody>``, so one bad page does not abort the whole crawl.
    """
    myURL = (
        "https://www.tresor.gouv.qc.ca/infrastructures-publiques/tableau-de-bord/"
        "?tx_tdbpip_tdbpip%5BshowResults%5D=1"
        "&tx_tdbpip_tdbpip%5Btous%5D=1"
        "&tx_tdbpip_tdbpip%5Baction%5D=list"
        "&tx_tdbpip_tdbpip%5Bcontroller%5D=Projet"
        "&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BcurrentPage%5D=" + str(page_num) +
        "&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5BmotsCles%5D="
        "&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5Bavancement%5D="
        "&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5Bactivite%5D="
        "&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5Bstatut%5D="
        "&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5Bregion%5D="
        "&cHash=694148348751eff6b5e3706577a186e5"
    )
    page = download(myURL)
    if page is None:
        # download() has already printed the error.
        return []

    soup = BeautifulSoup(page, 'html5lib')
    tbody = soup.find("tbody")
    if tbody is None:
        return []

    liens_dict = []
    for item in tbody.findAll("tr"):
        liens = item.findAll("a")
        if not liens:
            # Rows without any link carry no project.
            continue
        element = {
            'lien': myPrefix + liens[0]['href'],
            'titre': liens[0].get_text(),
            'etat': liens[1].get_text() if len(liens) > 1 else None,
        }
        print(element)
        liens_dict.append(element)
    return liens_dict


# NOTE(review): the page range is hard-coded to 1..12 -- presumably the number
# of result pages when the notebook was written; confirm before re-running.
for i in range(1, 13):
    liens_dict_cum.append(get_liens(i))

# Flatten the per-page lists; the [] initial value keeps this valid even if
# liens_dict_cum is empty.
liens_dict_all = reduce(operator.concat, liens_dict_cum, [])
import pandas as pd
import numpy as np


def _split_multivalued(frame, plural, singular, pattern):
    """Fold a multi-valued column into its single-valued counterpart.

    ``frame[plural]`` holds several values in one string. It is split with
    the regular expression ``pattern``; because the pattern has a capturing
    group, the captured values land at the odd positions of the split result.
    Positions 1 and 3 are kept, so at most two values per row are extracted.
    Each extracted value gets its own copy of the row, and the value fills
    ``frame[singular]`` wherever that column is empty. The ``plural`` column
    is removed from the result.

    Returns a new DataFrame; a frame without a ``plural`` column is returned
    unchanged.
    """
    if plural not in frame.columns:
        return frame

    # The join below matches on index labels. A previous split leaves
    # duplicated labels, which would multiply rows (2 x 2 values -> 8 rows
    # instead of 4), so start from a unique index.
    frame = frame.reset_index(drop=True)

    pieces = (
        frame[plural]
        .str.split(pattern)
        .apply(pd.Series)
        # reindex instead of .loc so a column that never has a second value
        # gives an empty column rather than a KeyError.
        .reindex(columns=[1, 3])
        .stack()
        # Explicit, so the result does not depend on whether stack() itself
        # drops missing values.
        .dropna()
    )
    pieces.index = pieces.index.droplevel(-1)
    pieces.name = plural

    frame = frame.drop(columns=[plural]).join(pieces)
    if singular in frame.columns:
        frame[singular] = frame[singular].combine_first(frame[plural])
    else:
        frame[singular] = frame[plural]
    return frame.drop(columns=[plural])


# --- Scrape the detail page of every project --------------------------------
for lien in liens_dict_all:
    print(lien)
    # Exactly one dict is appended per link, even when nothing could be
    # scraped: infos_dict is concatenated side by side with liens_dict_all
    # below, so both must keep the same length and order.
    element = {}
    page_descr = download(lien['lien'])
    if page_descr is not None:
        soup_descr = BeautifulSoup(page_descr, 'html5lib')

        blocs_infos = soup_descr.select('.infos-projet')
        if blocs_infos:
            for ligne in blocs_infos[0].findAll('tr'):
                items = ligne.findAll('td')
                if len(items) < 2:
                    # Not a label/value row.
                    continue
                # Tabs are removed and each newline becomes ';', so a blank
                # line between two values shows up as the ';;' separator that
                # the split patterns below rely on.
                element[items[0].get_text()] = re.sub("[\n]",";",re.sub("[\t]","",items[1].get_text()))

        # The '.droite' block is optional; its labels and spans are paired up
        # in document order.
        blocs_droite = soup_descr.select('.droite')
        if blocs_droite:
            etiquettes = [i.get_text() for i in blocs_droite[0].findAll('label')]
            valeurs = [i.get_text() for i in blocs_droite[0].findAll('span')]
            element.update(dict(zip(etiquettes, valeurs)))
    infos_dict.append(element)

# --- Assemble one table: listing columns next to detail-page columns --------
liens_infos_projets = pd.concat([pd.DataFrame(liens_dict_all),pd.DataFrame(infos_dict)],axis=1)

## https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows#17116976
## https://stackoverflow.com/questions/38152389/coalesce-values-from-2-columns-into-a-single-column-in-a-pandas-dataframe#38152458
liens_infos_projets = _split_multivalued(
    liens_infos_projets, 'Régions', 'Région', r'([0-9]{2}[\s\-]{3}[^0-9]+);;')
liens_infos_projets = _split_multivalued(
    liens_infos_projets, 'Ministres', 'Ministre', r'([^;]+);;')
liens_infos_projets = _split_multivalued(
    liens_infos_projets, 'Organismes', 'Organisme', r'([^;]+)')

## https://stackoverflow.com/questions/36072626/pandas-replace-multiple-values-at-once
# Per-column regex replacements: drop leftover ';;' separators from the text
# columns, and strip the 'M$' suffix and whitespace and turn the decimal
# comma into a dot in the amount columns.
_sans_separateur = {r';;': ''}
_montant = {
    r'\s*M\$': '',
    r'\,': '.',
    r'\s': '',
}
replacements = {
    'Gestionnaire du projet': dict(_sans_separateur),
    'Ministre': dict(_sans_separateur),
    'Organisme': dict(_sans_separateur),
    'Région': dict(_sans_separateur),
    'Secteur': dict(_sans_separateur),
    'Contribution des partenaires': dict(_montant),
    'Contribution du Québec': dict(_montant),
    'Coût': dict(_montant),
}

liens_infos_projets = liens_infos_projets.replace(replacements,regex=True)

liens_infos_projets.to_csv("projets_infrastructures.csv",index=False)
def download(url):
    """Fetch ``url`` and return the raw response body.

    The request is sent with a ``User-Agent: Mozilla`` header.

    Args:
        url: Address to fetch, in any scheme ``urllib.request`` can open.

    Returns:
        The response body as ``bytes``, or ``None`` if the request failed.
        On failure the reason is printed and no exception is raised.
    """
    # Imported here so the function works both when this file is exec'd into
    # a namespace that already has these names and when it is imported as a
    # regular module.
    import urllib.request
    from urllib.error import URLError

    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla'})
    try:
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(request) as response:
            html = response.read()
    except URLError as e:
        # HTTPError and ContentTooShortError are subclasses of URLError, so
        # this clause covers them as well.
        print('Download error:', e.reason)
        html = None
    return html