{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Public infrastructure projects dashboard scraper\n",
    "\n",
    "Collects the project list from the `infrastructures-publiques/tableau-de-bord` pages of `www.tresor.gouv.qc.ca`, scrapes each project's description page, flattens multi-valued fields into one row per value and writes the result to `projets_infrastructures.csv`.\n",
    "\n",
    "Requires a local `download.py` next to the notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import operator\n",
    "import re\n",
    "import urllib.request\n",
    "from functools import reduce\n",
    "from pprint import pprint\n",
    "from urllib.error import URLError, HTTPError, ContentTooShortError\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Execute download.py in the notebook namespace. The cells below call\n",
    "# `download(url)`, which is presumably defined by that file -- TODO confirm.\n",
    "with open(\"download.py\") as f:\n",
    "    code = compile(f.read(), \"download.py\", 'exec')\n",
    "exec(code)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "myPrefix = 'https://www.tresor.gouv.qc.ca'\n",
    "\n",
    "# Result pages 1..N_PAGES are fetched (the notebook previously hard-coded range(1, 13)).\n",
    "N_PAGES = 12\n",
    "\n",
    "# (multi-valued column, single-valued column it is merged into, regex capturing one entry)\n",
    "MULTI_VALUED_COLUMNS = [\n",
    "    ('Régions', 'Région', r'([0-9]{2}[\\s\\-]{3}[^0-9]+);;'),\n",
    "    ('Ministres', 'Ministre', r'([^;]+);;'),\n",
    "    ('Organismes', 'Organisme', r'([^;]+)'),\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Collect project links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_liens(page_num):\n",
    "    \"\"\"Return the project links listed on one page of the dashboard results.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    page_num : int\n",
    "        Page number inserted into the `currentPage` query parameter.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of dict\n",
    "        One dict per table row containing at least one link, with keys\n",
    "        'lien' (myPrefix + href of the first link), 'titre' (text of the\n",
    "        first link) and 'etat' (text of the second link, or None when the\n",
    "        row has only one link). Empty when the page has no <tbody>.\n",
    "    \"\"\"\n",
    "    myURL=\"https://www.tresor.gouv.qc.ca/infrastructures-publiques/tableau-de-bord/?tx_tdbpip_tdbpip%5BshowResults%5D=1&tx_tdbpip_tdbpip%5Btous%5D=1&tx_tdbpip_tdbpip%5Baction%5D=list&tx_tdbpip_tdbpip%5Bcontroller%5D=Projet&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BcurrentPage%5D=\"+str(page_num)+\"&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5BmotsCles%5D=&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5Bavancement%5D=&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5Bactivite%5D=&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5Bstatut%5D=&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5Bregion%5D=&cHash=694148348751eff6b5e3706577a186e5\"\n",
    "    soup = BeautifulSoup(download(myURL), 'html5lib')\n",
    "    tbody = soup.find(\"tbody\")\n",
    "    if tbody is None:\n",
    "        # No results table on this page: nothing to collect.\n",
    "        return []\n",
    "    liens_dict = []\n",
    "    for item in tbody.findAll(\"tr\"):\n",
    "        liens = item.findAll(\"a\")\n",
    "        if len(liens) == 0:\n",
    "            continue\n",
    "        element = {\n",
    "            'lien': myPrefix + liens[0]['href'],\n",
    "            'titre': liens[0].get_text(),\n",
    "            # Guard against rows that carry a single link.\n",
    "            'etat': liens[1].get_text() if len(liens) > 1 else None,\n",
    "        }\n",
    "        print(element)\n",
    "        liens_dict.append(element)\n",
    "    return liens_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Built from scratch on every run so that re-executing the cell does not duplicate links.\n",
    "liens_dict_cum = [get_liens(i) for i in range(1, N_PAGES + 1)]\n",
    "liens_dict_all = reduce(operator.concat, liens_dict_cum, [])\n",
    "len(liens_dict_all)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scrape project description pages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_infos(url):\n",
    "    \"\"\"Scrape one project description page into a flat dict.\n",
    "\n",
    "    Each row of the first `.infos-projet` element contributes one entry:\n",
    "    text of the first cell -> text of the second cell, with tabs removed\n",
    "    and newlines replaced by ';'. When the page has a `.droite` element,\n",
    "    the texts of its <label> and <span> tags are paired in document order\n",
    "    and added to the dict as well.\n",
    "\n",
    "    Raises IndexError when the page has no `.infos-projet` element.\n",
    "    \"\"\"\n",
    "    soup_descr = BeautifulSoup(download(url), 'html5lib')\n",
    "    element = {}\n",
    "    for ligne in soup_descr.select('.infos-projet')[0].findAll('tr'):\n",
    "        items = ligne.findAll('td')\n",
    "        if len(items) < 2:\n",
    "            # Not a label/value row.\n",
    "            continue\n",
    "        element[items[0].get_text()] = re.sub(\"[\\n]\", \";\", re.sub(\"[\\t]\", \"\", items[1].get_text()))\n",
    "    # Explicit check instead of a bare `except: pass`: a missing `.droite`\n",
    "    # element is tolerated, any other error is no longer hidden.\n",
    "    blocs_droite = soup_descr.select('.droite')\n",
    "    if blocs_droite:\n",
    "        labels = [tag.get_text() for tag in blocs_droite[0].findAll('label')]\n",
    "        spans = [tag.get_text() for tag in blocs_droite[0].findAll('span')]\n",
    "        element.update(dict(zip(labels, spans)))\n",
    "    return element"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-initialised here so the cell can be re-run without duplicating rows.\n",
    "infos_dict = []\n",
    "for lien in liens_dict_all:\n",
    "    print(lien)\n",
    "    infos_dict.append(get_infos(lien['lien']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Assemble and clean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The two lists are in the same order, so a positional (axis=1) concat lines them up.\n",
    "assert len(infos_dict) == len(liens_dict_all)\n",
    "projets_bruts = pd.concat([pd.DataFrame(liens_dict_all), pd.DataFrame(infos_dict)], axis=1)\n",
    "projets_bruts.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows#17116976\n",
    "## https://stackoverflow.com/questions/38152389/coalesce-values-from-2-columns-into-a-single-column-in-a-pandas-dataframe#38152458\n",
    "def split_multi_valued_column(df, plural_col, singular_col, pattern):\n",
    "    \"\"\"Turn a multi-valued text column into one row per value.\n",
    "\n",
    "    Every match of `pattern` (a regex with exactly one capture group) in\n",
    "    `plural_col` yields one output row; the other columns of the source\n",
    "    row are repeated. The extracted values fill the gaps of\n",
    "    `singular_col`, and `plural_col` is dropped from the result.\n",
    "\n",
    "    Rows whose `plural_col` is missing or has no match are kept as is.\n",
    "    All matches are kept, not only the first two. The input frame is not\n",
    "    modified; the result has a fresh 0..n-1 index.\n",
    "    \"\"\"\n",
    "    base = df.reset_index(drop=True)\n",
    "    if plural_col not in base.columns:\n",
    "        return base\n",
    "    # A unique index is required here: joining on a duplicated index would\n",
    "    # pair every duplicate with every value and multiply rows.\n",
    "    matches = base[plural_col].str.findall(pattern).dropna()\n",
    "    values = pd.Series(\n",
    "        [value for found in matches for value in found],\n",
    "        index=[row for row, found in matches.items() for _ in found],\n",
    "        name=plural_col,\n",
    "        dtype=object,\n",
    "    )\n",
    "    exploded = base.drop(plural_col, axis=1).join(values).reset_index(drop=True)\n",
    "    if singular_col in exploded.columns:\n",
    "        exploded[singular_col] = exploded[singular_col].combine_first(exploded[plural_col])\n",
    "    else:\n",
    "        exploded[singular_col] = exploded[plural_col]\n",
    "    return exploded.drop(plural_col, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Starts from projets_bruts every time, so the cell is safe to re-run.\n",
    "projets_eclates = projets_bruts\n",
    "for plural_col, singular_col, pattern in MULTI_VALUED_COLUMNS:\n",
    "    projets_eclates = split_multi_valued_column(projets_eclates, plural_col, singular_col, pattern)\n",
    "projets_eclates.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## https://stackoverflow.com/questions/36072626/pandas-replace-multiple-values-at-once\n",
    "text_columns = ['Gestionnaire du projet', 'Ministre', 'Organisme', 'Région', 'Secteur']\n",
    "amount_columns = ['Contribution des partenaires', 'Contribution du Québec', 'Coût']\n",
    "\n",
    "# Text columns: remove the leftover ';;' separators.\n",
    "replacements = {col: {r';;': ''} for col in text_columns}\n",
    "# Amount columns: drop the 'M$' suffix, turn ',' into '.', strip whitespace.\n",
    "# The values stay strings; no numeric conversion is done here.\n",
    "replacements.update(\n",
    "    {col: {r'\\s*M\\$': '', r'\\,': '.', r'\\s': ''} for col in amount_columns}\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "liens_infos_projets = projets_eclates.replace(replacements, regex=True)\n",
    "liens_infos_projets.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "liens_infos_projets.to_csv(\"projets_infrastructures.csv\", index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}