From c96c75b8c7ec6843b222fd87a3a469960725840f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Pelletier?= <francois@francoispelletier.org>
Date: Tue, 25 Dec 2018 21:23:56 -0500
Subject: [PATCH] Initial commit

---
 .gitignore                     |   1 +
 Projets d'infrastructure.ipynb | 321 +++++++++++++++++++++++++++++++++
 download.py                    |   7 +
 3 files changed, 329 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Projets d'infrastructure.ipynb
 create mode 100644 download.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..afed073
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.csv
diff --git a/Projets d'infrastructure.ipynb b/Projets d'infrastructure.ipynb
new file mode 100644
index 0000000..98389ec
--- /dev/null
+++ b/Projets d'infrastructure.ipynb	
@@ -0,0 +1,321 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import urllib.request\n",
+    "import re\n",
+    "from urllib.error import URLError, HTTPError, ContentTooShortError\n",
+    "from bs4 import BeautifulSoup\n",
+    "from pprint import pprint\n",
+    "from functools import reduce\n",
+    "import operator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 162,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"download.py\") as f:\n",
+    "    code = compile(f.read(), \"download.py\", 'exec')\n",
+    "    exec(code)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 163,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "myPrefix='https://www.tresor.gouv.qc.ca'\n",
+    "liens_dict_cum=[]\n",
+    "infos_dict=[]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_liens(page_num):\n",
+    "    \"\"\"Return one dict (keys lien, titre, etat) per linked row of the tbody on results page page_num.\"\"\"\n",
+    "    myURL=\"https://www.tresor.gouv.qc.ca/infrastructures-publiques/tableau-de-bord/?tx_tdbpip_tdbpip%5BshowResults%5D=1&tx_tdbpip_tdbpip%5Btous%5D=1&tx_tdbpip_tdbpip%5Baction%5D=list&tx_tdbpip_tdbpip%5Bcontroller%5D=Projet&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BcurrentPage%5D=\"+str(page_num)+\"&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5BmotsCles%5D=&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5Bavancement%5D=&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5Bactivite%5D=&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5Bstatut%5D=&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5Bregion%5D=&cHash=694148348751eff6b5e3706577a186e5\"\n",
+    "    firstpage = download(myURL)  # NOTE(review): download() returns None on a failed request; not handled here\n",
+    "    soup = BeautifulSoup(firstpage, 'html5lib')\n",
+    "    mon_tableau = soup.find(\"tbody\").findAll(\"tr\")\n",
+    "    liens_dict=[]\n",
+    "    for item in mon_tableau:\n",
+    "        liens = item.findAll(\"a\")\n",
+    "        if not liens:  # row without any link: nothing to record\n",
+    "            continue\n",
+    "        element={}\n",
+    "        element['lien']=myPrefix+liens[0]['href']  # href is site-relative, so prepend the host prefix\n",
+    "        element['titre']=liens[0].get_text()\n",
+    "        element['etat']=liens[1].get_text() if len(liens)>1 else None  # a single-link row used to raise IndexError\n",
+    "        print(element)\n",
+    "        liens_dict.append(element)\n",
+    "    return liens_dict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in range(1,13):\n",
+    "    liens_dict_cum.append(get_liens(i))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "liens_dict_all = reduce(operator.concat, liens_dict_cum)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for lien in liens_dict_all:  # one detail page per listed project; appends exactly one dict to infos_dict each time\n",
+    "    print(lien)\n",
+    "    page_descr = download(lien['lien'])\n",
+    "    soup_descr = BeautifulSoup(page_descr, 'html5lib')\n",
+    "    soup_descr_infos = soup_descr.select('.infos-projet')[0].findAll('tr')\n",
+    "    element={}\n",
+    "    for ligne in soup_descr_infos:  # first cell is the field name, second cell its value\n",
+    "        items = ligne.findAll('td')\n",
+    "        element[items[0].get_text()] = re.sub(\"[\\n]\",\";\",re.sub(\"[\\t]\",\"\",items[1].get_text()))\n",
+    "    try:\n",
+    "        soup_descr_compl_lab = [i.get_text() for i in soup_descr.select('.droite')[0].findAll('label')]\n",
+    "        soup_descr_compl_span = [i.get_text() for i in soup_descr.select('.droite')[0].findAll('span')]\n",
+    "        element.update(dict(zip(soup_descr_compl_lab,soup_descr_compl_span)))\n",
+    "    except IndexError:  # select('.droite') matched nothing; a bare except would also hide KeyboardInterrupt and real bugs\n",
+    "        pass  # keep the entry without the extra label/span fields\n",
+    "    infos_dict.append(element)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "liens_infos_projets = pd.concat([pd.DataFrame(liens_dict_all),pd.DataFrame(infos_dict)],axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "liens_infos_projets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows#17116976\n",
+    "## https://stackoverflow.com/questions/38152389/coalesce-values-from-2-columns-into-a-single-column-in-a-pandas-dataframe#38152458\n",
+    "region_split = (\n",
+    "    liens_infos_projets['Régions'].\n",
+    "    str.\n",
+    "    split(r'([0-9]{2}[\\s\\-]{3}[^0-9]+);;').\n",
+    "    apply(pd.Series, 1).\n",
+    "    loc[:,[1,3]].\n",
+    "    stack()\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "region_split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "region_split.index = region_split.index.droplevel(-1)\n",
+    "region_split.name = 'Régions'\n",
+    "del liens_infos_projets['Régions']\n",
+    "liens_infos_projets = liens_infos_projets.join(region_split)\n",
+    "liens_infos_projets['Région'] = liens_infos_projets['Région'].combine_first(liens_infos_projets['Régions'])\n",
+    "del liens_infos_projets['Régions']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ministre_split = (\n",
+    "    liens_infos_projets['Ministres'].\n",
+    "    str.\n",
+    "    split(r'([^;]+);;').\n",
+    "    apply(pd.Series, 1).\n",
+    "    loc[:,[1,3]].\n",
+    "    stack()\n",
+    "    )\n",
+    "\n",
+    "ministre_split.index = ministre_split.index.droplevel(-1)\n",
+    "ministre_split.name = 'Ministres'\n",
+    "del liens_infos_projets['Ministres']\n",
+    "liens_infos_projets = liens_infos_projets.join(ministre_split)\n",
+    "liens_infos_projets['Ministre'] = liens_infos_projets['Ministre'].combine_first(liens_infos_projets['Ministres'])\n",
+    "del liens_infos_projets['Ministres']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "organisme_split = (\n",
+    "    liens_infos_projets['Organismes'].\n",
+    "    str.\n",
+    "    split(r'([^;]+)').\n",
+    "    apply(pd.Series, 1).\n",
+    "    loc[:,[1,3]].\n",
+    "    stack()\n",
+    "    )\n",
+    "\n",
+    "organisme_split.index = organisme_split.index.droplevel(-1)\n",
+    "organisme_split.name = 'Organismes'\n",
+    "del liens_infos_projets['Organismes']\n",
+    "liens_infos_projets = liens_infos_projets.join(organisme_split)\n",
+    "liens_infos_projets['Organisme'] = liens_infos_projets['Organisme'].combine_first(liens_infos_projets['Organismes'])\n",
+    "del liens_infos_projets['Organismes']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## https://stackoverflow.com/questions/36072626/pandas-replace-multiple-values-at-once\n",
+    "replacements = {\n",
+    "   'Gestionnaire du projet': {\n",
+    "        r';;': ''\n",
+    "    },\n",
+    "    'Ministre': {\n",
+    "        r';;': ''\n",
+    "    },\n",
+    "    'Organisme': {\n",
+    "        r';;': ''\n",
+    "    },\n",
+    "    'Région': {\n",
+    "        r';;': ''\n",
+    "    },\n",
+    "    'Secteur': {\n",
+    "        r';;': ''\n",
+    "    },\n",
+    "    'Contribution des partenaires': {\n",
+    "        r'\\s*M\\$': '',\n",
+    "        r'\\,':'.',\n",
+    "        r'\\s':''\n",
+    "    },\n",
+    "    'Contribution du Québec': {\n",
+    "        r'\\s*M\\$': '',\n",
+    "        r'\\,':'.',\n",
+    "        r'\\s':''\n",
+    "    },\n",
+    "    'Coût': {\n",
+    "        r'\\s*M\\$': '',\n",
+    "        r'\\,':'.',\n",
+    "        r'\\s':''\n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "liens_infos_projets = liens_infos_projets.replace(replacements,regex=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "liens_infos_projets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "liens_infos_projets.to_csv(\"projets_infrastructures.csv\",index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/download.py b/download.py
new file mode 100644
index 0000000..98258d0
--- /dev/null
+++ b/download.py
@@ -0,0 +1,7 @@
+def download(url):  # Fetch url with a 'Mozilla' User-Agent header; return the response body bytes, or None on a download error.
+    try:  # urllib, URLError, HTTPError and ContentTooShortError are not imported here: they come from the namespace that exec()s this file
+        with urllib.request.urlopen(urllib.request.Request(url,headers={'User-Agent': 'Mozilla'})) as response:  # closes the connection deterministically
+            return response.read()
+    except (URLError, HTTPError, ContentTooShortError) as e:
+        print('Download error:', e.reason)  # best-effort: report the failure and leave None handling to the caller
+    return None
\ No newline at end of file