Initial commit

This commit is contained in:
François Pelletier 2018-12-25 21:23:56 -05:00
commit c96c75b8c7
3 changed files with 329 additions and 0 deletions

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
*.csv

View file

@ -0,0 +1,321 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import urllib.request\n",
"import re\n",
"from urllib.error import URLError, HTTPError, ContentTooShortError\n",
"from bs4 import BeautifulSoup\n",
"from pprint import pprint\n",
"from functools import reduce\n",
"import operator"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {},
"outputs": [],
"source": [
"# Execute download.py in this notebook's global namespace, which defines\n",
"# download() here.\n",
"with open(\"download.py\") as source_file:\n",
"    source_text = source_file.read()\n",
"exec(compile(source_text, \"download.py\", 'exec'))"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {},
"outputs": [],
"source": [
"# Site root prepended to the relative links scraped from the listing pages.\n",
"myPrefix = 'https://www.tresor.gouv.qc.ca'\n",
"# liens_dict_cum receives one list of links per listing page;\n",
"# infos_dict receives one dict of details per project page.\n",
"liens_dict_cum, infos_dict = [], []"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_liens(page_num):\n",
"    \"\"\"Scrape one page of the project listing and return its project links.\n",
"\n",
"    Args:\n",
"        page_num: Page number inserted into the listing URL's currentPage parameter.\n",
"\n",
"    Returns:\n",
"        A list of dicts with the keys 'lien' (absolute URL), 'titre' and 'etat'.\n",
"        The list is empty when the page cannot be downloaded or has no table body.\n",
"    \"\"\"\n",
"    myURL = \"https://www.tresor.gouv.qc.ca/infrastructures-publiques/tableau-de-bord/?tx_tdbpip_tdbpip%5BshowResults%5D=1&tx_tdbpip_tdbpip%5Btous%5D=1&tx_tdbpip_tdbpip%5Baction%5D=list&tx_tdbpip_tdbpip%5Bcontroller%5D=Projet&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BcurrentPage%5D=\" + str(page_num) + \"&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5BmotsCles%5D=&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5Bavancement%5D=&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5Bactivite%5D=&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5Bstatut%5D=&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5Bregion%5D=&cHash=694148348751eff6b5e3706577a186e5\"\n",
"    page_html = download(myURL)\n",
"    if page_html is None:\n",
"        # download() returns None after reporting a failed request.\n",
"        return []\n",
"    soup = BeautifulSoup(page_html, 'html5lib')\n",
"    table_body = soup.find(\"tbody\")\n",
"    if table_body is None:\n",
"        return []\n",
"    liens_dict = []\n",
"    for item in table_body.find_all(\"tr\"):\n",
"        liens = item.find_all(\"a\")\n",
"        if not liens:\n",
"            continue\n",
"        element = {\n",
"            'lien': myPrefix + liens[0]['href'],\n",
"            'titre': liens[0].get_text(),\n",
"            # A row with a single link has no second anchor to read the state from.\n",
"            'etat': liens[1].get_text() if len(liens) > 1 else None,\n",
"        }\n",
"        print(element)\n",
"        liens_dict.append(element)\n",
"    return liens_dict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Slice assignment refills the shared list in place, so re-running this cell\n",
"# replaces the previously collected pages instead of appending them again.\n",
"liens_dict_cum[:] = [get_liens(page_num) for page_num in range(1, 13)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Flatten the per-page lists into one list of links. Unlike reduce() without an\n",
"# initial value, this also works when no page was collected.\n",
"liens_dict_all = [lien for page_liens in liens_dict_cum for lien in page_liens]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Empty the shared list in place so that re-running this cell does not append a\n",
"# second copy of every project.\n",
"infos_dict.clear()\n",
"for lien in liens_dict_all:\n",
"    print(lien)\n",
"    # One dict is appended per link, even when the page cannot be read, so that\n",
"    # infos_dict keeps the same length and order as liens_dict_all.\n",
"    element = {}\n",
"    infos_dict.append(element)\n",
"    page_descr = download(lien['lien'])\n",
"    if page_descr is None:\n",
"        continue\n",
"    soup_descr = BeautifulSoup(page_descr, 'html5lib')\n",
"    infos_projet = soup_descr.select('.infos-projet')\n",
"    if infos_projet:\n",
"        for ligne in infos_projet[0].find_all('tr'):\n",
"            items = ligne.find_all('td')\n",
"            if len(items) < 2:\n",
"                # Not a label/value row.\n",
"                continue\n",
"            # Tabs are removed and each newline becomes ';'.\n",
"            valeur = items[1].get_text().replace(\"\\t\", \"\").replace(\"\\n\", \";\")\n",
"            element[items[0].get_text()] = valeur\n",
"    # The '.droite' panel is optional: read its label/span pairs only when present.\n",
"    droite = soup_descr.select('.droite')\n",
"    if droite:\n",
"        libelles = [tag.get_text() for tag in droite[0].find_all('label')]\n",
"        valeurs = [tag.get_text() for tag in droite[0].find_all('span')]\n",
"        element.update(dict(zip(libelles, valeurs)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Place the listing fields and the detail fields of each project side by side.\n",
"liens_infos_projets = pd.concat(\n",
"    [pd.DataFrame(liens_dict_all), pd.DataFrame(infos_dict)],\n",
"    axis='columns',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"liens_infos_projets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows#17116976\n",
"## https://stackoverflow.com/questions/38152389/coalesce-values-from-2-columns-into-a-single-column-in-a-pandas-dataframe#38152458\n",
"# The capturing group makes the split keep each matched region, so the pieces at\n",
"# odd positions (1, 3, 5, ...) are the region texts. expand=True spreads the\n",
"# pieces over columns directly (it replaces .apply(pd.Series, 1)).\n",
"region_split = (\n",
"    liens_infos_projets['Régions']\n",
"    .str.split(r'([0-9]{2}[\\s\\-]{3}[^0-9]+);;', expand=True)\n",
"    .iloc[:, 1::2]  # every captured region, not only the first two\n",
"    .stack()\n",
"    .dropna()  # keep only real values whichever stack() implementation is used\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"region_split"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Drop the innermost index level so each split value is labelled by its source row.\n",
"region_split.index = region_split.index.droplevel(-1)\n",
"region_split.name = 'Régions'\n",
"# Replace the raw 'Régions' text with the split values (one row per value).\n",
"liens_infos_projets = liens_infos_projets.drop(columns='Régions').join(region_split)\n",
"# Fill the gaps of 'Région' from 'Régions'; pop() removes the helper column.\n",
"liens_infos_projets['Région'] = liens_infos_projets['Région'].combine_first(\n",
"    liens_infos_projets.pop('Régions')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# An earlier join can leave several rows sharing one index label. Joining on\n",
"# repeated labels pairs every such row with every split value of its siblings,\n",
"# so give each row a unique label before splitting.\n",
"liens_infos_projets = liens_infos_projets.reset_index(drop=True)\n",
"\n",
"# The capturing group keeps each matched name: pieces at odd positions are the\n",
"# names. expand=True spreads the pieces over columns directly.\n",
"ministre_split = (\n",
"    liens_infos_projets['Ministres']\n",
"    .str.split(r'([^;]+);;', expand=True)\n",
"    .iloc[:, 1::2]  # every captured name, not only the first two\n",
"    .stack()\n",
"    .dropna()\n",
")\n",
"\n",
"# Drop the innermost index level so each split value is labelled by its source row.\n",
"ministre_split.index = ministre_split.index.droplevel(-1)\n",
"ministre_split.name = 'Ministres'\n",
"del liens_infos_projets['Ministres']\n",
"liens_infos_projets = liens_infos_projets.join(ministre_split)\n",
"# Fill the gaps of 'Ministre' from the split 'Ministres' values.\n",
"liens_infos_projets['Ministre'] = liens_infos_projets['Ministre'].combine_first(liens_infos_projets['Ministres'])\n",
"del liens_infos_projets['Ministres']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# An earlier join can leave several rows sharing one index label. Joining on\n",
"# repeated labels pairs every such row with every split value of its siblings,\n",
"# so give each row a unique label before splitting.\n",
"liens_infos_projets = liens_infos_projets.reset_index(drop=True)\n",
"\n",
"# The capturing group keeps each matched name: pieces at odd positions are the\n",
"# names. expand=True spreads the pieces over columns directly.\n",
"organisme_split = (\n",
"    liens_infos_projets['Organismes']\n",
"    .str.split(r'([^;]+)', expand=True)\n",
"    .iloc[:, 1::2]  # every captured name, not only the first two\n",
"    .stack()\n",
"    .dropna()\n",
")\n",
"\n",
"# Drop the innermost index level so each split value is labelled by its source row.\n",
"organisme_split.index = organisme_split.index.droplevel(-1)\n",
"organisme_split.name = 'Organismes'\n",
"del liens_infos_projets['Organismes']\n",
"liens_infos_projets = liens_infos_projets.join(organisme_split)\n",
"# Fill the gaps of 'Organisme' from the split 'Organismes' values.\n",
"liens_infos_projets['Organisme'] = liens_infos_projets['Organisme'].combine_first(liens_infos_projets['Organismes'])\n",
"del liens_infos_projets['Organismes']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## https://stackoverflow.com/questions/36072626/pandas-replace-multiple-values-at-once\n",
"# Per-column {regex: replacement} rules. The first group of columns drops the\n",
"# ';;' separators; the second group drops the 'M$' suffix, turns ',' into '.'\n",
"# and removes whitespace.\n",
"replacements = {\n",
"    **dict.fromkeys(\n",
"        ['Gestionnaire du projet', 'Ministre', 'Organisme', 'Région', 'Secteur'],\n",
"        {r';;': ''},\n",
"    ),\n",
"    **dict.fromkeys(\n",
"        ['Contribution des partenaires', 'Contribution du Québec', 'Coût'],\n",
"        {r'\\s*M\\$': '', r'\\,': '.', r'\\s': ''},\n",
"    ),\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Apply the per-column regex substitutions held in `replacements`.\n",
"liens_infos_projets = liens_infos_projets.replace(\n",
"    to_replace=replacements,\n",
"    regex=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"liens_infos_projets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Write the table to disk without the index column.\n",
"liens_infos_projets.to_csv(\n",
"    path_or_buf=\"projets_infrastructures.csv\",\n",
"    index=False,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

7
download.py Normal file
View file

@ -0,0 +1,7 @@
def download(url):
    """Fetch ``url`` and return the raw response body.

    The request is sent with a ``User-Agent: Mozilla`` header.

    Args:
        url: Address to fetch, passed to ``urllib.request.Request``.

    Returns:
        The response body as returned by ``read()``, or ``None`` when the
        request fails with a ``URLError``; the failure reason is printed.
    """
    # Imported here so the function also works when this file is imported as a
    # module, not only when it is executed in a namespace that already holds
    # these names.
    import urllib.request
    from urllib.error import URLError

    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla'})
    try:
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(request) as response:
            return response.read()
    except URLError as e:
        # HTTPError and ContentTooShortError are subclasses of URLError, so this
        # single clause covers all three.
        print('Download error:', e.reason)
        return None