Initial commit
This commit is contained in:
commit
c96c75b8c7
3 changed files with 329 additions and 0 deletions
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
*.csv
|
321
Projets d'infrastructure.ipynb
Normal file
321
Projets d'infrastructure.ipynb
Normal file
|
@ -0,0 +1,321 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import urllib.request\n",
|
||||
"import re\n",
|
||||
"from urllib.error import URLError, HTTPError, ContentTooShortError\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from pprint import pprint\n",
|
||||
"from functools import reduce\n",
|
||||
"import operator"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 162,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(\"download.py\") as f:\n",
|
||||
" code = compile(f.read(), \"download.py\", 'exec')\n",
|
||||
" exec(code)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 163,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"myPrefix='https://www.tresor.gouv.qc.ca'\n",
|
||||
"liens_dict_cum=[]\n",
|
||||
"infos_dict=[]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_liens(page_num):\n",
"    \"\"\"Return the projects listed on one page of the results table.\n",
"\n",
"    Each project is a dict with the keys 'lien' (site prefix + the href of\n",
"    the row's first link), 'titre' (text of the row's first link) and\n",
"    'etat' (text of the row's second link, or None when the row has only\n",
"    one link). Rows without any link are skipped.\n",
"\n",
"    An empty list is returned when the page cannot be downloaded or when\n",
"    it contains no <tbody> element.\n",
"    \"\"\"\n",
"    myURL=\"https://www.tresor.gouv.qc.ca/infrastructures-publiques/tableau-de-bord/?tx_tdbpip_tdbpip%5BshowResults%5D=1&tx_tdbpip_tdbpip%5Btous%5D=1&tx_tdbpip_tdbpip%5Baction%5D=list&tx_tdbpip_tdbpip%5Bcontroller%5D=Projet&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BcurrentPage%5D=\"+str(page_num)+\"&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5BmotsCles%5D=&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5Bavancement%5D=&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5Bactivite%5D=&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5Bstatut%5D=&tx_tdbpip_tdbpip%5B%40widget_0%5D%5BrechercheInfo%5D%5Bregion%5D=&cHash=694148348751eff6b5e3706577a186e5\"\n",
"    page = download(myURL)\n",
"    if page is None:\n",
"        # download() returns None on failure and has already printed why.\n",
"        return []\n",
"    soup = BeautifulSoup(page, 'html5lib')\n",
"    tbody = soup.find(\"tbody\")\n",
"    if tbody is None:\n",
"        print('No result table on page', page_num)\n",
"        return []\n",
"    liens_dict=[]\n",
"    for item in tbody.findAll(\"tr\"):\n",
"        liens = item.findAll(\"a\")\n",
"        if not liens:\n",
"            continue\n",
"        element={\n",
"            'lien': myPrefix+liens[0]['href'],\n",
"            'titre': liens[0].get_text(),\n",
"            # Guard against rows that carry only the title link.\n",
"            'etat': liens[1].get_text() if len(liens) > 1 else None,\n",
"        }\n",
"        print(element)\n",
"        liens_dict.append(element)\n",
"    return liens_dict"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for i in range(1,13):\n",
|
||||
" liens_dict_cum.append(get_liens(i))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"liens_dict_all = reduce(operator.concat, liens_dict_cum)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Scrape the detail page of every project. Exactly one dict is appended to\n",
"# infos_dict per project (an empty one when the page is unavailable) so that\n",
"# infos_dict keeps the same length and order as liens_dict_all.\n",
"for lien in liens_dict_all:\n",
"    print(lien)\n",
"    element={}\n",
"    page_descr = download(lien['lien'])\n",
"    if page_descr is None:\n",
"        # download() returns None on failure and has already printed why.\n",
"        infos_dict.append(element)\n",
"        continue\n",
"    soup_descr = BeautifulSoup(page_descr, 'html5lib')\n",
"\n",
"    # Main table: first cell is the field name, second cell is its value.\n",
"    infos_blocs = soup_descr.select('.infos-projet')\n",
"    soup_descr_infos = infos_blocs[0].findAll('tr') if infos_blocs else []\n",
"    for ligne in soup_descr_infos:\n",
"        items = ligne.findAll('td')\n",
"        if len(items) < 2:\n",
"            # Not a name/value row; nothing to record.\n",
"            continue\n",
"        # Tabs are dropped and newlines become ';' separators.\n",
"        element[items[0].get_text()] = re.sub(\"[\\n]\",\";\",re.sub(\"[\\t]\",\"\",items[1].get_text()))\n",
"\n",
"    # Optional side panel: labels paired positionally with spans.\n",
"    blocs_droite = soup_descr.select('.droite')\n",
"    if blocs_droite:\n",
"        soup_descr_compl_lab = [i.get_text() for i in blocs_droite[0].findAll('label')]\n",
"        soup_descr_compl_span = [i.get_text() for i in blocs_droite[0].findAll('span')]\n",
"        element.update(dict(zip(soup_descr_compl_lab,soup_descr_compl_span)))\n",
"    infos_dict.append(element)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"liens_infos_projets = pd.concat([pd.DataFrame(liens_dict_all),pd.DataFrame(infos_dict)],axis=1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"liens_infos_projets"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows#17116976\n",
|
||||
"## https://stackoverflow.com/questions/38152389/coalesce-values-from-2-columns-into-a-single-column-in-a-pandas-dataframe#38152458\n",
|
||||
"region_split = (\n",
|
||||
" liens_infos_projets['Régions'].\n",
|
||||
" str.\n",
|
||||
" split(r'([0-9]{2}[\\s\\-]{3}[^0-9]+);;').\n",
|
||||
" apply(pd.Series, 1).\n",
|
||||
" loc[:,[1,3]].\n",
|
||||
" stack()\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"region_split"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"region_split.index = region_split.index.droplevel(-1)\n",
|
||||
"region_split.name = 'Régions'\n",
|
||||
"del liens_infos_projets['Régions']\n",
|
||||
"liens_infos_projets = liens_infos_projets.join(region_split)\n",
|
||||
"liens_infos_projets['Région'] = liens_infos_projets['Région'].combine_first(liens_infos_projets['Régions'])\n",
|
||||
"del liens_infos_projets['Régions']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ministre_split = (\n",
|
||||
" liens_infos_projets['Ministres'].\n",
|
||||
" str.\n",
|
||||
" split(r'([^;]+);;').\n",
|
||||
" apply(pd.Series, 1).\n",
|
||||
" loc[:,[1,3]].\n",
|
||||
" stack()\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"ministre_split.index = ministre_split.index.droplevel(-1)\n",
|
||||
"ministre_split.name = 'Ministres'\n",
|
||||
"del liens_infos_projets['Ministres']\n",
|
||||
"liens_infos_projets = liens_infos_projets.join(ministre_split)\n",
|
||||
"liens_infos_projets['Ministre'] = liens_infos_projets['Ministre'].combine_first(liens_infos_projets['Ministres'])\n",
|
||||
"del liens_infos_projets['Ministres']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"organisme_split = (\n",
|
||||
" liens_infos_projets['Organismes'].\n",
|
||||
" str.\n",
|
||||
" split(r'([^;]+)').\n",
|
||||
" apply(pd.Series, 1).\n",
|
||||
" loc[:,[1,3]].\n",
|
||||
" stack()\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"organisme_split.index = organisme_split.index.droplevel(-1)\n",
|
||||
"organisme_split.name = 'Organismes'\n",
|
||||
"del liens_infos_projets['Organismes']\n",
|
||||
"liens_infos_projets = liens_infos_projets.join(organisme_split)\n",
|
||||
"liens_infos_projets['Organisme'] = liens_infos_projets['Organisme'].combine_first(liens_infos_projets['Organismes'])\n",
|
||||
"del liens_infos_projets['Organismes']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## https://stackoverflow.com/questions/36072626/pandas-replace-multiple-values-at-once\n",
|
||||
"replacements = {\n",
|
||||
" 'Gestionnaire du projet': {\n",
|
||||
" r';;': ''\n",
|
||||
" },\n",
|
||||
" 'Ministre': {\n",
|
||||
" r';;': ''\n",
|
||||
" },\n",
|
||||
" 'Organisme': {\n",
|
||||
" r';;': ''\n",
|
||||
" },\n",
|
||||
" 'Région': {\n",
|
||||
" r';;': ''\n",
|
||||
" },\n",
|
||||
" 'Secteur': {\n",
|
||||
" r';;': ''\n",
|
||||
" },\n",
|
||||
" 'Contribution des partenaires': {\n",
|
||||
" r'\\s*M\\$': '',\n",
|
||||
" r'\\,':'.',\n",
|
||||
" r'\\s':''\n",
|
||||
" },\n",
|
||||
" 'Contribution du Québec': {\n",
|
||||
" r'\\s*M\\$': '',\n",
|
||||
" r'\\,':'.',\n",
|
||||
" r'\\s':''\n",
|
||||
" },\n",
|
||||
" 'Coût': {\n",
|
||||
" r'\\s*M\\$': '',\n",
|
||||
" r'\\,':'.',\n",
|
||||
" r'\\s':''\n",
|
||||
" }\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"liens_infos_projets = liens_infos_projets.replace(replacements,regex=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"liens_infos_projets"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"liens_infos_projets.to_csv(\"projets_infrastructures.csv\",index=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
7
download.py
Normal file
7
download.py
Normal file
|
@ -0,0 +1,7 @@
|
|||
def download(url, timeout=30):
    """Fetch *url* and return the raw response body.

    The request is sent with a ``Mozilla`` User-Agent header.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on blocking network operations before
            giving up. Defaults to 30.

    Returns:
        The response body as ``bytes``, or ``None`` when the download
        fails (the reason is printed).
    """
    # Imported locally so the function is self-contained: it works both when
    # this file is exec'd into another namespace and when it is imported.
    import socket
    import urllib.request
    from urllib.error import URLError

    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla'})
    try:
        # The context manager closes the connection even if read() raises.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            html = response.read()
    except (URLError, socket.timeout) as e:
        # HTTPError and ContentTooShortError are subclasses of URLError.
        # A timeout while reading has no ``reason`` attribute, so fall back
        # to the exception itself.
        print('Download error:', getattr(e, 'reason', e))
        html = None
    return html
|
Loading…
Reference in a new issue