Initial commit

This commit is contained in:
François Pelletier 2017-12-17 01:02:37 -05:00
commit 30afc79e6c
4 changed files with 1729 additions and 0 deletions

View file

@ -0,0 +1,597 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import json\n",
"import prettytable"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Ask the BLS public API (v2) for two series over the years 2011-2014.\n",
"headers = {'Content-type': 'application/json'}\n",
"data = json.dumps({\"seriesid\": ['CUUR0000SA0','SUUR0000SA0'],\"startyear\":\"2011\", \"endyear\":\"2014\"})\n",
"# A timeout keeps the cell from hanging forever if the server never answers.\n",
"p = requests.post('https://api.bls.gov/publicAPI/v2/timeseries/data/', data=data, headers=headers, timeout=30)\n",
"# Fail here on an HTTP error instead of letting a later cell choke on an error page.\n",
"p.raise_for_status()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"json_data = json.loads(p.text)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'Results': {'series': [{'data': [{'footnotes': [{}],\n",
" 'period': 'M12',\n",
" 'periodName': 'December',\n",
" 'value': '134.207',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M11',\n",
" 'periodName': 'November',\n",
" 'value': '135.107',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M10',\n",
" 'periodName': 'October',\n",
" 'value': '135.891',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M09',\n",
" 'periodName': 'September',\n",
" 'value': '136.211',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M08',\n",
" 'periodName': 'August',\n",
" 'value': '136.127',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M07',\n",
" 'periodName': 'July',\n",
" 'value': '136.392',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M06',\n",
" 'periodName': 'June',\n",
" 'value': '136.433',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M05',\n",
" 'periodName': 'May',\n",
" 'value': '136.216',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M04',\n",
" 'periodName': 'April',\n",
" 'value': '135.771',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M03',\n",
" 'periodName': 'March',\n",
" 'value': '135.375',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M02',\n",
" 'periodName': 'February',\n",
" 'value': '134.542',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M01',\n",
" 'periodName': 'January',\n",
" 'value': '134.017',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M12',\n",
" 'periodName': 'December',\n",
" 'value': '133.509',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M11',\n",
" 'periodName': 'November',\n",
" 'value': '133.596',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M10',\n",
" 'periodName': 'October',\n",
" 'value': '133.876',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M09',\n",
" 'periodName': 'September',\n",
" 'value': '134.255',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M08',\n",
" 'periodName': 'August',\n",
" 'value': '134.098',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M07',\n",
" 'periodName': 'July',\n",
" 'value': '133.919',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M06',\n",
" 'periodName': 'June',\n",
" 'value': '133.900',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M05',\n",
" 'periodName': 'May',\n",
" 'value': '133.626',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M04',\n",
" 'periodName': 'April',\n",
" 'value': '133.421',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M03',\n",
" 'periodName': 'March',\n",
" 'value': '133.558',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M02',\n",
" 'periodName': 'February',\n",
" 'value': '133.204',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M01',\n",
" 'periodName': 'January',\n",
" 'value': '132.137',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M12',\n",
" 'periodName': 'December',\n",
" 'value': '131.770',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M11',\n",
" 'periodName': 'November',\n",
" 'value': '132.208',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M10',\n",
" 'periodName': 'October',\n",
" 'value': '132.892',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M09',\n",
" 'periodName': 'September',\n",
" 'value': '132.988',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M08',\n",
" 'periodName': 'August',\n",
" 'value': '132.430',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M07',\n",
" 'periodName': 'July',\n",
" 'value': '131.731',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M06',\n",
" 'periodName': 'June',\n",
" 'value': '131.956',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M05',\n",
" 'periodName': 'May',\n",
" 'value': '132.154',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M04',\n",
" 'periodName': 'April',\n",
" 'value': '132.284',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M03',\n",
" 'periodName': 'March',\n",
" 'value': '131.905',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M02',\n",
" 'periodName': 'February',\n",
" 'value': '130.953',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M01',\n",
" 'periodName': 'January',\n",
" 'value': '130.438',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M12',\n",
" 'periodName': 'December',\n",
" 'value': '129.844',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M11',\n",
" 'periodName': 'November',\n",
" 'value': '130.196',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M10',\n",
" 'periodName': 'October',\n",
" 'value': '130.373',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M09',\n",
" 'periodName': 'September',\n",
" 'value': '130.635',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M08',\n",
" 'periodName': 'August',\n",
" 'value': '130.351',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M07',\n",
" 'periodName': 'July',\n",
" 'value': '129.983',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M06',\n",
" 'periodName': 'June',\n",
" 'value': '129.846',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M05',\n",
" 'periodName': 'May',\n",
" 'value': '129.999',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M04',\n",
" 'periodName': 'April',\n",
" 'value': '129.483',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M03',\n",
" 'periodName': 'March',\n",
" 'value': '128.585',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M02',\n",
" 'periodName': 'February',\n",
" 'value': '127.363',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M01',\n",
" 'periodName': 'January',\n",
" 'value': '126.778',\n",
" 'year': '2011'}],\n",
" 'seriesID': 'SUUR0000SA0'},\n",
" {'data': [{'footnotes': [{}],\n",
" 'period': 'M12',\n",
" 'periodName': 'December',\n",
" 'value': '234.812',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M11',\n",
" 'periodName': 'November',\n",
" 'value': '236.151',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M10',\n",
" 'periodName': 'October',\n",
" 'value': '237.433',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M09',\n",
" 'periodName': 'September',\n",
" 'value': '238.031',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M08',\n",
" 'periodName': 'August',\n",
" 'value': '237.852',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M07',\n",
" 'periodName': 'July',\n",
" 'value': '238.250',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M06',\n",
" 'periodName': 'June',\n",
" 'value': '238.343',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M05',\n",
" 'periodName': 'May',\n",
" 'value': '237.900',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M04',\n",
" 'periodName': 'April',\n",
" 'value': '237.072',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M03',\n",
" 'periodName': 'March',\n",
" 'value': '236.293',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M02',\n",
" 'periodName': 'February',\n",
" 'value': '234.781',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M01',\n",
" 'periodName': 'January',\n",
" 'value': '233.916',\n",
" 'year': '2014'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M12',\n",
" 'periodName': 'December',\n",
" 'value': '233.049',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M11',\n",
" 'periodName': 'November',\n",
" 'value': '233.069',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M10',\n",
" 'periodName': 'October',\n",
" 'value': '233.546',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M09',\n",
" 'periodName': 'September',\n",
" 'value': '234.149',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M08',\n",
" 'periodName': 'August',\n",
" 'value': '233.877',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M07',\n",
" 'periodName': 'July',\n",
" 'value': '233.596',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M06',\n",
" 'periodName': 'June',\n",
" 'value': '233.504',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M05',\n",
" 'periodName': 'May',\n",
" 'value': '232.945',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M04',\n",
" 'periodName': 'April',\n",
" 'value': '232.531',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M03',\n",
" 'periodName': 'March',\n",
" 'value': '232.773',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M02',\n",
" 'periodName': 'February',\n",
" 'value': '232.166',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M01',\n",
" 'periodName': 'January',\n",
" 'value': '230.280',\n",
" 'year': '2013'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M12',\n",
" 'periodName': 'December',\n",
" 'value': '229.601',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M11',\n",
" 'periodName': 'November',\n",
" 'value': '230.221',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M10',\n",
" 'periodName': 'October',\n",
" 'value': '231.317',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M09',\n",
" 'periodName': 'September',\n",
" 'value': '231.407',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M08',\n",
" 'periodName': 'August',\n",
" 'value': '230.379',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M07',\n",
" 'periodName': 'July',\n",
" 'value': '229.104',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M06',\n",
" 'periodName': 'June',\n",
" 'value': '229.478',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M05',\n",
" 'periodName': 'May',\n",
" 'value': '229.815',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M04',\n",
" 'periodName': 'April',\n",
" 'value': '230.085',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M03',\n",
" 'periodName': 'March',\n",
" 'value': '229.392',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M02',\n",
" 'periodName': 'February',\n",
" 'value': '227.663',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M01',\n",
" 'periodName': 'January',\n",
" 'value': '226.665',\n",
" 'year': '2012'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M12',\n",
" 'periodName': 'December',\n",
" 'value': '225.672',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M11',\n",
" 'periodName': 'November',\n",
" 'value': '226.230',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M10',\n",
" 'periodName': 'October',\n",
" 'value': '226.421',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M09',\n",
" 'periodName': 'September',\n",
" 'value': '226.889',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M08',\n",
" 'periodName': 'August',\n",
" 'value': '226.545',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M07',\n",
" 'periodName': 'July',\n",
" 'value': '225.922',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M06',\n",
" 'periodName': 'June',\n",
" 'value': '225.722',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M05',\n",
" 'periodName': 'May',\n",
" 'value': '225.964',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M04',\n",
" 'periodName': 'April',\n",
" 'value': '224.906',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M03',\n",
" 'periodName': 'March',\n",
" 'value': '223.467',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M02',\n",
" 'periodName': 'February',\n",
" 'value': '221.309',\n",
" 'year': '2011'},\n",
" {'footnotes': [{}],\n",
" 'period': 'M01',\n",
" 'periodName': 'January',\n",
" 'value': '220.223',\n",
" 'year': '2011'}],\n",
" 'seriesID': 'CUUR0000SA0'}]},\n",
" 'message': [],\n",
" 'responseTime': 73,\n",
" 'status': 'REQUEST_SUCCEEDED'}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"json_data"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Write one text table per series into <seriesID>.txt in the working directory.\n",
"for series in json_data['Results']['series']:\n",
"    x = prettytable.PrettyTable([\"series id\",\"year\",\"period\",\"value\",\"footnotes\"])\n",
"    seriesId = series['seriesID']\n",
"    for item in series['data']:\n",
"        period = item['period']\n",
"        # Only rows whose period code sorts between 'M01' and 'M12' are kept.\n",
"        if not ('M01' <= period <= 'M12'):\n",
"            continue\n",
"        # Empty footnote objects ({}) are skipped; the remaining texts are comma-separated.\n",
"        footnotes = ','.join(footnote['text'] for footnote in item['footnotes'] if footnote)\n",
"        x.add_row([seriesId, item['year'], period, item['value'], footnotes])\n",
"    # The with-block closes the file even if get_string() or write() raises.\n",
"    with open(seriesId + '.txt', 'w') as output:\n",
"        output.write(x.get_string())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,238 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Web scraping avec Python\n",
"## RoboBrowser\n",
"\n",
"- RoboBrowser est une librairie python qui permet de simuler un comportement de navigation dans un navigateur web.\n",
"- Elle permet aussi d'extraire des éléments précis d'information d'une page et de les structurer. \n",
"- C'est la plus populaire qui combine les avantages de `beautifulsoup` avec la flexibilité de `requests`. \n",
"- Elle est disponible pour Python 2.7 et Python 3.6"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"from robobrowser import RoboBrowser"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On crée un objet navigateur et on navigue vers une première URL"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"browser = RoboBrowser(history=True,parser=\"lxml\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"browser.open('https://meteo.gc.ca/')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Ouverture d'un formulaire à partir d'une de ses propriétés (ici, son identifiant unique)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"form = browser.get_form(id=\"cityjump\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"En affichant le formulaire, on peut voir les différents champs disponibles, ainsi que les valeurs par défaut qui leur sont attribuées, le cas échéant"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<RoboForm city=, lang=f, unit=>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"form"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"En utilisant la propriété `value`, on peut remplir le formulaire"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"form['city'].value = 'Lévis'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On envoie ensuite le formulaire, comme si on cliquait sur le bouton d'envoi"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"browser.submit_form(form)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On obtient le résultat de la page suivante dans le navigateur. À partir de ce résultat, on peut extraire différents éléments en utilisant des sélecteurs `CSS`."
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# First .col-sm-10 element -> its first dl.mrgn-bttm-0 -> every dd.mrgn-bttm-0 inside it.\n",
"location_xml = (\n",
"    browser.select('.col-sm-10')[0]\n",
"    .select('dl.mrgn-bttm-0')[0]\n",
"    .select('dd.mrgn-bttm-0')\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Lorsqu'un motif se répète, on préfère alors créer une fonction qui va extraire l'élément selon des paramètres. La librairie RoboBrowser ne gère pas les nœuds enfants de la structure XML (sélecteur `:nth-child()`), mais on peut utiliser les listes de Python pour répliquer un comportement similaire."
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"def temperature_xml(jour):\n",
"    \"\"\"Return the <span> element the notebook reads a temperature from.\n",
"\n",
"    Walks the page currently loaded in the global `browser`: the `jour`-th\n",
"    `div.div-column` match, then the second `div` found inside it, then its\n",
"    first `p.mrgn-bttm-0`, then that paragraph's first `span`.\n",
"    Raises IndexError when any of those levels has too few matches.\n",
"    \"\"\"\n",
"    colonne = browser.select('div.div-column')[jour]\n",
"    bloc = colonne.select('div')[1]\n",
"    paragraphe = bloc.select('p.mrgn-bttm-0')[0]\n",
"    return paragraphe.select('span')[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On peut assembler les données extraites dans un dictionnaire python et les utiliser dans son application."
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'city': 'Beauport',\n",
" 'date': '16h00 HNE le vendredi 8 décembre 2017',\n",
" 'temperature': ['-2°C\\n', '-4°C\\n', '-13°C\\n', '-9°C\\n', '-8°C\\n', '-11°C\\n']}"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"{'city': location_xml[0].text, \n",
" 'date': location_xml[1].text,\n",
" 'temperature': [temperature_xml(i).text for i in range(1,7)]}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

633
BLS.ipynb Normal file

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,261 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Web scraping avec Python\n",
"## RoboBrowser\n",
"\n",
"- RoboBrowser est une librairie python qui permet de simuler un comportement de navigation dans un navigateur web.\n",
"- Elle permet aussi d'extraire des éléments précis d'information d'une page et de les structurer. \n",
"- C'est la plus populaire qui combine les avantages de `beautifulsoup` avec la flexibilité de `requests`. \n",
"- Elle est disponible pour Python 2.7 et Python 3.6"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"from robobrowser import RoboBrowser"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On crée un objet navigateur et on navigue vers une première URL"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"browser = RoboBrowser(history=True,parser=\"lxml\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"browser.open('https://meteo.gc.ca/')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Ouverture d'un formulaire à partir d'une de ses propriétés (ici, son identifiant unique)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"form = browser.get_form(id=\"cityjump\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"En affichant le formulaire, on peut voir les différents champs disponibles, ainsi que les valeurs par défaut qui leur sont attribuées, le cas échéant"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<RoboForm city=, lang=f, unit=>"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"form"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"En utilisant la propriété `value`, on peut remplir le formulaire"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"form['city'].value = 'Québec'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On envoie ensuite le formulaire, comme si on cliquait sur le bouton d'envoi"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"browser.submit_form(form)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'dl.mrgn-bttm-0'"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"dl.mrgn-bttm-0\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On obtient le résultat de la page suivante dans le navigateur. À partir de ce résultat, on peut extraire différents éléments en utilisant des sélecteurs `CSS`."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# First .col-sm-10 element -> its first dl.mrgn-bttm-0 -> every dd.mrgn-bttm-0 inside it.\n",
"location_xml = (\n",
"    browser.select('.col-sm-10')[0]\n",
"    .select('dl.mrgn-bttm-0')[0]\n",
"    .select('dd.mrgn-bttm-0')\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Lorsqu'un motif se répète, on préfère alors créer une fonction qui va extraire l'élément selon des paramètres. La librairie RoboBrowser ne gère pas les nœuds enfants de la structure XML (sélecteur `:nth-child()`), mais on peut utiliser les listes de Python pour répliquer un comportement similaire."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"div.div-column:nth-child(4) > div:nth-child(2) > p:nth-child(2) > span:nth-child(1)\""
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"def temperature_xml(jour):\n",
"    \"\"\"Return the <span> element the notebook reads a temperature from.\n",
"\n",
"    Walks the page currently loaded in the global `browser`: the `jour`-th\n",
"    `div.div-column` match, then the second `div` found inside it, then its\n",
"    first `p.mrgn-bttm-0`, then that paragraph's first `span`.\n",
"    Raises IndexError when any of those levels has too few matches.\n",
"    \"\"\"\n",
"    colonne = browser.select('div.div-column')[jour]\n",
"    bloc = colonne.select('div')[1]\n",
"    paragraphe = bloc.select('p.mrgn-bttm-0')[0]\n",
"    return paragraphe.select('span')[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On peut assembler les données extraites dans un dictionnaire python et les utiliser dans son application."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'city': 'Aéroport int. Lesage de Québec',\n",
" 'date': '13h00 HNE le samedi 9 décembre 2017',\n",
" 'temperature': ['-1°C\\n',\n",
" '-2°C\\n',\n",
" '-13°C\\n',\n",
" '-4°C\\n',\n",
" '-8°C\\n',\n",
" '-11°C\\n',\n",
" '-11°C\\n']}"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"{'city': location_xml[0].text, \n",
" 'date': location_xml[1].text,\n",
" 'temperature': [temperature_xml(i).text for i in range(0,7)]}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}