564 lines
13 KiB
Text
564 lines
13 KiB
Text
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Ce code sert à extraire des publications d'une page Facebook et à les enregistrer dans Elasticsearch.\n",
|
||
|
"\n",
|
||
|
"Ressources :\n",
|
||
|
"\n",
|
||
|
"- Facebook SDK pour Python https://github.com/mobolic/facebook-sdk\n",
|
||
|
"- Documentation Facebook du feed https://developers.facebook.com/docs/graph-api/reference/v2.8/page/feed\n",
|
||
|
"- Documentation Facebook des commentaires https://developers.facebook.com/docs/graph-api/reference/v2.8/object/comments\n",
|
||
|
"- Aller chercher un access token pour Facebook http://stackoverflow.com/a/26844734\n",
|
||
|
"- Python datetime: https://docs.python.org/2/library/datetime.html\n",
|
||
|
"- Python FB Pagination https://stackoverflow.com/questions/28589239/python-facebook-api-cursor-pagination"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 1,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import requests\n",
|
||
|
"import facebook\n",
|
||
|
"from elasticsearch import Elasticsearch\n",
|
||
|
"from datetime import datetime\n",
|
||
|
"from datetime import timedelta"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 2,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"app_id=''\n",
|
||
|
"app_secret=''"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 3,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"def get_fb_token(app_id, app_secret):\n",
"    \"\"\"Fetch an application access token from the Facebook Graph API.\n",
"\n",
"    Sends a ``client_credentials`` grant to the OAuth endpoint.\n",
"\n",
"    Args:\n",
"        app_id: Facebook application id, sent as ``client_id``.\n",
"        app_secret: Facebook application secret, sent as ``client_secret``.\n",
"\n",
"    Returns:\n",
"        The access token string found in the response.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: if the endpoint answers with an HTTP error status.\n",
"        ValueError: if the response body contains no access token.\n",
"    \"\"\"\n",
"    payload = {'grant_type': 'client_credentials',\n",
"               'client_id': app_id,\n",
"               'client_secret': app_secret}\n",
"    # Named `response` because `file` shadows the Python 2 builtin.\n",
"    response = requests.post('https://graph.facebook.com/oauth/access_token',\n",
"                             params=payload)\n",
"    response.raise_for_status()\n",
"\n",
"    # Graph API v2.3 and later answer with JSON: {\"access_token\": \"...\"}.\n",
"    try:\n",
"        token = response.json().get('access_token')\n",
"    except (ValueError, AttributeError):\n",
"        # Body is not JSON, or is JSON but not an object.\n",
"        token = None\n",
"\n",
"    if not token:\n",
"        # Older versions answer with a query string such as\n",
"        # access_token=...&expires=...; keep only the token value.\n",
"        for pair in response.text.split('&'):\n",
"            key, _, value = pair.partition('=')\n",
"            if key == 'access_token':\n",
"                token = value\n",
"\n",
"    if not token:\n",
"        raise ValueError('No access token found in the Facebook response')\n",
"    return token"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 4,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"access_token=get_fb_token(app_id,app_secret)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 5,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"week_delta = timedelta(days=7)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 6,
|
||
|
"metadata": {
|
||
|
"collapsed": true,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"es = Elasticsearch()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 11,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"{u'acknowledged': True, u'shards_acknowledged': True}"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 11,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"es.indices.create(index='fb_page_post', ignore=400)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 10,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#es.indices.delete(index='fb_page_post')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 18,
|
||
|
"metadata": {
|
||
|
"collapsed": true,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"graph = facebook.GraphAPI(access_token=access_token, version='2.7')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 19,
|
||
|
"metadata": {
|
||
|
"collapsed": true,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"page_obj = graph.get_object('PartiConservateurDuQuebec')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 12,
|
||
|
"metadata": {
|
||
|
"collapsed": true,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"def _index_document(doc_type, doc_id, timestamp, body):\n",
"    \"\"\"Write one document to the ``fb_page_post`` index via the global ``es`` client.\n",
"\n",
"    Shared by the three ``index_*`` functions below, which differ only in the\n",
"    document type and in the keys that hold the id and the timestamp.\n",
"    \"\"\"\n",
"    es.index(\n",
"        index=\"fb_page_post\",\n",
"        doc_type=doc_type,\n",
"        id=doc_id,\n",
"        timestamp=timestamp,\n",
"        body=body)\n",
"\n",
"\n",
"def index_post(post):\n",
"    \"\"\"Index a post dict, keyed by its ``id``, timestamped with ``created_time``.\"\"\"\n",
"    _index_document(\"fb_page_post_data\",\n",
"                    post[u'id'], post[u'created_time'], post)\n",
"\n",
"\n",
"def index_comments(post):\n",
"    \"\"\"Index a comment dict, keyed by its ``id``, timestamped with ``created_time``.\"\"\"\n",
"    _index_document(\"fb_page_post_comment\",\n",
"                    post[u'id'], post[u'created_time'], post)\n",
"\n",
"\n",
"def index_like(post):\n",
"    \"\"\"Index a like dict, keyed by its ``like_id``, timestamped with ``like_time``.\"\"\"\n",
"    _index_document(\"fb_page_post_like\",\n",
"                    post[u'like_id'], post[u'like_time'], post)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 13,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"def getfbpostsfrompage(fb_graph,page_id,field_list,time_since,time_until):\n",
"    \"\"\"Collect the posts of a page published inside a time window.\n",
"\n",
"    Args:\n",
"        fb_graph: object exposing ``get_object(id, **args)``; the notebook\n",
"            passes a ``facebook.GraphAPI`` instance.\n",
"        page_id: id of the page whose ``posts`` edge is read.\n",
"        field_list: field names, joined with commas into ``fields``.\n",
"        time_since: lower bound, sent as the ``since`` parameter.\n",
"        time_until: upper bound, sent as the ``until`` parameter.\n",
"\n",
"    Returns:\n",
"        A list holding the ``data`` items of every result page, in order.\n",
"        Paging stops at the first response that has no ``data`` key or no\n",
"        ``paging.next`` link.\n",
"    \"\"\"\n",
"    # Pass the query as keyword arguments so the values are sent as request\n",
"    # parameters instead of being spliced unescaped into the path.\n",
"    res = fb_graph.get_object(page_id + '/posts',\n",
"                              fields=','.join(field_list),\n",
"                              since=time_since,\n",
"                              until=time_until)\n",
"    all_posts = []\n",
"    while True:\n",
"        if u'data' not in res:\n",
"            break\n",
"        all_posts.extend(res[u'data'])\n",
"        next_url = res.get(u'paging', {}).get(u'next')\n",
"        if not next_url:\n",
"            break\n",
"        # Follow the ready-made URL of the next result page.\n",
"        res = requests.get(next_url).json()\n",
"    return all_posts"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": true,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#TEST\n",
|
||
|
"ttt = getfbpostsfrompage(graph,\n",
|
||
|
" page_obj[u'id'],\n",
|
||
|
" ['id','created_time'],\n",
|
||
|
" (datetime.now().date()-week_delta).isoformat(),\n",
|
||
|
" datetime.now().date().isoformat())"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": true,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#TEST\n",
|
||
|
"ttt"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 14,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"def getpostmetacomplet(fb_graph,post_id,field_list):\n",
"    \"\"\"Fetch one Graph API object, restricted to the requested fields.\n",
"\n",
"    Args:\n",
"        fb_graph: object exposing ``get_object(id, **args)``; the notebook\n",
"            passes a ``facebook.GraphAPI`` instance.\n",
"        post_id: id of the object to read (the notebook uses it for posts\n",
"            and for comments).\n",
"        field_list: field names, joined with commas into ``fields``.\n",
"\n",
"    Returns:\n",
"        Whatever ``fb_graph.get_object`` returns for that object.\n",
"    \"\"\"\n",
"    # Keyword argument instead of a hand-built '?fields=' suffix, so the\n",
"    # value is sent as a request parameter.\n",
"    return fb_graph.get_object(post_id, fields=','.join(field_list))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": true,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#TEST\n",
|
||
|
"ppp_complet = getpostmetacomplet(graph,ttt[0][u'id'],['message','created_time','id','status_type','shares','link','via'])"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#TEST\n",
|
||
|
"ppp_complet"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 15,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"def getpostreact(fb_graph,post_id,field_list,react_type,filter_type):\n",
"    \"\"\"Collect every item of one edge of a post (the notebook reads\n",
"    ``likes`` and ``comments``).\n",
"\n",
"    Args:\n",
"        fb_graph: object exposing ``get_object(id, **args)``; the notebook\n",
"            passes a ``facebook.GraphAPI`` instance.\n",
"        post_id: id of the post whose edge is read.\n",
"        field_list: field names, joined with commas into ``fields``.\n",
"        react_type: name of the edge, appended to the post id.\n",
"        filter_type: value sent as the ``filter`` parameter.\n",
"\n",
"    Returns:\n",
"        A list holding the ``data`` items of every result page, in order.\n",
"        Paging stops at the first response that has no ``data`` key or no\n",
"        ``paging.next`` link.\n",
"    \"\"\"\n",
"    # Pass the query as keyword arguments so the values are sent as request\n",
"    # parameters instead of being spliced unescaped into the path.\n",
"    res = fb_graph.get_object(post_id + '/' + react_type,\n",
"                              fields=','.join(field_list),\n",
"                              filter=filter_type)\n",
"    all_comments = []\n",
"    while True:\n",
"        if u'data' not in res:\n",
"            break\n",
"        all_comments.extend(res[u'data'])\n",
"        next_url = res.get(u'paging', {}).get(u'next')\n",
"        if not next_url:\n",
"            break\n",
"        # Follow the ready-made URL of the next result page.\n",
"        res = requests.get(next_url).json()\n",
"    return all_comments"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 16,
|
||
|
"metadata": {
|
||
|
"collapsed": true,
|
||
|
"scrolled": true
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"def dict_update(l,x):\n",
"    \"\"\"Merge ``x`` into ``l`` in place and return ``l`` itself.\n",
"\n",
"    ``dict.update`` returns ``None``; this wrapper returns the updated\n",
"    mapping so the call can be used as an expression, for example inside\n",
"    a list comprehension.\n",
"    \"\"\"\n",
"    target = l\n",
"    target.update(x)\n",
"    return target"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": true,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#TEST\n",
|
||
|
"ccc = getpostreact(graph,ttt[0][u'id'],['id'],'comments','stream')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#TEST\n",
|
||
|
"[x[u'id'] for x in ccc[1:10]]"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#TEST\n",
|
||
|
"[getpostmetacomplet(graph,x[u'id'],['id','from','message','created_time','comment_count','like_count','parent']) for x in ccc[1:10]] "
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#TEST\n",
|
||
|
"rrr = getpostreact(graph,ttt[0][u'id'],['id','name'],'likes','stream')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": true
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#TEST\n",
|
||
|
"rrr[0]['id']"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": true
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#TEST\n",
|
||
|
"ttt[0][u'id']+'_'+rrr[0]['id']"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": true
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#TEST\n",
|
||
|
"rrr[0]['id']"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": true
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#TEST\n",
|
||
|
"like_dicts = [dict_update(x,{'like_time':ttt[0][u'created_time'],\n",
|
||
|
" 'like_id':ttt[0][u'id']+'_'+x['id']}) for x in rrr]"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": true
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#TEST\n",
|
||
|
"like_dicts[1:5]"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#TEST\n",
|
||
|
"ppp_complet.update({u'likes':rrr})"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#TEST\n",
|
||
|
"ppp_complet"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 22,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": true
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"res = getfbpostsfrompage(graph,\n",
|
||
|
" page_obj[u'id'],\n",
|
||
|
" ['id','created_time'],\n",
|
||
|
" (datetime.now().date()-week_delta).isoformat(),\n",
|
||
|
" datetime.now().date().isoformat())"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 23,
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Field lists requested from the Graph API for each kind of object.\n",
"post_fields = ['message','created_time','id','status_type','shares','link','via']\n",
"like_fields = ['id','name']\n",
"comment_fields = ['id','from','message','created_time','comment_count','like_count','parent']\n",
"\n",
"for post_ref in res:\n",
"    post_id = post_ref[u'id']\n",
"\n",
"    # Post: full metadata, plus the number of likes counted from the likes edge.\n",
"    post_complet = getpostmetacomplet(graph, post_id, post_fields)\n",
"    all_post_likes = getpostreact(graph, post_id, like_fields, 'likes', 'stream')\n",
"    post_complet[u'like_count'] = len(all_post_likes)\n",
"    # Save the post.\n",
"    index_post(post_complet)\n",
"\n",
"    # Likes: each like is stamped with the creation time of its post and with\n",
"    # an id made of the post id and the id of the liking account.\n",
"    like_dicts = [dict_update(like, {u'like_time': post_ref['created_time'],\n",
"                                     u'like_id': post_id + '_' + like['id']})\n",
"                  for like in all_post_likes]\n",
"    # Save the likes.\n",
"    for like_doc in like_dicts:\n",
"        index_like(like_doc)\n",
"\n",
"    # Comments: list the ids first, then fetch each comment in full.\n",
"    res_comments = getpostreact(graph, post_id, ['id'], 'comments', 'stream')\n",
"    for comment_ref in res_comments:\n",
"        comment_complet = getpostmetacomplet(graph, comment_ref[u'id'], comment_fields)\n",
"        # Save the comment.\n",
"        index_comments(comment_complet)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": true
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": []
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"anaconda-cloud": {},
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python [default]",
|
||
|
"language": "python",
|
||
|
"name": "python2"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 2
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython2",
|
||
|
"version": "2.7.12"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 1
|
||
|
}
|