{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Ce code sert à extraire des publications d'une page Facebook et les enregistrer dans ElasticSearch\n", "\n", "Reesources: \n", "\n", "- Facebook SDK pour Python https://github.com/mobolic/facebook-sdk\n", "- Documentation Facebook du feed https://developers.facebook.com/docs/graph-api/reference/v2.8/page/feed\n", "- Documentation Facebook des commentaires https://developers.facebook.com/docs/graph-api/reference/v2.8/object/comments\n", "- Aller chercher un access token pour Facebook http://stackoverflow.com/a/26844734\n", "- Python datetime: https://docs.python.org/2/library/datetime.html\n", "- Python FB Pagination https://stackoverflow.com/questions/28589239/python-facebook-api-cursor-pagination" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "import requests\n", "import facebook\n", "from elasticsearch import Elasticsearch\n", "from datetime import datetime\n", "from datetime import timedelta" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "app_id=''\n", "app_secret=''" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "def get_fb_token(app_id, app_secret): \n", " payload = {'grant_type': 'client_credentials', 'client_id': app_id, 'client_secret': app_secret}\n", " file = requests.post('https://graph.facebook.com/oauth/access_token?', params = payload)\n", " #print file.text #to test what the FB api responded with \n", " result = file.text.split(\"=\")[1]\n", " #print file.text #to test the TOKEN\n", " return result" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "access_token=get_fb_token(app_id,app_secret)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "week_delta = timedelta(days=7)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true, "scrolled": false }, "outputs": [], "source": [ "es = Elasticsearch()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "{u'acknowledged': True, u'shards_acknowledged': True}" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "es.indices.create(index='fb_page_post', ignore=400)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "#es.indices.delete(index='fb_page_post')" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true, "scrolled": false }, "outputs": [], "source": [ "graph = facebook.GraphAPI(access_token=access_token, version='2.7')" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true, "scrolled": false }, "outputs": [], "source": [ "page_obj = graph.get_object('PartiConservateurDuQuebec')" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true, "scrolled": false }, "outputs": [], "source": [ "def index_post(post):\n", " es.index(\n", " index=\"fb_page_post\",\n", " doc_type=\"fb_page_post_data\", \n", " id=post[u'id'],\n", " timestamp=post[u'created_time'],\n", " body=post)\n", "def index_comments(post):\n", " es.index(\n", " index=\"fb_page_post\",\n", " doc_type=\"fb_page_post_comment\", \n", " id=post[u'id'],\n", " timestamp=post[u'created_time'],\n", " body=post)\n", "def index_like(post):\n", " es.index(\n", " index=\"fb_page_post\",\n", " doc_type=\"fb_page_post_like\", \n", " id=post[u'like_id'],\n", " timestamp=post[u'like_time'],\n", " body=post)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "def getfbpostsfrompage(fb_graph,page_id,field_list,time_since,time_until):\n", " all_posts = []\n", " res = fb_graph.get_object('/'+\n", " page_id+\n", " '/posts?fields='+','.join(field_list)+\n", " '&since='+time_since+\n", " '&until='+time_until)\n", " while(True):\n", " try:\n", " for page in res[u'data']:\n", " all_posts.append(page)\n", " res=requests.get(res['paging']['next']).json()\n", " except KeyError:\n", " break\n", " return all_posts" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "scrolled": false }, "outputs": [], "source": [ "#TEST\n", "ttt = getfbpostsfrompage(graph,\n", " page_obj[u'id'],\n", " ['id','created_time'],\n", " (datetime.now().date()-week_delta).isoformat(),\n", " datetime.now().date().isoformat())" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "scrolled": false }, "outputs": [], "source": [ "#TEST\n", "ttt" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "def getpostmetacomplet(fb_graph,post_id,field_list):\n", " post_meta_complet = fb_graph.get_object('/'+\n", " post_id+\n", " '?fields='+','.join(field_list))\n", " return post_meta_complet" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "scrolled": false }, "outputs": [], "source": [ "#TEST\n", "ppp_complet = getpostmetacomplet(graph,ttt[0][u'id'],['message','created_time','id','status_type','shares','link','via'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "#TEST\n", "ppp_complet" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "def getpostreact(fb_graph,post_id,field_list,react_type,filter_type):\n", " res = fb_graph.get_object('/'+post_id+\n", " '/'+react_type+'/?fields='+','.join(field_list)+\n", " '&filter='+filter_type)\n", " all_comments = []\n", " while(True):\n", " try:\n", " for comment in res[u'data']:\n", " all_comments.append(comment)\n", " res=requests.get(res[u'paging'][u'next']).json()\n", " except KeyError:\n", " break\n", " return all_comments" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true, "scrolled": true }, "outputs": [], "source": [ "def dict_update(l,x):\n", " l.update(x)\n", " return l" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "scrolled": false }, "outputs": [], "source": [ "#TEST\n", "ccc = getpostreact(graph,ttt[0][u'id'],['id'],'comments','stream')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "#TEST\n", "[x[u'id'] for x in ccc[1:10]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "#TEST\n", "[getpostmetacomplet(graph,x[u'id'],['id','from','message','created_time','comment_count','like_count','parent']) for x in ccc[1:10]] " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "#TEST\n", "rrr = getpostreact(graph,ttt[0][u'id'],['id','name'],'likes','stream')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "#TEST\n", "rrr[0]['id']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "#TEST\n", "ttt[0][u'id']+'_'+rrr[0]['id']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "#TEST\n", "rrr[0]['id']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "#TEST\n", "like_dicts = [dict_update(x,{'like_time':ttt[0][u'created_time'],\n", " 'like_id':ttt[0][u'id']+'_'+x['id']}) for x in rrr]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "#TEST\n", "like_dicts[1:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "#TEST\n", "ppp_complet.update({u'likes':rrr})" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "#TEST\n", "ppp_complet" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "res = getfbpostsfrompage(graph,\n", " page_obj[u'id'],\n", " ['id','created_time'],\n", " (datetime.now().date()-week_delta).isoformat(),\n", " datetime.now().date().isoformat())" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "for pp in res:\n", " # Post\n", " post_complet = getpostmetacomplet(graph,\n", " pp[u'id'],\n", " ['message','created_time','id',\n", " 'status_type','shares','link',\n", " 'via'])\n", " # Like\n", " all_post_likes = getpostreact(graph,pp[u'id'],\n", " ['id','name'],\n", " 'likes',\n", " 'stream')\n", " like_count = len(all_post_likes)\n", " post_complet.update({u'like_count':like_count})\n", " # Sauvegarde des \"post\"\n", " index_post(post_complet)\n", " # Sauvegarde des \"like\"\n", " like_dicts = [dict_update(x,{u'like_time':pp['created_time'],\n", " u'like_id':pp[u'id']+'_'+x['id']}) for x in all_post_likes]\n", " for l in like_dicts:\n", " index_like(l)\n", " # Comments\n", " res_comments = getpostreact(graph,pp[u'id'],['id'],'comments','stream')\n", " for cc in res_comments:\n", " comment_complet = getpostmetacomplet(graph,\n", " cc[u'id'],\n", " ['id','from','message',\n", " 'created_time','comment_count','like_count',\n", " 'parent'])\n", " # Sauvegarde des \"comments\"\n", " index_comments(comment_complet)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }