premiere version code fonctionnel

This commit is contained in:
Francois Pelletier 2016-12-30 01:28:26 -05:00
commit 14cc4a39fa
2 changed files with 567 additions and 0 deletions

4
.gitignore vendored Normal file
View file

@ -0,0 +1,4 @@
creds
.ipynb_checkpoints
.ipynb_checkpoints/*
.Rhistory

563
facebook_app.ipynb Normal file
View file

@ -0,0 +1,563 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Ce code sert à extraire des publications d'une page Facebook et les enregistrer dans ElasticSearch\n",
"\n",
"Ressources: \n",
"\n",
"- Facebook SDK pour Python https://github.com/mobolic/facebook-sdk\n",
"- Documentation Facebook du feed https://developers.facebook.com/docs/graph-api/reference/v2.8/page/feed\n",
"- Documentation Facebook des commentaires https://developers.facebook.com/docs/graph-api/reference/v2.8/object/comments\n",
"- Aller chercher un access token pour Facebook http://stackoverflow.com/a/26844734\n",
"- Python datetime: https://docs.python.org/2/library/datetime.html\n",
"- Python FB Pagination https://stackoverflow.com/questions/28589239/python-facebook-api-cursor-pagination"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"import os\n",
"from datetime import datetime\n",
"from datetime import timedelta\n",
"\n",
"import requests\n",
"import facebook\n",
"from elasticsearch import Elasticsearch"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"app_id=''\n",
"app_secret=''"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"def get_fb_token(app_id, app_secret):\n",
"    \"\"\"Request an app access token from the Facebook Graph API.\n",
"\n",
"    The app id and secret are sent to the OAuth endpoint with the\n",
"    ``client_credentials`` grant.\n",
"\n",
"    Graph API v2.3 and later answer with a JSON object holding an\n",
"    ``access_token`` key; older versions answered with a plain-text\n",
"    ``access_token=<token>`` body. Both formats are understood.\n",
"\n",
"    Args:\n",
"        app_id: Facebook application id.\n",
"        app_secret: Facebook application secret.\n",
"\n",
"    Returns:\n",
"        The access token string.\n",
"\n",
"    Raises:\n",
"        ValueError: if the response carries no access token (for\n",
"            instance because Facebook returned an error payload).\n",
"    \"\"\"\n",
"    payload = {'grant_type': 'client_credentials',\n",
"               'client_id': app_id,\n",
"               'client_secret': app_secret}\n",
"    response = requests.post('https://graph.facebook.com/oauth/access_token?',\n",
"                             params=payload)\n",
"    try:\n",
"        token = response.json().get('access_token')\n",
"    except ValueError:\n",
"        # Legacy plain-text body. Split each pair on the first '=' only,\n",
"        # because the token itself may contain '=' characters.\n",
"        pairs = [pair.split('=', 1)\n",
"                 for pair in response.text.split('&') if '=' in pair]\n",
"        token = dict(pairs).get('access_token')\n",
"    if not token:\n",
"        raise ValueError('No access token in Facebook response: ' + response.text)\n",
"    return token"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"access_token=get_fb_token(app_id,app_secret)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"week_delta = timedelta(days=7)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true,
"scrolled": false
},
"outputs": [],
"source": [
"es = Elasticsearch()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"{u'acknowledged': True, u'shards_acknowledged': True}"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"es.indices.create(index='fb_page_post', ignore=400)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"#es.indices.delete(index='fb_page_post')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true,
"scrolled": false
},
"outputs": [],
"source": [
"graph = facebook.GraphAPI(access_token=access_token, version='2.7')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true,
"scrolled": false
},
"outputs": [],
"source": [
"page_obj = graph.get_object('PartiConservateurDuQuebec')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true,
"scrolled": false
},
"outputs": [],
"source": [
"def index_post(post):\n",
"    \"\"\"Index one page post into the ``fb_page_post`` index.\n",
"\n",
"    The document id is the post's ``id``, its timestamp is the post's\n",
"    ``created_time`` and the whole ``post`` dict is stored as the body.\n",
"    \"\"\"\n",
"    request = {\n",
"        'index': 'fb_page_post',\n",
"        'doc_type': 'fb_page_post_data',\n",
"        'id': post[u'id'],\n",
"        'timestamp': post[u'created_time'],\n",
"        'body': post,\n",
"    }\n",
"    es.index(**request)\n",
"def index_comments(post):\n",
"    \"\"\"Index one comment into the ``fb_page_post`` index.\n",
"\n",
"    ``post`` is the comment dict: its ``id`` becomes the document id,\n",
"    its ``created_time`` the timestamp, and the dict itself the body.\n",
"    \"\"\"\n",
"    request = {\n",
"        'index': 'fb_page_post',\n",
"        'doc_type': 'fb_page_post_comment',\n",
"        'id': post[u'id'],\n",
"        'timestamp': post[u'created_time'],\n",
"        'body': post,\n",
"    }\n",
"    es.index(**request)\n",
"def index_like(post):\n",
"    \"\"\"Index one like into the ``fb_page_post`` index.\n",
"\n",
"    ``post`` is the like dict: its ``like_id`` becomes the document id,\n",
"    its ``like_time`` the timestamp, and the dict itself the body.\n",
"    \"\"\"\n",
"    request = {\n",
"        'index': 'fb_page_post',\n",
"        'doc_type': 'fb_page_post_like',\n",
"        'id': post[u'like_id'],\n",
"        'timestamp': post[u'like_time'],\n",
"        'body': post,\n",
"    }\n",
"    es.index(**request)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"def getfbpostsfrompage(fb_graph,page_id,field_list,time_since,time_until):\n",
" all_posts = []\n",
" res = fb_graph.get_object('/'+\n",
" page_id+\n",
" '/posts?fields='+','.join(field_list)+\n",
" '&since='+time_since+\n",
" '&until='+time_until)\n",
" while(True):\n",
" try:\n",
" for page in res[u'data']:\n",
" all_posts.append(page)\n",
" res=requests.get(res['paging']['next']).json()\n",
" except KeyError:\n",
" break\n",
" return all_posts"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": false
},
"outputs": [],
"source": [
"#TEST\n",
"ttt = getfbpostsfrompage(graph,\n",
" page_obj[u'id'],\n",
" ['id','created_time'],\n",
" (datetime.now().date()-week_delta).isoformat(),\n",
" datetime.now().date().isoformat())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": false
},
"outputs": [],
"source": [
"#TEST\n",
"ttt"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"def getpostmetacomplet(fb_graph,post_id,field_list):\n",
" post_meta_complet = fb_graph.get_object('/'+\n",
" post_id+\n",
" '?fields='+','.join(field_list))\n",
" return post_meta_complet"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": false
},
"outputs": [],
"source": [
"#TEST\n",
"ppp_complet = getpostmetacomplet(graph,ttt[0][u'id'],['message','created_time','id','status_type','shares','link','via'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"#TEST\n",
"ppp_complet"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"def getpostreact(fb_graph,post_id,field_list,react_type,filter_type):\n",
" res = fb_graph.get_object('/'+post_id+\n",
" '/'+react_type+'/?fields='+','.join(field_list)+\n",
" '&filter='+filter_type)\n",
" all_comments = []\n",
" while(True):\n",
" try:\n",
" for comment in res[u'data']:\n",
" all_comments.append(comment)\n",
" res=requests.get(res[u'paging'][u'next']).json()\n",
" except KeyError:\n",
" break\n",
" return all_comments"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true,
"scrolled": true
},
"outputs": [],
"source": [
"def dict_update(l,x):\n",
" l.update(x)\n",
" return l"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": false
},
"outputs": [],
"source": [
"#TEST\n",
"ccc = getpostreact(graph,ttt[0][u'id'],['id'],'comments','stream')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"#TEST\n",
"[x[u'id'] for x in ccc[1:10]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"#TEST\n",
"[getpostmetacomplet(graph,x[u'id'],['id','from','message','created_time','comment_count','like_count','parent']) for x in ccc[1:10]] "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"#TEST\n",
"rrr = getpostreact(graph,ttt[0][u'id'],['id','name'],'likes','stream')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"#TEST\n",
"rrr[0]['id']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"#TEST\n",
"ttt[0][u'id']+'_'+rrr[0]['id']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"#TEST\n",
"rrr[0]['id']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"#TEST\n",
"like_dicts = [dict_update(x,{'like_time':ttt[0][u'created_time'],\n",
" 'like_id':ttt[0][u'id']+'_'+x['id']}) for x in rrr]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"#TEST\n",
"like_dicts[1:5]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"#TEST\n",
"ppp_complet.update({u'likes':rrr})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"#TEST\n",
"ppp_complet"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"# Fetch the id and creation time of the posts published over the last week.\n",
"# The clock is read once so both bounds derive from the same day, even\n",
"# when the cell happens to run across midnight.\n",
"today = datetime.now().date()\n",
"res = getfbpostsfrompage(graph,\n",
"                         page_obj[u'id'],\n",
"                         ['id','created_time'],\n",
"                         (today - week_delta).isoformat(),\n",
"                         today.isoformat())"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"# Fields requested for every post and for every comment.\n",
"post_fields = ['message','created_time','id',\n",
"               'status_type','shares','link',\n",
"               'via']\n",
"comment_fields = ['id','from','message',\n",
"                  'created_time','comment_count','like_count',\n",
"                  'parent']\n",
"\n",
"for pp in res:\n",
"    # Post: fetch the full metadata.\n",
"    post_complet = getpostmetacomplet(graph, pp[u'id'], post_fields)\n",
"    # Likes: fetch them all; their number is stored on the post.\n",
"    all_post_likes = getpostreact(graph, pp[u'id'],\n",
"                                  ['id','name'],\n",
"                                  'likes',\n",
"                                  'stream')\n",
"    post_complet.update({u'like_count': len(all_post_likes)})\n",
"    # Save the post.\n",
"    index_post(post_complet)\n",
"    # Save the likes. The post's creation time is used as the like time,\n",
"    # and the like id combines the post id with the id of the like entry.\n",
"    for like in all_post_likes:\n",
"        like.update({u'like_time': pp['created_time'],\n",
"                     u'like_id': pp[u'id'] + '_' + like['id']})\n",
"        index_like(like)\n",
"    # Comments: request the comment fields directly on the comments edge\n",
"    # instead of issuing one extra request per comment.\n",
"    res_comments = getpostreact(graph, pp[u'id'], comment_fields,\n",
"                                'comments', 'stream')\n",
"    for comment_complet in res_comments:\n",
"        # Save the comment.\n",
"        index_comments(comment_complet)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}