Snippets
Created by Andres Vargas - zodman
import urllib2
import json
import datetime
import csv
import time
from gooey import Gooey, GooeyParser
import os
import sys

reload(sys)
sys.setdefaultencoding('utf8')

# Frozen executables (e.g. built with PyInstaller) buffer stdout, so switch
# to an unbuffered stream to keep progress messages flowing.
if getattr(sys, "frozen", None):
    nonbuffered_stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
    sys.stdout = nonbuffered_stdout
DIR_PATH = getattr(sys, 'frozen', None) or os.path.dirname(os.path.abspath(__file__))
app_id = "347243152136431"
app_secret = "XXXX" # DO NOT SHARE WITH ANYONE!
access_token = app_id + "|" + app_secret
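# An app access token of the form "<app_id>|<app_secret>" was sufficient for
# reading public Page posts on Graph API v2.6; a user or page access token
# could be substituted here instead.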
def request_until_succeed(url):
    req = urllib2.Request(url)
    success = False
    while success is False:
        try:
            response = urllib2.urlopen(req)
            if response.getcode() == 200:
                success = True
        except Exception, e:
            print e
            time.sleep(5)
            print "Error for URL %s: %s" % (url, datetime.datetime.now())
    return response.read()
# Needed to write tricky unicode correctly to csv
def unicode_normalize(text):
    return text.translate({0x2018: 0x27, 0x2019: 0x27,
                           0x201C: 0x22, 0x201D: 0x22,
                           0xa0: 0x20}).encode('utf-8')
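# A quick sketch of what the normalization does, assuming Python 2 unicode
# input (json.loads returns unicode strings):
#
#     unicode_normalize(u'\u201cHello\u201d')  # -> '"Hello"'
#
# Curly quotes become straight ASCII quotes and the result is a UTF-8
# encoded byte string that csv.writer can handle.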
def getFacebookPageFeedData(page_id, access_token, num_statuses):
    # Construct the URL string; see http://stackoverflow.com/a/37239851
    # for Reactions parameters
    base = "https://graph.facebook.com/v2.6"
    node = "/%s/posts" % page_id
    fields = "/?fields=message,link,created_time,type,name,id," \
        "comments.limit(0).summary(true),shares," \
        "reactions.limit(0).summary(true)"
    parameters = "&limit=%s&access_token=%s" % (num_statuses, access_token)
    url = base + node + fields + parameters

    # retrieve data
    data = json.loads(request_until_succeed(url))
    return data
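# For illustration, with a hypothetical page_id of 'nytimes' and
# num_statuses of 100, the assembled URL would look like (token elided,
# wrapped here for readability):
#
#     https://graph.facebook.com/v2.6/nytimes/posts/?fields=message,link,
#         created_time,type,name,id,comments.limit(0).summary(true),shares,
#         reactions.limit(0).summary(true)&limit=100&access_token=...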
def getReactionsForStatus(status_id, access_token):
    # See http://stackoverflow.com/a/37239851 for Reactions parameters
    # Reactions are only accessible at a single-post endpoint
    base = "https://graph.facebook.com/v2.6"
    node = "/%s" % status_id
    reactions = "/?fields=" \
        "reactions.type(LIKE).limit(0).summary(total_count).as(like)" \
        ",reactions.type(LOVE).limit(0).summary(total_count).as(love)" \
        ",reactions.type(WOW).limit(0).summary(total_count).as(wow)" \
        ",reactions.type(HAHA).limit(0).summary(total_count).as(haha)" \
        ",reactions.type(SAD).limit(0).summary(total_count).as(sad)" \
        ",reactions.type(ANGRY).limit(0).summary(total_count).as(angry)"
    parameters = "&access_token=%s" % access_token
    url = base + node + reactions + parameters

    # retrieve data
    data = json.loads(request_until_succeed(url))
    return data
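# The response for a single status then looks roughly like the sketch below
# (shape inferred from how the fields are read in
# processFacebookPageFeedStatus; the counts are placeholders):
#
#     {"like": {"summary": {"total_count": 120}},
#      "love": {"summary": {"total_count": 4}},
#      ...
#      "id": "<status_id>"}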
def processFacebookPageFeedStatus(status, access_token):
    # The status is now a Python dictionary, so for top-level items,
    # we can simply call the key.

    # Additionally, some items may not always exist,
    # so we must check for existence first
    status_id = status['id']
    status_message = '' if 'message' not in status else \
        unicode_normalize(status['message'])
    link_name = '' if 'name' not in status else \
        unicode_normalize(status['name'])
    status_type = status['type']
    status_link = '' if 'link' not in status else \
        unicode_normalize(status['link'])

    # Time needs special care since a) it's in UTC and
    # b) it's not easy to use in statistical programs.
    status_published = datetime.datetime.strptime(
        status['created_time'], '%Y-%m-%dT%H:%M:%S+0000')
    status_published = status_published + datetime.timedelta(hours=-5)  # EST
    # best time format for spreadsheet programs
    status_published = status_published.strftime('%Y-%m-%d %H:%M:%S')

    # Nested items require chaining dictionary keys.
    num_reactions = 0 if 'reactions' not in status else \
        status['reactions']['summary']['total_count']
    num_comments = 0 if 'comments' not in status else \
        status['comments']['summary']['total_count']
    num_shares = 0 if 'shares' not in status else \
        status['shares']['count']

    # Counts of each reaction separately; good for sentiment
    # Only check for reactions if past date of implementation:
    # http://newsroom.fb.com/news/2016/02/reactions-now-available-globally/
    # (comparing the timestamps as strings works because the format is
    # zero-padded and ordered year-month-day)
    reactions = getReactionsForStatus(status_id, access_token) \
        if status_published > '2016-02-24 00:00:00' else {}

    num_likes = 0 if 'like' not in reactions else \
        reactions['like']['summary']['total_count']

    # Special case: Set number of Likes to Number of reactions for
    # pre-reaction statuses
    num_likes = num_reactions \
        if status_published < '2016-02-24 00:00:00' else num_likes

    num_loves = 0 if 'love' not in reactions else \
        reactions['love']['summary']['total_count']
    num_wows = 0 if 'wow' not in reactions else \
        reactions['wow']['summary']['total_count']
    num_hahas = 0 if 'haha' not in reactions else \
        reactions['haha']['summary']['total_count']
    num_sads = 0 if 'sad' not in reactions else \
        reactions['sad']['summary']['total_count']
    num_angrys = 0 if 'angry' not in reactions else \
        reactions['angry']['summary']['total_count']

    # Return a tuple of all processed data
    return (status_id, status_message, link_name, status_type, status_link,
            status_published, num_reactions, num_comments, num_shares,
            num_likes, num_loves, num_wows, num_hahas, num_sads, num_angrys)
def scrapeFacebookPageFeedStatus(page_id, access_token):
    with open('%s_facebook_statuses.csv' % page_id, 'wb') as file:
        w = csv.writer(file)
        w.writerow(["status_id", "status_message", "link_name",
                    "status_type", "status_link", "status_published",
                    "num_reactions", "num_comments", "num_shares",
                    "num_likes", "num_loves", "num_wows", "num_hahas",
                    "num_sads", "num_angrys"])

        has_next_page = True
        num_processed = 0  # keep a count on how many we've processed
        scrape_starttime = datetime.datetime.now()

        print "Scraping %s Facebook Page: %s\n" % (page_id, scrape_starttime)

        statuses = getFacebookPageFeedData(page_id, access_token, 100)

        while has_next_page:
            for status in statuses['data']:
                # Ensure it is a status with the expected metadata
                if 'reactions' in status:
                    w.writerow(processFacebookPageFeedStatus(status,
                                                             access_token))

                # output progress occasionally to make sure code is not
                # stalling
                num_processed += 1
                if num_processed % 100 == 0:
                    print "%s Statuses Processed: %s" % \
                        (num_processed, datetime.datetime.now())

            # if there is no next page, we're done.
            if 'paging' in statuses:
                statuses = json.loads(
                    request_until_succeed(statuses['paging']['next']))
            else:
                has_next_page = False

    print "\nDone!\n%s Statuses Processed in %s" % \
        (num_processed, datetime.datetime.now() - scrape_starttime)
@Gooey
def main():
    parser = GooeyParser(description="My Cool GUI Program!")
    parser.add_argument('page_id', metavar="page_id",
                        help="facebook.com/<page_id>")
    args = parser.parse_args()
    scrapeFacebookPageFeedStatus(args.page_id, access_token)


if __name__ == '__main__':
    main()

# The CSV can be opened in all major statistical programs. Have fun! :)
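# Usage sketch (assumes Gooey is installed and a real app_secret has been
# filled in above; the script filename is hypothetical):
#
#     $ python scrape_fb_page.py
#
# Running the script launches the Gooey GUI, which prompts for page_id and
# then writes <page_id>_facebook_statuses.csv to the working directory. The
# sys.frozen checks near the top suggest the script is also meant to be
# frozen into a standalone executable, e.g. with PyInstaller.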