# Source
#
# web2itter / modules / update_tweets.py

# -*- coding: utf-8 -*-

import datetime
import htmlentitydefs
import re
from time import strptime
from urllib import urlencode
from urllib2 import urlopen, Request

import gluon.contrib.simplejson as sj

# Alternative search expression, currently disabled:
#search_query = '#pythonbrasil OR #apyb OR #pythonbrasil5'
# Search expression sent as the `q` parameter when get_tweets() builds its
# own query.
search_query = '#DevInRio'

# Name of the table that stores the fetched tweets.
# NOTE(review): `db` and `Field` are not defined or imported in this file;
# presumably they are supplied by the web2py environment -- confirm.
tbl = 'tweets_devinrio'
db.define_table(tbl,
    Field('author', 'string', length=140),   # tweet['from_user']
    Field('date', 'datetime'),               # parsed from tweet['created_at']
    Field('tweet', 'string', length=140),    # unescaped tweet['text']
    # tweet['id'].  NOTE(review): stored as a double; large ids may not be
    # represented exactly as floats -- confirm this type is intended.
    Field('tweet_id', 'double'),
    Field('image', 'string', length=512),    # tweet['profile_image_url']
    Field('source', 'string', length=512),   # unescaped tweet['source']
)
# Handle to the table defined above; used below for selects and inserts.
TABLE = db[tbl]

def unescape(text):
    '''Decode `text` from UTF-8 and replace named HTML entities.

    NOTE(review): in the visible part of this file a second `unescape` is
    defined further down, and it rebinds the name before the first
    module-level call runs. This version therefore appears to be dead
    code -- confirm, and keep only one of the two definitions.
    '''
    # .decode('utf-8') means a UTF-8 encoded byte string is expected here.
    text = text.decode('utf-8')
    # Substitute every known named entity ('&name;') with its character.
    for k, v in htmlentitydefs.name2codepoint.iteritems():
        text = text.replace('&%s;' % k, unichr(v))
    return text


def get_tweets(last_id=None, query=None):
    '''Run one Twitter Search API request and return the decoded JSON.

    Arguments:
    - last_id: only used when `query` is empty; if truthy it is sent as the
      `since_id` parameter.
    - query: an already URL-encoded query string. When given it is sent
      unchanged and `last_id` is ignored. When empty, a query for the
      module-level `search_query` with `rpp` set to '100' is built.

    Returns whatever `sj.loads` produces for the response body. Errors
    raised by `urlopen` or by the JSON decoder are not caught here; the
    connection is closed in either case.

    Details about Twitter Search API:

    - Search API usage requires that applications include a unique and identifying
    User Agent string. A HTTP Referrer is expected but is not required. Consumers
    using the Search API but failing to include a User Agent string will receive a
    lower rate limit.
    - An application that exceeds the rate limitations of the Search API will
    receive HTTP 503 response codes to requests. It is a best practice to watch
    for this error condition and honor the Retry-After header that instructs the
    application when it is safe to continue. The Retry-After header's value is the
    number of seconds your application should wait before submitting another query
    (for example: Retry-After: 67).

    NOTE: the 503 / Retry-After handling described above is not implemented
    in this function.

    Example:
    created_at: 'Thu, 10 Sep 2009 12:20:15 +0000'
    from_user: 'juliogreff'
    id: 3886081183L
    profile_image_url: 'http://a1.twimg.com/profile_images/248388854/juliogreff_normal.jpg'
    source: '<a href="http://www.tweetdeck.com/" rel="nofollow">TweetDeck</a>'
    text: 'abertura da #pythonbrasil =D'
    '''

    search_url = 'http://search.twitter.com/search.json'
    headers = {'User-agent': 'web2itter v0.2'}
    if not query:
        # No ready-made query string: build the default search.
        params = {'q': search_query, 'rpp': '100'}
        if last_id:
            params['since_id'] = last_id
        query = urlencode(params)
    # The encoded query is handed over as `data`, i.e. as the request body.
    request = Request(search_url, data=query, headers=headers)

    search = urlopen(request)
    try:
        return sj.loads(search.read())
    finally:
        # Release the connection even when reading or decoding fails.
        search.close()


# Matches one character reference: '&#x7b;', '&#123;' or '&name;'.
_ENTITY_RE = re.compile(r'&(#[xX][0-9a-fA-F]+|#[0-9]+|[A-Za-z][A-Za-z0-9]*);')


def _replace_entity(match):
    '''Return the character for one matched reference.

    The matched text is returned unchanged when the name is unknown or the
    number is not a code point `unichr` accepts.
    '''
    ref = match.group(1)
    try:
        if ref[0] != '#':
            return unichr(htmlentitydefs.name2codepoint[ref])
        if ref[1] in 'xX':
            return unichr(int(ref[2:], 16))
        return unichr(int(ref[1:]))
    except (KeyError, ValueError, OverflowError):
        # Leave unresolvable references as they are instead of failing.
        return match.group(0)


def unescape(text):
    '''Replace HTML character references in `text` with their characters.

    Named references known to `htmlentitydefs` ('&amp;', '&quot;', ...) as
    well as decimal ('&#39;') and hexadecimal ('&#x27;') references are
    handled. The text is scanned exactly once, so already-replaced output
    is never unescaped again: '&amp;lt;' becomes '&lt;', not '<'.

    Returns the resulting string; unknown references are kept verbatim.
    '''
    return _ENTITY_RE.sub(_replace_entity, text)


def _parse_created_at(created_at):
    '''Convert a `created_at` string into a naive datetime in UTC.

    The expected layout is the one shown in the get_tweets() example,
    'Thu, 10 Sep 2009 12:20:15 +0000': date and time followed by a
    '+HHMM' / '-HHMM' offset from UTC.
    '''
    parts = created_at.split()
    offset = parts[-1]
    local = datetime.datetime(
        *strptime(' '.join(parts[:-1]), '%a, %d %b %Y %H:%M:%S')[:6])
    seconds = int(offset[1:3]) * 3600 + int(offset[3:]) * 60
    if offset[0] == '-':
        seconds = -seconds
    # The stamp is local time at UTC+offset, so UTC is local time MINUS the
    # offset (adding it would move the value away from UTC).
    return local - datetime.timedelta(seconds=seconds)


def update_tweets_table(query=None):
    '''Fetch one page of search results and store the tweets not yet saved.

    Arguments:
    - query: URL-encoded query string passed straight to get_tweets(); when
      empty, get_tweets() builds the default search itself.

    The `tweet_id` of the most recently inserted row (if any) is passed to
    get_tweets() as `last_id`. Every result whose id is not already in the
    table is inserted with its text and source unescaped.

    Returns the response's 'next_page' value, or None when there is none.
    Nothing is committed here; that is left to the caller.
    '''
    last = db().select(TABLE.tweet_id, orderby=~TABLE.id,
                       limitby=(0, 1))
    last_id = None
    if len(last):
        last_id = last[0]['tweet_id']
        if last_id is not None:
            # The column is declared as a double; send the id as an integer
            # so it is not URL-encoded with a trailing '.0'.
            last_id = int(last_id)
    response_json = get_tweets(query=query, last_id=last_id)

    for tweet in response_json['results']:
        # Skip tweets that are already stored.
        if len(db(TABLE.tweet_id == tweet['id']).select()):
            continue
        TABLE.insert(author=tweet['from_user'],
                     date=_parse_created_at(tweet['created_at']),
                     tweet=unescape(tweet['text']),
                     tweet_id=tweet['id'],
                     image=tweet['profile_image_url'],
                     source=unescape(tweet['source']))
    return response_json.get('next_page')

# Fetch the first page, then keep following 'next_page' until the response
# no longer contains one.  The value is passed on without its first
# character (NOTE(review): presumably a leading '?' -- confirm).
resp = update_tweets_table()
while resp is not None:
    resp = update_tweets_table(resp[1:])
# Commit all inserts once, after every page has been processed.
db.commit()