Source

twtbak / twtbak.boo

Full commit
/*

Description
===========
Script that pulls Twitter statuses for specified users to SQLite database.

Requirements
=============
* Mono (http://mono-project.com/)
* Boo (http://boo.codehaus.org/)
* System.Data.SQLite (http://sqlite.phxsoftware.com/)

Usage
=====
Edit screenNames array and credentials, compile[0] and run[1], maybe drop in
cron and forget.
[0] $ booc -v -strict -debug- twtbak.boo
[1] $ mono twtbak.exe

License
=======
I hereby donate this work to the public domain, and relinquish any rights I may
have in it. -- Alvis Mikovs <alvis.mikovs@gmail.com>

*/


import System
import System.Data
import System.Diagnostics
import System.Globalization.CultureInfo
import System.Net
import System.Xml
import System.Data.SQLite


// usernames you want statuses pulled from
screenNames = [ 'Asterius' ]
// username and password to login with, required only if you're pulling from
// protected accounts, otherwise can be null
// credentials = NetworkCredential('user', 'pass')
credentials = null
// database file location
dbConnection = SQLiteConnection('Data Source=tweets.db')
// server URL, might want to use https, should work with any Laconica server
serverUrl = 'http://twitter.com/'
// date format, don't change this, unless changed on server
serverCulture = CreateSpecificCulture('en-US')
// statuses server can return in one request, no reason to change this, unless
// server allows more
maxStatusesPerRequest = 200


def requestXml(url as String) as XmlDocument:
	success = false
	retries = 3
	while not success and retries > 0:
		Debug.WriteLine(url)
		req = WebRequest.Create(url) as HttpWebRequest
		if credentials != null:
			req.Credentials = credentials
			req.PreAuthenticate = true
		try:
			res = req.GetResponse() as HttpWebResponse
			xml = XmlDocument()
			xml.Load(res.GetResponseStream())
		except exc as WebException:
			if exc.Status == WebExceptionStatus.ProtocolError:
				res = exc.Response as HttpWebResponse
				if res.StatusCode == HttpStatusCode.BadGateway:
					Debug.WriteLine('got 502 (Bad Gateway), retrying')
					retries -= 1
					continue
				elif res.StatusCode == HttpStatusCode.BadRequest:
					Debug.WriteLine("request limit reached, exiting")
					Console.Error.WriteLine("Request limit reached, exiting.")
					System.Environment.Exit(2)
			raise exc
		ensure:
			res.Close()
		success = true
	if not success:
		Debug.WriteLine('server not responding correctly, giving up')
		Console.Error.WriteLine('Server not responding correctly, giving up.')
		System.Environment.Exit(1)
	return xml

def selectString(node as XmlNode, xpath as String) as String:
	return node.SelectSingleNode(xpath).InnerText

def selectBoolean(node as XmlNode, xpath as String) as Object:
	text = node.SelectSingleNode(xpath).InnerText
	if text.Length == 0:
		return null
	else:
		return Boolean.Parse(text)

def selectInt64(node as XmlNode, xpath as String) as Object:
	text = node.SelectSingleNode(xpath).InnerText
	if text.Length == 0:
		return null
	else:
		return Int64.Parse(text)

def selectDateTime(node as XmlNode, xpath as String) as Object:
	text = node.SelectSingleNode(xpath).InnerText
	if text.Length == 0:
		return null
	else:
		return DateTime.ParseExact(text, 'ddd MMM dd HH:mm:ss zz00 yyyy', serverCulture)


Debug.Listeners.Add(TextWriterTraceListener(Console.Out))
Debug.AutoFlush = true
Debug.WriteLine('opening db')
dbConnection.Open()
try:
	Debug.WriteLine('creating tables')
	qu = """
		CREATE TABLE IF NOT EXISTS statuses (
			id integer PRIMARY KEY, 
			user_id integer NOT NULL, 
			screen_name text NOT NULL, 
			created_at text NOT NULL, 
			text text NOT NULL, 
			truncated integer NOT NULL, 
			source text NOT NULL, 
			in_reply_to_user_id integer, 
			in_reply_to_screen_name text, 
			in_reply_to_status_id integer, 
			favorited integer NOT NULL
		);
	"""
	cm = dbConnection.CreateCommand()
	cm.CommandText = qu
	cm.ExecuteNonQuery()
	qu = """
		CREATE TABLE IF NOT EXISTS users (
			user_id integer PRIMARY KEY, 
			screen_name text NOT NULL, 
			initial_backup_done integer NOT NULL DEFAULT 0
		);
	"""
	cm = dbConnection.CreateCommand()
	cm.CommandText = qu
	cm.ExecuteNonQuery()

	for screenName in screenNames:
		Debug.WriteLine("processing ${screenName}")
		Debug.Indent()
		Debug.Write('checking if user exists in db... ')
		qu = 'SELECT user_id, initial_backup_done FROM users WHERE screen_name = @screen_name;'
		cm = dbConnection.CreateCommand()
		cm.CommandText = qu
		cm.Parameters.Add('@screen_name', DbType.String).Value = screenName
		dr = cm.ExecuteReader()
		if dr.Read():
			Debug.WriteLine('found')
			userId = dr['user_id']
			initialBackupDone = cast(Boolean, dr['initial_backup_done'])
			dr.Close()
		else:
			Debug.WriteLine('missing')
			dr.Close()
			Debug.WriteLine('requesting user')
			Debug.Indent()
			url = "${serverUrl}users/show/${screenName}.xml"
			try:
				xml = requestXml(url)
			except exc as WebException:
				if exc.Status == WebExceptionStatus.ProtocolError:
					res = exc.Response as HttpWebResponse
					if res.StatusCode == HttpStatusCode.NotFound:
						Debug.WriteLine("user ${screenName} not found, skiping")
						Debug.Unindent()
						Debug.Unindent()
						Console.Error.WriteLine("User ${screenName} not found, skiping.")
						continue
				raise exc
			userId = selectInt64(xml, '/user /id')
			Debug.Write('checking if user exists in db by id... ')
			qu = 'SELECT initial_backup_done FROM users WHERE user_id = @user_id;'
			cm = dbConnection.CreateCommand()
			cm.CommandText = qu
			cm.Parameters.Add('@user_id', DbType.Int64).Value = userId
			dr = cm.ExecuteReader()
			if dr.Read():
				Debug.WriteLine('found')
				initialBackupDone = cast(Boolean, dr['initial_backup_done'])
				dr.Close()
				Debug.WriteLine('updating user name')
				qu = 'UPDATE users SET screen_name = @screen_name WHERE user_id = @user_id;'
				cm = dbConnection.CreateCommand()
				cm.CommandText = qu
				cm.Parameters.Add('@screen_name', DbType.String).Value = screenName
				cm.Parameters.Add('@user_id', DbType.Int64).Value = userId
				cm.ExecuteNonQuery()
			else:
				Debug.WriteLine('missing')
				dr.Close()
				initialBackupDone = false
				Debug.WriteLine('saving user')
				qu = 'INSERT INTO users VALUES (@user_id, @screen_name, @initial_backup_done);'
				cm = dbConnection.CreateCommand()
				cm.CommandText = qu
				cm.Parameters.Add('@user_id', DbType.Int64).Value = userId
				cm.Parameters.Add('@screen_name', DbType.String).Value = screenName
				cm.Parameters.Add('@initial_backup_done', DbType.Boolean).Value = initialBackupDone
				cm.ExecuteNonQuery()
			Debug.Unindent()
		Debug.WriteLine("userId: ${userId}")
		Debug.WriteLine("initialBackupDone: ${initialBackupDone}")

		Debug.WriteLine('backing up')
		Debug.Indent()
		baseUrl = "${serverUrl}statuses/user_timeline/${userId}.xml?count=${maxStatusesPerRequest}"
		if initialBackupDone:
			makeUrl = def ():
				Debug.WriteLine('finding last status in db')
				qu = 'SELECT MAX(id) FROM statuses WHERE user_id = @user_id;'
				cm = dbConnection.CreateCommand()
				cm.CommandText = qu
				cm.Parameters.Add('@user_id', DbType.String).Value = userId
				sinceId = cm.ExecuteScalar()
				Debug.WriteLine("sinceId: ${sinceId}")
				if sinceId != DBNull.Value:
					return "${baseUrl}&since_id=${sinceId}"
				else:
					return baseUrl
		else:
			makeUrl = def ():
				Debug.WriteLine('finding first status in db')
				qu = 'SELECT MIN(id) - 1 FROM statuses WHERE user_id = @user_id;'
				cm = dbConnection.CreateCommand()
				cm.CommandText = qu
				cm.Parameters.Add('@user_id', DbType.Int64).Value = userId
				maxId = cm.ExecuteScalar()
				Debug.WriteLine("maxId: ${maxId}")
				if maxId != DBNull.Value:
					return "${baseUrl}&max_id=${maxId}"
				else:
					return baseUrl
		firstRequest = true
		statusesReceived = 0
		while firstRequest or statusesReceived == maxStatusesPerRequest:
			firstRequest = false
			statusesReceived = 0
			url = makeUrl()
			Debug.WriteLine('requesting statuses')
			try:
				xml = requestXml(url)
			except exc as WebException:
				if exc.Status == WebExceptionStatus.ProtocolError:
					res = exc.Response as HttpWebResponse
					if res.StatusCode == HttpStatusCode.Unauthorized:
						Debug.WriteLine("user ${screenName} is protected, authentication required, skiping")
						Console.Error.WriteLine("User ${screenName} is protected, authentication required, skiping.")
						break
				raise exc
			statuses = xml.SelectNodes('/statuses /status')
			statusesReceived = statuses.Count
			Debug.WriteLine("statusesReceived: ${statusesReceived}")
			if statusesReceived > 0:
				Debug.WriteLine('saving statuses')
				Debug.Indent()
				tr = dbConnection.BeginTransaction()
				qu = 'INSERT INTO statuses VALUES (@id, @user_id, @screen_name, @created_at, @text, @truncated, @source, @in_reply_to_user_id, @in_reply_to_screen_name, @in_reply_to_status_id, @favorited);'
				cm = dbConnection.CreateCommand()
				cm.CommandText = qu
				cm.Parameters.Add('@id', DbType.Int64)
				cm.Parameters.Add('@user_id', DbType.Int64)
				cm.Parameters.Add('@screen_name', DbType.String)
				cm.Parameters.Add('@created_at', DbType.DateTime)
				cm.Parameters.Add('@text', DbType.String)
				cm.Parameters.Add('@truncated', DbType.Boolean)
				cm.Parameters.Add('@source', DbType.String)
				cm.Parameters.Add('@in_reply_to_user_id', DbType.Int64)
				cm.Parameters.Add('@in_reply_to_screen_name', DbType.String)
				cm.Parameters.Add('@in_reply_to_status_id', DbType.Int64)
				cm.Parameters.Add('@favorited', DbType.Boolean)
				for status as XmlNode in statuses:
					cm.Parameters['@id'].Value = selectInt64(status, 'id')
					cm.Parameters['@user_id'].Value = selectInt64(status, 'user /id')
					cm.Parameters['@screen_name'].Value = selectString(status, 'user /screen_name')
					cm.Parameters['@created_at'].Value = selectDateTime(status, 'created_at')
					cm.Parameters['@text'].Value = selectString(status, 'text')
					cm.Parameters['@truncated'].Value = selectBoolean(status, 'truncated')
					cm.Parameters['@source'].Value = selectString(status, 'source')
					cm.Parameters['@in_reply_to_user_id'].Value = selectInt64(status, 'in_reply_to_user_id')
					cm.Parameters['@in_reply_to_screen_name'].Value = selectString(status, 'in_reply_to_screen_name')
					cm.Parameters['@in_reply_to_status_id'].Value = selectInt64(status, 'in_reply_to_status_id')
					cm.Parameters['@favorited'].Value = selectBoolean(status, 'favorited')
					Debug.WriteLine("id: ${cm.Parameters['@id'].Value}")
					cm.ExecuteNonQuery()
				tr.Commit()
				Debug.Unindent()
		Debug.Unindent()
		if not initialBackupDone and statusesReceived > 0 and statusesReceived != maxStatusesPerRequest:
			Debug.WriteLine('saving initial backup done')
			qu = 'UPDATE users SET initial_backup_done = @initial_backup_done WHERE user_id = @user_id;'
			cm = dbConnection.CreateCommand()
			cm.CommandText = qu
			cm.Parameters.Add('@initial_backup_done', DbType.Boolean).Value = true
			cm.Parameters.Add('@user_id', DbType.Int64).Value = userId
			cm.ExecuteNonQuery()

		Debug.Unindent()
ensure:
	dbConnection.Clone()