#!/usr/local/bin/python
# -*- coding: utf-8 -*-

'''
image crawler runs an image search against the Bing Image Search API and
extracts base64-encoded pictures and urls for a given search string
'''
import logging
import requests
import simplejson
import pymongo
import re

MAX_IMG_SIZE = 100000 #bytes; skip images bigger than this
MIN_IMG_SIZE = 10000  #bytes; skip images smaller than this

class BingImageSearch(object):
	'''
	Bing settings: http://msdn.microsoft.com/en-us/library/dd560913.aspx
	usage:
		>>> stalker = BingImageSearch()
		>>> results = stalker.fetch("Melanie Raukko")
		>>> print list(results)
	'''
	def __init__(self):
		'''sets the default request headers and Bing API arguments'''

		self.headers = {
			'User-Agent': 'Mozilla/5.0 (EN; rv:1.7.8) Gecko/20050511 TIMGLUZ-spider',
			'Accept-Language': 'en',
			'Keep-Alive': '300',
			'Connection': 'keep-alive',
			'Cache-Control': 'max-age=0',
			'Contact': 'timgluz@gmail.com'
		}
		
		self.url = "http://api.bing.net/json.aspx"
		self.args = {
			"AppId": "E85417EC4230B5745AB391700F02E72A1DE980F7",
			"Query": None,
			"Sources": "Image",
			"Version": 2.0,
			"Adult": "Moderate",
			"Image.Filters": "Face:Face", #restrict results to pictures of faces
			"Image.Count": 10,
			"Image.Offset": 0
		}
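
		# e.g. a fully built request URL (illustrative; AppId truncated):
		#	http://api.bing.net/json.aspx?AppId=E854...&Query=Melanie+Raukko&Sources=Image&Version=2.0&...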


	def fetch(self, query, **kwargs):
		'''returns a list of image docs for the given query'''
		url = self._build_url(query, **kwargs)
		logging.debug(url)
		r = requests.get(url, headers = self.headers)

		images = None
		if r.ok:
			images = self._extract_images(r.content)
		else:
			print "No luck! Status code: {0}".format(r.status_code)

		return images

	def _build_url(self, query, count = 5, offset = 0):
		'''builds the request URL and masks Bing's awkward argument names'''
		#init args
		self.args["Query"] = query
		self.args["Image.Count"] = count
		self.args["Image.Offset"] = offset

		#does the same job as urllib.urlencode, but supports unicode values
		args = u"&".join([u"{0}={1}".format(arg, val) for arg, val in self.args.items()])
		args = re.sub(r"\s+", "+", args.strip()) #replace spaces in the URL with +
		url = u"%s?%s" % (self.url, args)
		return url.encode("utf-8")
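
	# A minimal alternative sketch: the same URL built with the standard
	# urllib.urlencode, assuming every value is first encoded to UTF-8
	# (hypothetical helper, not called anywhere in this file):
	def _build_url_std(self, query, count = 5, offset = 0):
		'''hypothetical sketch: builds the request URL via urllib.urlencode'''
		import urllib
		args = dict(self.args)
		args.update({"Query": query, "Image.Count": count, "Image.Offset": offset})
		encoded = dict((k, unicode(v).encode("utf-8")) for k, v in args.items())
		return "%s?%s" % (self.url, urllib.urlencode(encoded))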

	def _extract_images(self, content):
		'''returns a list of image docs extracted from the API response'''
		json = simplejson.loads(content)
		if "Image" in json["SearchResponse"] and\
			json["SearchResponse"]["Image"]["Total"] > 0:
			results = json["SearchResponse"]["Image"]["Results"]
		else:
			return []

		#if all is OK --> download every result that fits the size limits
		images = []
		for img in results:
			if img["FileSize"] > MAX_IMG_SIZE or img["FileSize"] < MIN_IMG_SIZE:
				continue #don't load too big / too small files

			r = requests.get(img["MediaUrl"])
			if not r.ok:
				continue #skip images that couldn't be downloaded

			#base64-encode the binary payload into a data URI
			image = "data:{0};base64,{1}".format(r.headers["content-type"],
				r.content.encode("base64").replace("\n", ""))

			row_data = {"title": img["Title"],
						"url": img["MediaUrl"],
						"origin": img["Url"],
						"width": img["Width"],
						"height": img["Height"],
						"filetype": r.headers["content-type"].split("/")[-1],
						"src": image
						}
			images.append(row_data)

		return images
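
# The data-URI construction in _extract_images is repeated in read_img_str
# below; a minimal helper sketch that both could call (hypothetical, not
# wired in anywhere):
def as_data_uri(response):
	'''hypothetical sketch: builds a base64 data URI from a requests response'''
	return "data:{0};base64,{1}".format(response.headers["content-type"],
		response.content.encode("base64").replace("\n", ""))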

def read_img_str(url):
	'''
		reads the image from the given url and builds a data-URI string.
	'''
	import requests

	if url is None:
		return {"src": None}

	logging.debug("Fetching profile picture from:{0}".format(url))
	r = requests.get(url)
	if not r.ok:
		return {"src": None}

	image = "data:{0};base64,{1}".format(r.headers["content-type"],
		r.content.encode("base64").replace("\n", ""))

	row_data = {
				"url": url,
				#fall back to the downloaded size if the header is missing
				"size": int(r.headers.get("content-length", len(r.content))),
				"filetype": r.headers["content-type"].split("/")[-1],
				"src": image
				}
	return row_data


def read_img_file(url, filepath, filename = None):
	'''
		reads the image from the given url and saves it on disk
	'''
	import requests
	import time

	if url is None or url == u"None":
		return None

	logging.debug(url)
	r = requests.get(url)
	if not r.ok:
		return None
	if filename is None:
		filename = "img_{0}".format(int(time.time()))

	filetype = r.headers["content-type"].split("/")[-1]
	fullpath = "%s/%s.%s" % (filepath, filename, filetype)
	with open(fullpath, "wb") as fp: #binary mode -- the payload is raw image data
		fp.write(r.content)

	return True
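
# Usage sketch for read_img_file (url, path and filename are assumed values):
#	>>> read_img_file("http://example.com/pic.jpg", "/tmp", "lecturer42")
#	True	# writes /tmp/lecturer42.jpg if the download succeeds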

def crawl_lecturers(conn, dbname):
	'''gets lecturers without a picture and tries to find 5 pictures for each name;
	the current version trusts the quality of the Bing service and therefore the
	first result is used as the profile picture.
	usage:
		>>> conn = pymongo.Connection("192.168.100.4")
		>>> crawl_lecturers(conn, "unigo")
	'''
	print "Crawling lecturers' profile pictures"
	db = conn[dbname]
	img_searcher = BingImageSearch()
	#get names of lecturers who don't have profile pictures yet
	lecturers = db.lecturers.find({"portrait": None}, {"name": True})
	for lecturer in lecturers:
		if lecturer is None or len(lecturer.get("name", "")) < 3:
			print "Skipped '{0}'".format(lecturer and lecturer.get("name"))
			continue #something is wrong with that name
		images = img_searcher.fetch(query = lecturer["name"])
		if images: #fetch returns None when the request fails
			db.lecturers.update({"_id": lecturer["_id"]},
				{"$set": {"portrait": images[0], "images": images}})

	print "Done"

if __name__ == "__main__":
	print "Image searcher demo run"
	'''
	stalker = BingImageSearch()
	results = stalker.fetch("Määgiv Räämasõunapuu")
	print list(results)
	'''
	conn = pymongo.Connection("192.168.100.4")
	crawl_lecturers(conn, "unigo")