Commits

Anonymous committed 20f28f6 Draft

none

Comments (0)

Files changed (3)

 simplejson
+anyjson
 pymongo
 mongokit
-Flask
+flask
+jinja2
+requests

www/imagecrawler.py

+#!/usr/local/bin/python
+# -*- coding: utf-8 -*-
+
+'''
+image crawler does image search against Google or Bing Image search and
+extracts binary coded pictures and urls for given search string
+'''
+import logging
+import requests
+import simplejson
+import pymongo
+import re
+
+# Bounds checked against each search result's "FileSize" field: results whose
+# size is above MAX_IMG_SIZE or below MIN_IMG_SIZE are skipped without being
+# downloaded (presumably the unit is bytes -- TODO confirm against the API).
+MAX_IMG_SIZE  = 100000
+MIN_IMG_SIZE  = 10000
+
+class BingImageSearch(object):
+	'''
+	Bing settings: http://msdn.microsoft.com/en-us/library/dd560913.aspx
+	usage:
+		>>> stalker = BingImageSearch()
+		>>> results = stalker.fetch("Melanie Raukko")
+		>>> print list(results)
+	'''
+	def __init__(self):
+		''' '''
+
+		self.headers = {
+            'User-Agent': 'Mozilla/5.0 (EN; rv:1.7.8) Gecko/20050511 TIMGLUZ-spider',
+            'Accept-Language': 'en',
+            'Keep-Alive': '300',
+            'Connection': 'keep-alive',
+            'Cache-Control': 'max-age=0',
+            "Contact": "timgluz@gmail.com"
+        }
+		
+		self.url =  "http://api.bing.net/json.aspx"
+		self.args = {
+						"AppId": "E85417EC4230B5745AB391700F02E72A1DE980F7",
+						"Query": None,
+						"Sources": "Image",
+						"Version": 2.0,
+						"Adult": "Moderate",
+						"Face": "Face",
+						"Image.Count": 10,
+						"Image.Offset": 0
+					}
+
+
+	def fetch(self, query, **kwargs):
+		'''returns  img docs '''
+		url = self._build_url(query, **kwargs)
+		print url
+		r = requests.get(url, headers = self.headers)
+
+		images = None
+		if r.ok:
+			images = self._extract_images(r.content)
+		else:
+			print "No luck!\n{0}-{1}".format(r.status_code, r.error)
+		
+		return images
+
+	def _build_url(self, query, count = 5, offset = 0):
+		''' builds urls and tries to mask Bings weird names of args'''
+		url = None
+		#init args
+		self.args["Query"] = query
+		self.args["Image.Count"] = count
+		self.args["Image.Offset"] = offset
+
+		#this does same thing as urllib.urlencode, except support unicode characters
+		args = u"&".join([u"{0}={1}".format(arg, val) for arg, val in self.args.items()])
+		args = re.sub("\s+", "+", args.strip()) #replace spaces in URL with +
+		url = u"%s?%s" % (self.url, args)
+		return url.encode("utf-8")
+
+	def _extract_images(self, content):
+		'''generator that returns  extracted images from response'''
+		results = None
+		images = []
+		json = simplejson.loads(content)
+		if json["SearchResponse"].has_key("Image") and\
+			json["SearchResponse"]["Image"]["Total"] > 0:
+			results = json["SearchResponse"]["Image"]["Results"]
+		else:
+			return []
+
+		#if all is OK --> get all images
+		for img in results:
+			
+			if img["FileSize"] > MAX_IMG_SIZE or img["FileSize"] < MIN_IMG_SIZE:
+				continue #dont load to big /small files 
+
+			r = requests.get(img["MediaUrl"]) 
+			if r.ok:
+				image = "data:{0};base64,{1}".format( r.headers["content-type"],
+					r.content.encode("base64").replace("\n", ""))
+
+			row_data = {"title": img["Title"],
+						"url":  img["MediaUrl"],
+						"origin": img["Url"],
+						"width": img["Width"],
+						"height": img["Height"],
+						"filetype": r.headers["content-type"].split("/")[-1],
+						"src":  image or None
+						}
+			images.append(row_data)
+		
+		return images
+
+def read_img_str(url):
+	'''
+		reads image from given url and builds image string.
+	'''
+	import requests
+
+	if url == None:
+		return {"src":  None}
+
+	print url
+	logging.debug("Fetching profile picture from:{0}".format(url))
+	r = requests.get(url)
+
+	if r.ok:
+			image = "data:{0};base64,{1}".format( r.headers["content-type"],
+				r.content.encode("base64").replace("\n", ""))
+
+	row_data = {
+				"url":  "url",
+				"size": int(r.headers["content-length"]),
+				"filetype": r.headers["content-type"].split("/")[-1],
+				"src":  image or None
+				}
+	return row_data
+
+
+def read_img_file(url, filepath, filename = None):
+	'''
+		reads image from given url and saves it on disk
+	'''
+	import requests
+	import time
+
+	if url is None or url == u"None":
+		return None
+
+	logging.debug(url)
+	r = requests.get(url)
+	if not r.ok:
+		return None
+	if filename is None:
+		filename = "img_{0}".format(int(time.time()))
+
+
+	filetype = r.headers["content-type"].split("/")[-1]
+	fullpath = "%s/%s.%s" % (filepath, filename, filetype)
+	with open(fullpath, "w")  as fp:
+		fp.write(r.content)
+
+	return True
+
+def crawl_lecturers(conn, dbname):
+	'''gets lectrurers without picture and  tries to find 5 picture with that name
+	current version trusts quality of Bing service and therefor profile picture will be 
+	usage:
+		>>> conn= pymongo.Connection("192.168.100.4")
+		>>> crawl_lecturers(conn, "unigo")
+
+	'''
+	print "Crawling lectrures profile pictures"
+	db = conn[dbname]
+	img_searcher = BingImageSearch()
+	#get name of lecturers whose dont have profile pictures yet
+	lecturers = db.lecturers.find({"portrait":None}, {"name": True}) 
+	for lecturer in lecturers:
+		if lecturer is None or len(lecturer["name"]) < 3:
+			print "Skipped '{0}'".format(lecturer["name"])
+			continue #smt wrong with that name
+		images = img_searcher.fetch(query = lecturer["name"])
+		if len(images)  > 0:
+			db.lecturers.update({"_id": lecturer["_id"]}, 
+				{"$set": {"portrait": images[0], "images": images}})
+
+	print "Done"
+
+if __name__ == "__main__":
+	print "Image searcher demo run"
+	'''
+	stalker = BingImageSearch()
+	results = stalker.fetch("Määgiv Räämasõunapuu")
+	print list(results)
+	'''
+	conn= pymongo.Connection("192.168.100.4")
+	crawl_lecturers(conn, "unigo")
 import moduleloader
 from escape import _u
 
-imagecrawler = moduleloader.load_file("imagecrawler", 
-		moduleloader.sibling_module_path("crawler", "imagecrawler.py"))
+import imagecrawler
 
 
 main = Blueprint('main', __name__)
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.