Commits

Anonymous committed f72e62d Draft

Added Crawlers

Files changed (220)

File 404.html

+<!doctype html>
+<html>
+<head>
+  <meta charset="utf-8">
+  <title>Page Not Found :(</title> 
+  <style>
+	  body { text-align: center;}
+	  h1 { font-size: 50px; text-align: center }
+	  span[frown] { transform: rotate(90deg); display:inline-block; color: #bbb; }
+	  body { font: 20px Constantia, 'Hoefler Text',  "Adobe Caslon Pro", Baskerville, Georgia, Times, serif; color: #999; text-shadow: 2px 2px 2px rgba(200, 200, 200, 0.5); }
+	  ::-moz-selection{ background:#FF5E99; color:#fff; }
+	  ::selection { background:#FF5E99; color:#fff; } 
+	  article {display:block; text-align: left; width: 500px; margin: 0 auto; }
+	  
+	  a { color: rgb(36, 109, 56); text-decoration:none; }
+	  a:hover { color: rgb(96, 73, 141) ; text-shadow: 2px 2px 2px rgba(36, 109, 56, 0.5); }
+  </style>
+</head>
+<body>
+     <article>
+	  <h1>Not found <span frown>:(</span></h1>
+	   <div>
+	       <p>Sorry, but the page you were trying to view does not exist.</p>
+	       <p>It looks like this was the result of either:</p>
+	       <ul>
+		   <li>a mistyped address</li>
+		   <li>an out-of-date link</li>
+	       </ul>
+	   </div>
+	    
+	    <script>
+	    var GOOG_FIXURL_LANG = (navigator.language || '').slice(0,2),
+		GOOG_FIXURL_SITE = location.host;
+	    </script>
+	    <script src="http://linkhelp.clients.google.com/tbproxy/lh/wm/fixurl.js"></script>
+     </article>
+</body>
+</html>

File crawler/__init__.py

Empty file added.

File crawler/aalto.py

+"""
+Aalto crawler 
+"""
+
+from base import BaseCrawler
+
+class AaltoCrawler(BaseCrawler):
+    """collects data from Aaltoes"""
+    def __init__(self):
+        super(AaltoSpider, self).__init__(name = "aalto")
+    
+    def crawl(self):
+        """ 
+        Crawls data from pages and saves final results to class variable "self.results"
+
+        """
+        def join_url(ending):
+            '''joins crawled link to base root'''
+            base_url = self.settings["root_url"]
+            base_url = base_url[:base_url.find("/", 10)]
+            return base_url + ending
+        
+        def parse_units(url):
+            """fetches the main page and parses links to units/faculties"""
+            logging.debug("Parsing root page to find links to units/subschools.")
+            soup = self.get_soup(url)
+            content = soup.find("table", {"id": "departmentView"})
+            links = content.findAll("a", {"class": "courses"})
+            return [(link.text, link.attrs[-1][1]) for link in links]
+
+        def parse_departments(url):
+            """
+            fetches a faculty page and parses links to department pages,
+            which include links to courses
+            """
+            logging.debug("Parsing units page to find links to departments")
+            soup = self.get_soup(join_url(url))
+            content = soup.find("table", {"id": "tuView"})
+            links = content.findAll("a", {"class": "courses"})
+            return [(link.text, link.attrs[-1][1]) for link in links]
+        
+        def parse_courses(url):
+            """fetches links to course pages"""
+            logging.debug("Parsing links to courses")
+            soup = self.get_soup(join_url(url))
+            content = soup.find("table", {"class": "courseTableView"})
+            links = content.findAll("a", {"class": "courses"})
+            return [(link.text, link.attrs[-1][1]) for link in links]
+        
+        #parse content
+        def parse_content(url):
+            """parses data from a course page"""
+            data = {}
+            parent_url = lambda u: u[:u.rfind("/")]
+
+            def parse_lectures(url):
+                """parses the course lecture calendar"""
+                soup = self.get_soup(url + "/luennot")  #the calendar page lives under the course url
+                content = soup.find("table", {"class": "lectures"})
+
+                rows = content.findChildren(name="tr", id=re.compile(r"informal_\d+"))
+                data_rows = []
+                for row in rows:
+                    data = {}
+                    cols = row.findChildren("td")
+                    data["date"] = cols[0].text
+                    data["week"] = cols[1].text
+                    data["weekday"] = cols[2].text
+                    data["time"] = cols[3].text.split("-")[0]
+                    data["location"] = cols[4].text
+                    data["topic"] = cols[5].text
+                    data["datetime"] = datetime.strptime("%s %s" % (data["date"],
+                        data["time"]), "%d %b %y %H:%M")
+                    data_rows.append(data)
+                return {"lectures": data_rows}
+            
+            #TODO: add field translation - find same names to all unis
+            def parse_overview(url):
+                logging.debug("Parsing course overview: {0}".format(url))
+                data = {}
+                soup = self.get_soup(url + "/esite")  #the course brochure lives under the course url
+                content = soup.find("div", {"id": "courseBrochure"})
+
+                rows = content.findAll("tr")
+                for row in rows:
+                    cols = row.findChildren("td")
+                    col_name = cols[0].text
+                    data[col_name] = cols[1].text
+
+                return {"overview": data}
+
+            url = parent_url(url) #take the parent url path, to which we can append the calendar/overview paths
+            data.update(parse_lectures(url))
+            data.update(parse_overview(url))
+            return data 
+            
+        #crawling content
+        logging.debug("Starting crawling.")
+        start_time = time.time()
+        #TODO: add content date or change markup
+        dep_links = parse_units(url = self.settings["root_url"])
+        fac_links = self.workmanager(parse_departments, dep_links)
+        course_links = self.workmanager(parse_courses, fac_links)
+        data = self.workmanager(parse_content, course_links)
+        #TODO: field translation/replacing here
+        self.results = [val for val in data]
+        json.dump(self.results, open("results/aalto.json", "w"))
+        logging.debug("Crawled {1} links in {0} sec".format(time.time() - start_time,
+            len(dep_links) + len(fac_links) + len(course_links)))
+
+
+if __name__ == "__main__":
+    spider = AaltoCrawler()
+    spider.crawl()
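
A note on the link-scraping idiom used throughout these parsers: with BeautifulSoup 3 (the version imported in base.py below), tag.attrs is a list of (name, value) tuples, so link.attrs[-1][1] picks the value of the anchor's last attribute, which on these pages is assumed to be the href. A minimal sketch with made-up markup:

    from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3.x

    soup = BeautifulSoup('<a class="courses" href="/noppa/kurssit/T-106">Intro</a>')
    link = soup.find("a", {"class": "courses"})
    print link.attrs         # [(u'class', u'courses'), (u'href', u'/noppa/kurssit/T-106')]
    print link.attrs[-1][1]  # u'/noppa/kurssit/T-106'
    print link.text          # u'Intro'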

File crawler/base.py

+"""
+Base Crawler
+
+All child crawlers have prefix - Spider
+
+"""
+import requests
+import gevent
+from gevent.queue import Queue
+from gevent.pool import Pool, Group
+
+import simplejson as json
+import logging
+import os
+import random
+
+import cookielib
+from BeautifulSoup import BeautifulSoup
+from datetime import datetime
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+class UrlContainer(object):
+    """
+        holds url info
+    """
+    def __init__(self, url, unit = None, department = None,
+                course = None, module = None,
+                status_code = 200, error = None,
+                updated = None, readed = 0, **kwargs):
+        self.url = url
+        self.unit = unit
+        self.department = department
+        self.course = course
+        self.module = module
+        self.status_code = status_code
+        self.error = error
+        self.updated = updated or datetime.utcnow()
+        self.readed = readed
+    
+    def update(self):
+        """increases stats manually, useful when you changed value of class members
+            after initializing
+        """
+        self.updated = datetime.utcnow()
+        self.readed += 1 
+
+    def __str__(self):
+        return str(self.__dict__)
+
+class WorkManager(object):
+    """handles async execution for given func"""
+
+    def __init__(self, func, pool_size = 10):
+        self.func = func
+        self.jobs = Queue()
+        self.results = Queue()
+        self.pool = Pool(pool_size)
+
+    
+    def run(self, data = None):
+        """fills the job queue and processes every job with self.func"""
+        #fill job queue
+        if data:
+            for item in data:
+                if item:
+                    self.jobs.put(item)
+        #process jobs until the queue is empty
+        while not self.jobs.empty():
+            job = self.jobs.get()
+            self.results.put(self.func(job))
+    
+    def add(self, job):
+        """adds new job to work-queue"""
+        self.jobs.put(job)
+    
+
+class BaseCrawler(object):
+    """
+    Implements crawler interface.
+    """
+
+    def __init__(self, name, queue_size = 10):
+        self.client = requests
+        self.name = name
+        self.soups = {} #holds fetched soups, key will be url
+        self.data = [] # will hold parsed data, which can export to db
+        self.logger = logging.getLogger(self.name)
+        self.logger.setLevel(logging.DEBUG)
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (EN; rv:1.7.8) Gecko/20050511 TIMGLUZ-spider',
+            'Accept-Language': 'en',
+            'Keep-Alive': '300',
+            'Connection': 'keep-alive',
+            'Cache-Control': 'max-age=0'
+        }
+        self.cookies = cookielib.CookieJar()
+        self.settings_path = "config.json"
+        self.crawled_urls = []
+        self._init_settings()
+
+    def crawl(self):
+        """ starts crawling """
+        raise NotImplementedError("crawl() should be implemented in a child class")
+    
+    #TODO: use my own httpclients module
+    def get_soup(self, request):
+        """makes HTTP requests and returns BeautifulSoup object"""
+        val = None
+        response = self.client.get(request, headers = self.headers,
+            cookies = self.cookies)
+        if response.ok:
+            val = BeautifulSoup(response.content)
+        else:
+            self.logger.error("Didnt get soup from: {0} -{1}{2}".format(request,
+                response.status_code, response.content))
+            val = (response.status_code, response.content)
+        
+        return val
+    
+
+    def is_soup(self, soup):
+        """checks whether the given object is a BeautifulSoup instance"""
+        return isinstance(soup, BeautifulSoup)
+    
+    def add_root(self, urlstr):
+        '''adds urlstr to rooturl '''
+        root_url = self.settings['root_url']
+        return root_url[:root_url.rfind('/')] + '/' + urlstr
+
+    def workmanager(self, func, urls, threads = 5, avg_pause = 0, std_pause = 0):
+        """ makes async HTTP requests and collects the results into a list.
+        To avoid flooding the target server, each worker sleeps for a random
+        number of seconds drawn from a gaussian distribution.
+        """
+        def generate_pause(avg, std):
+            """generates a random non-negative integer with gaussian distribution"""
+            val = abs(random.gauss(avg, std))
+            return int(val)
+
+        def paused_func(item):
+            """applies the random pause described above before calling func"""
+            gevent.sleep(generate_pause(avg_pause, std_pause))
+            return func(item)
+        
+
+        self.logger.debug("Workmanager starts working")
+        pool = Pool(threads)
+        urllist = []
+        results = []
+        if not isinstance(urls[0], UrlContainer):
+            #if the first element isn't a UrlContainer, we have a list of url lists
+            for item in urls:
+                urllist.extend(item)
+        else:
+            urllist = urls
+        
+        with gevent.Timeout(5, False):
+            pool.map_cb(paused_func, urllist, lambda x: results.extend(x))
+        #pool.join()
+        self.logger.debug("Workmanager is finished")
+
+        return [result for result in results]
+
+    
+    def size(self):
+        """how many links have already been read"""
+        return len(self.crawled_urls)
+
+    def _save(self):
+        """private funcs to save data to db"""
+        pass
+    
+    def _init_settings(self):
+        self._get_crawler_settings()
+        self._get_db_settings()
+
+    def _get_db_settings(self):
+        """
+        reads db settings from the json file specified by self.settings_path
+        """
+        if os.path.exists(self.settings_path):
+            doc = json.load(open(self.settings_path, "r"))
+            self.db_settings = doc["database"]
+        else:
+            logging.error("Didn't find config file: {0}".format(self.settings_path))
+
+    def _get_crawler_settings(self):
+        """reads config.json and stores the settings of this crawler,
+        if there is a crawler with that name
+        """
+        if os.path.exists(self.settings_path):
+            doc = json.load(open(self.settings_path, "r"))
+            if self.name in doc["crawlers"]:
+                self.settings = doc["crawlers"][self.name]
+            else:
+                logging.error("Didn't find settings for: '{0}'".format(self.name))
+        else:
+            logging.error("Didn't find config file: {0}".format(self.settings_path))

File crawler/base.pyc

Binary file added.

File crawler/config.json

+{
+    "meta" : {
+        "desc": "Crawler settings and data."
+    },
+
+    "database": {
+        "url" : "",
+        "user": "",
+        "password": ""
+    },
+
+    "crawlers" : {
+        "aalto": {
+            "name": "aalto",
+            "language": "en",
+            "main_url": "http://taik.aalto.fi/fi/sitemap/",
+            "root_url" : "https://noppa.aalto.fi/noppa/kurssit/",
+            "field_map": {},
+            "links": [
+                {   
+                    "url": "",
+                    "content_size": "",
+                    "last_update": "",
+                    "content_md5": "",
+                    "modified_since": ""
+                }
+            ] 
+             
+        },
+        
+        "lappeenranta": {
+            "name": "lappeenranta",
+            "language": "en",
+            "main_url": "http://www.lut.fi/en/pages/sitemap.aspx",
+            "root_url" : "https://noppa.lut.fi/noppa/opintojaksot/",
+            "field_map": {},
+            "links": [
+                {   
+                    "url": "",
+                    "content_size": "",
+                    "last_update": "",
+                    "content_md5": "",
+                    "modified_since": ""
+                }
+            ] 
+             
+        },
+
+        "tamperetech": {
+            "name": "tamperetech",
+            "language": "en",
+            "main_url": "http://www.tut.fi/fi/sivustokartta/index.htm#etusivuTTYINTERNETFI",
+            "root_url" : "http://www.tut.fi/wwwoppaat/opas2011-2012/kv/laitokset/index.html",
+            "field_map": {},
+            "links": [
+                {   
+                    "url": "",
+                    "content_size": "",
+                    "last_update": "",
+                    "content_md5": "",
+                    "modified_since": ""
+                }
+            ]     
+        },
+
+        "tampereuni": {
+            "name": "tampereuni",
+            "language": "en",
+            "main_url": "http://www.uta.fi/english/",
+            "root_url" : "https://www10.uta.fi/opas/index.htm?lvv=2011&uiLang=en&lang=en",
+            "field_map": {},
+            "links": [
+                {   
+                    "url": "",
+                    "content_size": "",
+                    "last_update": "",
+                    "content_md5": "",
+                    "modified_since": ""
+                }
+            ] 
+             
+        }
+    }
+        
+}
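
Each crawler's constructor name has to match one of the keys under "crawlers" above; BaseCrawler._get_crawler_settings then exposes that block as self.settings (config.json is read from the current working directory). The lookup boils down to:

    import simplejson as json

    doc = json.load(open("config.json", "r"))
    settings = doc["crawlers"]["aalto"]
    print settings["root_url"]   # https://noppa.aalto.fi/noppa/kurssit/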

File crawler/crawler.py

+"""
+ocw.py 
+
+Collects and inits course database with MIT OCW .
+"""
+from BeautifulSoup import BeautifulSoup
+import requests
+
+
+root_url =  "http://ocw.mit.edu"
+
+def get_soup(url):
+    response = requests.get(url)
+    return BeautifulSoup(response.content)
+
+def get_subpages():
+    ''' reads links from mainpage to subpages'''
+    soup = get_soup(root_url + "/courses/electrical-engineering-and-computer-science/")
+    tables = soup.findAll(attrs = {"class": "course_table"})
+    print "Find {0} tables from rootpages.\nNow cleaning...".format(len(tables))
+    links = []
+    for table in tables:
+        links.extend(table.findAll(lambda tag: len(tag.attrs) == 1 and tag.name == "a"))
+    
+    return [(link.u.text, root_url + link.attrs[0][1]) for link in links]
+
+def parse_subpage(link):
+    ''' reads information from subpage'''
+    def parse_syllabus(soup):
+        '''parses syllabus page'''
+        data = {}
+        title = "misc"
+        data[title] = []
+        items= soup.find("div", {"id": "parent-fieldname-text"})
+        for content in items.contents:
+            if content == u'\n':
+                continue #we don't collect raw strings, usually '\n'
+            elif content.name == u'h2':
+                title = content.text.replace(" ", "_").lower()
+                data[title] = []
+            else:
+                data[title].append(content.text)
+        return data
+
+    def parse_calendar(soup):
+        items = soup.findChild("div", {"class": "maintabletemplate"})
+        tbody = items.find("tbody")
+        data = []
+        for row in tbody.contents:
+            if row == '\n': continue
+            
+            cols = row.findAll("td")
+            vals = {}
+            vals["no"] = cols[0]
+            vals["desc"] = cols[1]
+            vals["comments"] = cols[2]
+            data.append(vals)
+        return data
+
+    data = { "name": link[0],
+            "url": link[1]
+            }
+    #get info from main page
+    print "Parsing index.htm"
+    soup = get_soup(link[1] + "/index.htm")
+    item = soup.find("meta", {"name": "Title"})
+    data["title"] = item.attrs[0][1]
+    item = soup.find("meta", {"name": "Description"})
+    data["description"] = item.attrs[0][1]
+    item = soup.find("meta", {"name": "Author"})
+    data["author"] = item.attrs[0][1]
+    item = soup.find("meta", {"name": "Keyword"})
+    data["keyword"] = item.attrs[0][1]
+
+    #get info from syllabus
+    print("Parsing syllabus:")
+    soup = get_soup(link[1] + "/syllabus")
+    data["syllabus"] = parse_syllabus(soup)
+    #read calendar data
+    print("Parsing calendar:")
+    soup = get_soup(link[1] + "/calendar")
+    data["calendar"] = parse_calendar(soup)
+
+    return data
+
+def main():
+    from pprint import pprint
+    print "Read links to subpages:"
+    links = get_subpages()
+    print "Found {0}".format(len(links))
+    print "All links to subpages are cleaned.\nNow crawling subpages..."
+    #data_docs = map(parse_subpage, links)
+    pprint(parse_subpage(links[0]))
+
+
+if __name__ == "__main__":
+    main()
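
parse_syllabus above groups the syllabus body by its <h2> headers: everything that follows a header is appended to a list keyed by that header (lowercased, spaces replaced with underscores), and anything before the first header lands under "misc". A rough, isolated illustration of the same grouping logic on a made-up snippet:

    from BeautifulSoup import BeautifulSoup

    html = ('<div id="parent-fieldname-text">'
            '<p>Intro text</p>'
            '<h2>Course Meeting Times</h2>'
            '<p>Lectures: 2 per week</p></div>')
    items = BeautifulSoup(html).find("div", {"id": "parent-fieldname-text"})

    data, title = {"misc": []}, "misc"
    for content in items.contents:
        if content == u'\n':
            continue
        elif content.name == u'h2':
            title = content.text.replace(" ", "_").lower()
            data[title] = []
        else:
            data[title].append(content.text)

    # data now holds {'misc': [u'Intro text'],
    #                 u'course_meeting_times': [u'Lectures: 2 per week']}
    print data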

File crawler/lappeenranta.py

+"""
+
+"""
+
+from base import BaseCrawler
+
+class LappeenrantaCrawler(BaseCrawler):
+    ''' '''
+    def __init__(self):
+        super(LappeenrantaCrawler, self).__init__("lappeenranta")
+    
+    def crawl(self):
+        ''' '''
+
+        def parse_units(url):
+            soup = self.get_soup(url)
+            context = soup.find("table", {"id": "departmentView"})
+            links = context.findAll("a", {"class":  "courses"})
+            return [(link.text, link.attrs[-1][1]) for link in links]
+        
+        def parse_departments(url):
+            soup = self.get_soup(url)
+            context = soup.find("table", {"id": "tuView"})
+            links = context.findAll("a", {"class":  "courses"})
+            return [(link.text, link.attrs[-1][1]) for link in links]
+
+        def parse_courses(url):
+            soup = self.get_soup(url)
+            context = soup.find("table", {"id": "courseTableView"})
+            links = context.findAll("a", {"class": "courses"})  #assumed selector; links was undefined
+            return [(link.text, link.attrs[-1][1]) for link in links]
+        
+        def parse_content(course_url):
+            """parses data from course page """
+            data = {}
+            parent_url = lambda u: u[:u.rfind("/")]
+
+            def parse_description(parent_url):
+                data = {}
+                soup = self.get_soup(parent_url + "/kuvaus")
+                context = soup.find("div", {"id": "courseSheetInfo"})
+                rows =  context.findAll("tr")
+                for row in rows:
+                    cols = row.findChildren("td")
+                    col_name = cols[0].text
+                    data[col_name] = cols[1].text
+                return {"overview":  data}
+
+            def parse_lectures(url):
+                '''parses content from the course calendar'''
+                soup = self.get_soup(url + "/luennot")  #the calendar page lives under the course url
+                content = soup.find("table", {"class": "lectures"})
+
+                rows = content.findChildren(name="tr", id=re.compile(r"informal_\d+"))
+                data_rows = []
+                for row in rows:
+                    data = {}
+                    cols = row.findChildren("td")
+                    data["date"] = cols[0].text
+                    data["week"] = cols[1].text
+                    data["weekday"] = cols[2].text
+                    data["time"] = cols[3].text.split("-")[0]
+                    data["location"] = cols[4].text
+                    data["topic"] = cols[5].text
+                    data["datetime"] = datetime.strptime("%s %s" % (data["date"],
+                        data["time"]), "%d %b %y %H:%M")
+                    data_rows.append(data)
+                return {"lectures": data_rows}
+            
+            logging.debug("Parsing content from: {0}".format(course_url))
+            url = parent_url(course_url)
+            data.update(parse_description(url))
+            data.update(parse_lectures(url))
+            return data
+        
+        logging.debug("Starting crawling")
+        #get links
+        unit_links = parse_units(self.settings["root_url"])
+        dep_links = parse_departments(unit_links[3][1])
+        course_links = parse_courses(dep_links[3][1])
+        #get content
+        data = parse_content(course_links[0][1])  #pass the url part of the (title, url) tuple
+        #save results 
+        self.results = data
+
+if __name__ == "__main__":
+    crawler =  LappeenrantaCrawler()
+    crawler.crawl()

File crawler/run_tests.py

+import unittest
+import test.all_tests
+
+testSuite = test.all_tests.create_test_suite()
+text_runner = unittest.TextTestRunner().run(testSuite)

File crawler/tamperetech.py

+"""
+
+"""
+from base import BaseCrawler
+from base import UrlContainer
+
+from gevent.queue import Queue
+
+class TampereTech(BaseCrawler):
+    '''collects course data from Tampere University of Technology'''
+    def __init__(self):
+        super(TampereTech, self).__init__(name = "tamperetech")
+
+    def crawl(self):
+        """crawls units and courses, then parses course content"""
+        def parse_units(urlobj):
+            self.logger.debug("Reading units: {0}".format(urlobj.url))
+            soup = self.get_soup(urlobj.url)
+            results = []
+            if not self.is_soup(soup):
+                urlobj.status_code = soup[0]
+                urlobj.error = soup[1]
+                self.logger.error("Cant crawl {0}, error: <{1}>:{2}\nUrlobj: {3}".format(urlobj.url,
+                    urlobj.status_code, urlobj.error, str(urlobj)))
+                return None
+            #parse units and urls to departments of unit
+            units = soup.findAll("h3", {"class": "guide"})
+            for unit  in units:
+                ul = unit.findNextSibling()
+                links = ul.findAll("a", {"class": "index"})
+                for link in links:
+                    rel_url = link.attrs[-1][1].split("/")
+                    child_url = url[:url.rfind("/")]  + '/' + rel_url[-2] + '/' + rel_url[-1]
+                    parsed_url = UrlContainer(url = child_url,
+                        unit = unit.text,
+                        department = link.text)
+                    results.append(parsed_url)
+            if len(results) == 0:
+                self.logger.warning("Empty result: {0}".format(urlobj.url))
+            
+            self.logger.debug("Found {0} links from units page.".format(len(results)))
+            return results
+
+        
+        def parse_courses(urlobj):
+            self.logger.debug("Reading courses: {0}".format(urlobj.url))
+            soup = self.get_soup(urlobj.url)
+            links = soup.findAll("a", {"class": "index"})
+            #[(link.text, link.attrs[-1][1]) for link in links]
+            results = []
+            for link in links:
+                #concatenate the relative url with the root url
+                rel_url = link.attrs[-1][1].split("/")
+                child_url = url[:url.rfind("/")] + '/' + rel_url[-2]  + '/' + rel_url[-1]
+
+                #carry values from the parent urlobj over to the new container
+                parsed_url = UrlContainer(url = child_url, unit = urlobj.unit,
+                            department = urlobj.department,
+                            course = link.text.split(',')[0]
+                            ) #create url holders by keeping previous info 
+
+                results.append(parsed_url)
+
+            self.logger.debug("Found {0} links from courses page.".format(len(results)))
+            return results
+
+        def parse_content(urlobj):
+            '''parses content fields from a course page'''
+            self.logger.debug("Parsing content of {0}".format(urlobj.url))
+            data = {}
+            soup = self.get_soup(urlobj.url)
+            if self.is_soup(soup):
+                headers = soup.findAll("h4", {"class": "guide"})
+                for header in headers:
+                    content = header.findNextSibling()
+                    data[header.text.lower()] = content.text
+            else:
+                self.logger.error("Content parser cant read: {0}".format(urlobj.url))
+            return data
+
+        
+        self.logger.debug("Starting crawl process")
+        url = self.settings['root_url']
+        #TODO: save crawled urls to the settings file
+        urls = self.workmanager(parse_units, [[UrlContainer(url)]])
+        urls = self.workmanager(parse_courses, urls)
+        contents = self.workmanager(parse_content, urls)
+
+        for content in contents:
+            self.data.append(content)
+        print 'done'
+
+
+if __name__ == "__main__":
+    crawler = TampereTech()
+    crawler.crawl()

File crawler/tampereuni.py

+"""
+Crawler for University of Tampere
+"""
+
+from base import BaseCrawler, UrlContainer
+import re
+import copy
+
+class TampereUni(BaseCrawler):
+    '''collects course data from the University of Tampere'''
+    def __init__(self):
+        super(TampereUni, self).__init__(name = 'tampereuni')
+    
+    def crawl(self):
+        '''crawls units, departments, courses and course content'''
+        def parse_units(urlobj):
+            '''parses links to units from the front page'''
+            self.logger.debug("parsing links from unit page: {}.".format(
+                urlobj.url))
+            urls = []
+            soup = self.get_soup(urlobj.url)
+            units = soup.findAll("div", {'class': re.compile("frontpage_header*")})
+
+            for unit in units:
+                links = unit.findAll("a")
+                for link in links:
+                    url = UrlContainer(url = link.attrs[0][1], unit = link.text)
+                    urls.append(url)
+            return urls
+
+        def parse_department(urlobj):
+            '''parses links to department/module pages'''
+            self.logger.debug("parsing links to department pages: {0}".format(
+                urlobj.url))
+            urls = []
+            soup = self.get_soup(urlobj.url)
+            context = soup.find("div", {"class": re.compile("department_header*")})
+            dep_header = context.text
+
+            context = soup.find("ul", {'class': 'elementti_listing'})
+            links = context.findAll("a")  #assumed: all anchors in the listing; links was undefined
+            for link in links:
+                urls.append(UrlContainer(url = self.add_root(link.attrs[0][1]),
+                    unit = urlobj.unit, department = dep_header, module = link.text))
+
+            return urls
+
+        def parse_courses(urlobj):
+            '''parses links to course pages'''
+            self.logger.debug("parsing links to courses: {0}".format(
+                urlobj.url))
+            urls = []
+            soup = self.get_soup(urlobj.url)
+            links = soup.findAll("div", {"class": re.compile("tutrak_subElement*")})
+            for link in links:
+                content = link.find('a')
+                url = copy.copy(urlobj)
+                url.url = self.add_root(content.attrs[0][1])
+                url.course = content.text
+                urls.append(url)
+            return urls
+
+        def parse_content(urlobj):
+            '''parses the content of a course page'''
+            self.logger.debug("parsing content of course: {0}". format(
+                urlobj.url))
+            data = {}
+            soup  = self.get_soup(urlobj.url)
+            #parse infobox
+            infobox = soup.find("div", {"class": "opintojakso_infobox"})
+            course = infobox.first("div")
+            #get link to course homepage
+            data[course.a.text.lower()] = course.a.attrs[0][1]
+            #get fields from infobox
+            fields = infobox.findAll("div", {"class": "infobox_header"})
+            for field in fields:
+                data[field.text.lower()] = field.findNextSibling('div').text
+
+            #get content
+            content = soup.find("div", {"id": "layout_content_inner"})
+            body = []
+            for i in range(2, len(content.contents) -1):
+                item = content.contents[i]
+                if len(item) < 2:
+                    continue #excludes newlines and another whitespaces
+                if isinstance(item, basestring):
+                    body.append(item.title().strip())
+                else:
+                    body.append(item.getText("#"))
+
+            fields = content.findAll("h2") #get headers of field from html headers
+            fields = [field.text.lower() for field in fields]
+            data.update( dict(zip(fields, body))) #match header and data together
+
+            return data
+
+        #TODO: add workmanager
+        root_url = self.settings["root_url"]
+        urls = parse_units(UrlContainer(root_url))
+        urls = parse_department(urls[5])
+        urls = parse_courses(urls[1])
+        data = parse_content(urls[4])
+        return urls
+
+
+if __name__ == "__main__":
+    crawler = TampereUni()
+    crawler.crawl()

File crawler/test/__init__.py

Empty file added.

File crawler/test/__init__.pyc

Binary file added.

File crawler/test/all_tests.py

+import glob
+import unittest
+
+def create_test_suite():
+    test_file_strings = glob.glob('test/test_*.py')
+    module_strings = ['test.'+str[5:len(str)-3] for str in test_file_strings]
+    suites = [unittest.defaultTestLoader.loadTestsFromName(name) \
+              for name in module_strings]
+    testSuite = unittest.TestSuite(suites)
+    return testSuite
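
The slice expression 'test.' + str[5:len(str)-3] turns a globbed path such as test/test_base.py into the dotted module name test.test_base by dropping the leading "test/" (5 characters) and the trailing ".py" (3 characters); note that the glob is relative to the current directory, so run_tests.py is expected to be launched from crawler/. For example:

    path = "test/test_base.py"
    print "test." + path[5:len(path) - 3]   # test.test_base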

File crawler/test/all_tests.pyc

Binary file added.

File crawler/test/test_base.py

+"""
+base_test.py
+"""
+
+import unittest
+import base
+
+class BaseTest(unittest.TestCase):
+    """tests for BaseCrawler"""
+    def setUp(self):
+        self.basecrawler = base.BaseCrawler("test")
+        self.broken_url = "http://httpbin.org/headers1"
+        self.working_url = "http://httpbin.org/headers"
+        self.testing_urls = ["http://httpbin.org/ip",
+                            "http://httpbin.org/headers", 
+                            "http://httpbin.org/user-agent"
+                            ]
+
+    def tearDown(self):
+        pass
+
+    def test_crawl(self):
+        self.assertRaises(NotImplementedError, self.basecrawler.crawl)
+    
+    def test_no_soup(self):
+        """ what happens if we give non existed url"""
+        soup = self.basecrawler.get_soup(self.broken_url)
+        self.assertIsNone(soup)
+
+    def test_get_soup(self):
+        """test fetcher with correct url"""
+        soup = self.basecrawler.get_soup(self.working_url)
+        self.assertIsNotNone(soup)
+    
+    def test_work_manager_with_no_urls(self):
+        """Does the work manager end correctly when it has no urls?"""
+        results = self.basecrawler.workmanager(lambda urlobj: [urlobj.url],
+            urls = [[]], avg_pause = 5, std_pause = 1)
+        self.assertEqual(len(results), 0)
+
+    def test_work_manager(self):
+        '''How the work manager behaves with correct data'''
+        urlobjs = [base.UrlContainer(url) for url in self.testing_urls]
+        results = self.basecrawler.workmanager(lambda urlobj: [urlobj.url],
+            urls = urlobjs)
+        self.assertEqual(len(results), len(self.testing_urls))
+    
+
+    
+if __name__ == "__main__":
+    unittest.main()

File crawler/test/test_base.pyc

Binary file added.

File crawler/tse.py

+"""
+Crawler for Turku School of Economy
+"""
+from base import BaseCrawler
+
+class TSECrawler(BaseCrawler):
+    """ """
+    def __init__(self):
+        super(TSECrawler, self).__init__("tse")
+    
+    def crawl(self):
+        """ """
+        logging.debug("Starting crawling")
+        raise NotImplementedError("Their webpage Sucks")
+
+
+if __name__ == "__main__":
+    crawler = TSECrawler()
+    crawler.crawl()

File crossdomain.xml

+<?xml version="1.0"?>
+<!DOCTYPE cross-domain-policy SYSTEM "http://www.adobe.com/xml/dtds/cross-domain-policy.dtd">
+<cross-domain-policy>
+  
+  
+<!-- Read this: www.adobe.com/devnet/articles/crossdomain_policy_file_spec.html -->
+
+<!-- Most restrictive policy: -->
+	<site-control permitted-cross-domain-policies="none"/>
+	
+	
+	
+<!-- Least restrictive policy: -->
+<!--
+	<site-control permitted-cross-domain-policies="all"/>
+	<allow-access-from domain="*" to-ports="*" secure="false"/>
+	<allow-http-request-headers-from domain="*" headers="*" secure="false"/>
+-->
+<!--
+  If you host a crossdomain.xml file with allow-access-from domain="*" 	 	
+  and don’t understand all of the points described here, you probably 	 	
+  have a nasty security vulnerability. ~ simon willison
+-->
+
+</cross-domain-policy>

File dotcloud.yml

+www:
+  type: python
+
+db:
+  type: mongodb
+

File favicon.ico

New image added.

File humans.txt

+/* the humans responsible & colophon */
+/* humanstxt.org */
+
+
+/* TEAM */
+  <your title>: <your name>
+  Site: 
+  Twitter: 
+  Location: 
+
+/* THANKS */
+  Names (& URL): 
+
+/* SITE */
+  Standards: HTML5, CSS3
+  Components: Modernizr, jQuery
+  Software:
+  
+
+                                    
+                               -o/-                       
+                               +oo//-                     
+                              :ooo+//:                    
+                             -ooooo///-                   
+                             /oooooo//:                   
+                            :ooooooo+//-                  
+                           -+oooooooo///-                 
+           -://////////////+oooooooooo++////////////::    
+            :+ooooooooooooooooooooooooooooooooooooo+:::-  
+              -/+ooooooooooooooooooooooooooooooo+/::////:-
+                -:+oooooooooooooooooooooooooooo/::///////:-
+                  --/+ooooooooooooooooooooo+::://////:-   
+                     -:+ooooooooooooooooo+:://////:--     
+                       /ooooooooooooooooo+//////:-        
+                      -ooooooooooooooooooo////-           
+                      /ooooooooo+oooooooooo//:            
+                     :ooooooo+/::/+oooooooo+//-           
+                    -oooooo/::///////+oooooo///-          
+                    /ooo+::://////:---:/+oooo//:          
+                   -o+/::///////:-      -:/+o+//-         
+                   :-:///////:-            -:/://         
+                     -////:-                 --//:        
+                       --                       -:        
+
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <title>Bootstrap, from Twitter</title>
+    <meta name="description" content="">
+    <meta name="author" content="">
+
+    <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
+    <!--[if lt IE 9]>
+      <script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script>
+    <![endif]-->
+
+    <!-- Le styles -->
+    <link href="static/css/bootstrap.min.css" rel="stylesheet">
+    <style type="text/css">
+      /* Override some defaults */
+      html, body {
+        background-color: #eee;
+      }
+      body {
+        padding-top: 40px; /* 40px to make the container go all the way to the bottom of the topbar */
+      }
+      .container > footer p {
+        text-align: center; /* center align it with the container */
+      }
+      .container {
+        width: 820px; /* downsize our container to make the content feel a bit tighter and more cohesive. NOTE: this removes two full columns from the grid, meaning you only go to 14 columns and not 16. */
+      }
+
+      /* The white background content wrapper */
+      .content {
+        background-color: #fff;
+        padding: 20px;
+        margin: 0 -20px; /* negative indent the amount of the padding to maintain the grid system */
+        -webkit-border-radius: 0 0 6px 6px;
+           -moz-border-radius: 0 0 6px 6px;
+                border-radius: 0 0 6px 6px;
+        -webkit-box-shadow: 0 1px 2px rgba(0,0,0,.15);
+           -moz-box-shadow: 0 1px 2px rgba(0,0,0,.15);
+                box-shadow: 0 1px 2px rgba(0,0,0,.15);
+      }
+
+      /* Page header tweaks */
+      .page-header {
+        background-color: #f5f5f5;
+        padding: 20px 20px 10px;
+        margin: -20px -20px 20px;
+      }
+
+      /* Styles you shouldn't keep as they are for displaying this base example only */
+      .content .span10,
+      .content .span4 {
+        min-height: 500px;
+      }
+      /* Give a quick and non-cross-browser friendly divider */
+      .content .span4 {
+        margin-left: 0;
+        padding-left: 19px;
+        border-left: 1px solid #eee;
+      }
+
+      .topbar .btn {
+        border: 0;
+      }
+
+    </style>
+
+    <!-- Le fav and touch icons -->
+    <link rel="shortcut icon" href="static/img/favicon.ico">
+    <link rel="apple-touch-icon" href="static/img/apple-touch-icon.png">
+    <link rel="apple-touch-icon" sizes="72x72" href="static/img/apple-touch-icon-72x72.png">
+    <link rel="apple-touch-icon" sizes="114x114" href="static/img/apple-touch-icon-114x114.png">
+  </head>
+
+  <body>
+  	<!-- #topbar ================================================================ -->
+    <div class="topbar">
+      <div class="fill">
+        <div class="container">
+          <a class="brand" href="#">Unigo</a>
+          <ul class="nav">
+            <li class="active"><a href="#">Home</a></li>
+            <li><a href="#about">About</a></li>
+            <li><a href="#contact">Contact</a></li>
+          </ul>
+          <form action="" class="pull-right">
+            <input class="input-small" type="text" placeholder="Username">
+            <input class="input-small" type="password" placeholder="Password">
+            <button class="btn" type="submit">Sign in</button>
+          </form>
+        </div>
+      </div>
+    </div>
+
+    <!-- #content ===============================================================  -->
+    <div class="container">
+
+      <div class="content">
+      	<!-- #page-header  ======================================================= -->
+        <div class="page-header">
+          	<h1>Page name <small>Supporting text or tagline</small></h1>
+          	<ul class="breadcrumb">
+				  <li><a href="#">Home</a> <span class="divider">/</span></li>
+				  <li><a href="#">Middle page</a> <span class="divider">/</span></li>
+				  <li><a href="#">Another one</a> <span class="divider">/</span></li>
+				  <li class="active">You are here</li>
+			</ul>
+        </div>
+        <!-- #body-content  ===================================================== -->
+        <div class="row">
+          <div class="span14">
+				<p>Example content</p>
+          </div>
+
+        </div>
+      </div>
+
+    <!-- #footer  ================================================================ -->
+      <footer>
+        <p>&copy; Company 2011</p>
+      </footer>
+
+    </div> <!-- /container -->
+
+  </body>
+</html>

File lib/env/bin/activate

+# This file must be used with "source bin/activate" *from bash*
+# you cannot run it directly
+
+deactivate () {
+    # reset old environment variables
+    if [ -n "$_OLD_VIRTUAL_PATH" ] ; then
+        PATH="$_OLD_VIRTUAL_PATH"
+        export PATH
+        unset _OLD_VIRTUAL_PATH
+    fi
+    if [ -n "$_OLD_VIRTUAL_PYTHONHOME" ] ; then
+        PYTHONHOME="$_OLD_VIRTUAL_PYTHONHOME"
+        export PYTHONHOME
+        unset _OLD_VIRTUAL_PYTHONHOME
+    fi
+
+    # This should detect bash and zsh, which have a hash command that must
+    # be called to get it to forget past commands.  Without forgetting
+    # past commands the $PATH changes we made may not be respected
+    if [ -n "$BASH" -o -n "$ZSH_VERSION" ] ; then
+        hash -r
+    fi
+
+    if [ -n "$_OLD_VIRTUAL_PS1" ] ; then
+        PS1="$_OLD_VIRTUAL_PS1"
+        export PS1
+        unset _OLD_VIRTUAL_PS1
+    fi
+
+    unset VIRTUAL_ENV
+    if [ ! "$1" = "nondestructive" ] ; then
+    # Self destruct!
+        unset -f deactivate
+    fi
+}
+
+# unset irrelavent variables
+deactivate nondestructive
+
+VIRTUAL_ENV="/home/timgluz/Projects/unigo/lib/env"
+export VIRTUAL_ENV
+
+_OLD_VIRTUAL_PATH="$PATH"
+PATH="$VIRTUAL_ENV/bin:$PATH"
+export PATH
+
+# unset PYTHONHOME if set
+# this will fail if PYTHONHOME is set to the empty string (which is bad anyway)
+# could use `if (set -u; : $PYTHONHOME) ;` in bash
+if [ -n "$PYTHONHOME" ] ; then
+    _OLD_VIRTUAL_PYTHONHOME="$PYTHONHOME"
+    unset PYTHONHOME
+fi
+
+if [ -z "$VIRTUAL_ENV_DISABLE_PROMPT" ] ; then
+    _OLD_VIRTUAL_PS1="$PS1"
+    if [ "x" != x ] ; then
+	PS1="$PS1"
+    else
+    if [ "`basename \"$VIRTUAL_ENV\"`" = "__" ] ; then
+        # special case for Aspen magic directories
+        # see http://www.zetadev.com/software/aspen/
+        PS1="[`basename \`dirname \"$VIRTUAL_ENV\"\``] $PS1"
+    else
+        PS1="(`basename \"$VIRTUAL_ENV\"`)$PS1"
+    fi
+    fi
+    export PS1
+fi
+
+# This should detect bash and zsh, which have a hash command that must
+# be called to get it to forget past commands.  Without forgetting
+# past commands the $PATH changes we made may not be respected
+if [ -n "$BASH" -o -n "$ZSH_VERSION" ] ; then
+    hash -r
+fi

File lib/env/bin/activate.csh

+# This file must be used with "source bin/activate.csh" *from csh*.
+# You cannot run it directly.
+# Created by Davide Di Blasi <davidedb@gmail.com>.
+
+alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; test "\!:*" != "nondestructive" && unalias deactivate'
+
+# Unset irrelavent variables.
+deactivate nondestructive
+
+setenv VIRTUAL_ENV "/home/timgluz/Projects/unigo/lib/env"
+
+set _OLD_VIRTUAL_PATH="$PATH"
+setenv PATH "$VIRTUAL_ENV/bin:$PATH"
+
+set _OLD_VIRTUAL_PROMPT="$prompt"
+
+if ("" != "") then
+    set env_name = ""
+else
+    if (`basename "$VIRTUAL_ENV"` == "__") then
+        # special case for Aspen magic directories
+        # see http://www.zetadev.com/software/aspen/
+        set env_name = `basename \`dirname "$VIRTUAL_ENV"\``
+    else
+        set env_name = `basename "$VIRTUAL_ENV"`
+    endif
+endif
+set prompt = "[$env_name] $prompt"
+unset env_name
+
+rehash
+

File lib/env/bin/activate.fish

+# This file must be used with ". bin/activate.fish" *from fish* (http://fishshell.org)
+# you cannot run it directly
+
+function deactivate  -d "Exit virtualenv and return to normal shell environment"
+    # reset old environment variables
+    if test -n "$_OLD_VIRTUAL_PATH" 
+        set -gx PATH $_OLD_VIRTUAL_PATH
+        set -e _OLD_VIRTUAL_PATH
+    end
+    if test -n "$_OLD_VIRTUAL_PYTHONHOME"
+        set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME
+        set -e _OLD_VIRTUAL_PYTHONHOME
+    end
+
+    if test -n "$_OLD_FISH_PROMPT_OVERRIDE"
+        functions -e fish_prompt
+        set -e _OLD_FISH_PROMPT_OVERRIDE
+    end
+
+    set -e VIRTUAL_ENV
+    if test "$argv[1]" != "nondestructive"
+        # Self destruct!
+        functions -e deactivate
+    end
+end
+
+# unset irrelavent variables
+deactivate nondestructive
+
+set -gx VIRTUAL_ENV "/home/timgluz/Projects/unigo/lib/env"
+
+set -gx _OLD_VIRTUAL_PATH $PATH
+set -gx PATH "$VIRTUAL_ENV/bin" $PATH
+
+# unset PYTHONHOME if set
+if set -q PYTHONHOME
+    set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME
+    set -e PYTHONHOME
+end
+
+if test -z "$VIRTUAL_ENV_DISABLE_PROMPT"
+    # fish shell uses a function, instead of env vars,
+    # to produce the prompt. Overriding the existing function is easy.
+    # However, adding to the current prompt, instead of clobbering it,
+    # is a little more work.
+    set -l oldpromptfile (tempfile)
+    if test $status
+        # save the current fish_prompt function...
+        echo "function _old_fish_prompt" >> $oldpromptfile
+        echo -n \# >> $oldpromptfile
+        functions fish_prompt >> $oldpromptfile
+        # we've made the "_old_fish_prompt" file, source it.
+        . $oldpromptfile
+        rm -f $oldpromptfile
+        
+        if test -n ""
+            # We've been given us a prompt override.
+            # 
+            # FIXME: Unsure how to handle this *safely*. We could just eval()
+            #   whatever is given, but the risk is a bit much.
+            echo "activate.fish: Alternative prompt prefix is not supported under fish-shell." 1>&2
+            echo "activate.fish: Alter the fish_prompt in this file as needed." 1>&2
+        end        
+        
+        # with the original prompt function renamed, we can override with our own.
+        function fish_prompt                
+            set -l _checkbase (basename "$VIRTUAL_ENV")
+            if test $_checkbase = "__"
+                # special case for Aspen magic directories
+                # see http://www.zetadev.com/software/aspen/
+                printf "%s[%s]%s %s" (set_color -b blue white) (basename (dirname "$VIRTUAL_ENV")) (set_color normal) (_old_fish_prompt)
+            else
+                printf "%s(%s)%s%s" (set_color -b blue white) (basename "$VIRTUAL_ENV") (set_color normal) (_old_fish_prompt)
+            end
+        end 
+        set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV"
+    end
+end
+

File lib/env/bin/activate_this.py

+"""By using execfile(this_file, dict(__file__=this_file)) you will
+activate this virtualenv environment.
+
+This can be used when you must use an existing Python interpreter, not
+the virtualenv bin/python
+"""
+
+try:
+    __file__
+except NameError:
+    raise AssertionError(
+        "You must run this like execfile('path/to/activate_this.py', dict(__file__='path/to/activate_this.py'))")
+import sys
+import os
+
+base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+if sys.platform == 'win32':
+    site_packages = os.path.join(base, 'Lib', 'site-packages')
+else:
+    site_packages = os.path.join(base, 'lib', 'python%s' % sys.version[:3], 'site-packages')
+prev_sys_path = list(sys.path)
+import site
+site.addsitedir(site_packages)
+sys.real_prefix = sys.prefix
+sys.prefix = base
+# Move the added items to the front of the path:
+new_sys_path = []
+for item in list(sys.path):
+    if item not in prev_sys_path:
+        new_sys_path.append(item)
+        sys.path.remove(item)
+sys.path[:0] = new_sys_path

File lib/env/bin/easy_install

+#!/home/timgluz/Projects/unigo/lib/env/bin/python
+# EASY-INSTALL-ENTRY-SCRIPT: 'setuptools==0.6c11','console_scripts','easy_install'
+__requires__ = 'setuptools==0.6c11'
+import sys
+from pkg_resources import load_entry_point
+
+sys.exit(
+   load_entry_point('setuptools==0.6c11', 'console_scripts', 'easy_install')()
+)

File lib/env/bin/easy_install-2.7

+#!/home/timgluz/Projects/unigo/lib/env/bin/python
+# EASY-INSTALL-ENTRY-SCRIPT: 'setuptools==0.6c11','console_scripts','easy_install-2.7'
+__requires__ = 'setuptools==0.6c11'
+import sys
+from pkg_resources import load_entry_point
+
+sys.exit(
+   load_entry_point('setuptools==0.6c11', 'console_scripts', 'easy_install-2.7')()
+)

File lib/env/bin/pip

+#!/home/timgluz/Projects/unigo/lib/env/bin/python
+# EASY-INSTALL-ENTRY-SCRIPT: 'pip==1.0.2','console_scripts','pip'
+__requires__ = 'pip==1.0.2'
+import sys
+from pkg_resources import load_entry_point
+
+sys.exit(
+   load_entry_point('pip==1.0.2', 'console_scripts', 'pip')()
+)

File lib/env/bin/pip-2.7

+#!/home/timgluz/Projects/unigo/lib/env/bin/python
+# EASY-INSTALL-ENTRY-SCRIPT: 'pip==1.0.2','console_scripts','pip-2.7'
+__requires__ = 'pip==1.0.2'
+import sys
+from pkg_resources import load_entry_point
+
+sys.exit(
+   load_entry_point('pip==1.0.2', 'console_scripts', 'pip-2.7')()
+)

File lib/env/bin/python

Binary file added.

File lib/env/include/python2.7

+/usr/include/python2.7

File lib/env/lib/python2.7/UserDict.py

+/usr/lib/python2.7/UserDict.py

File lib/env/lib/python2.7/UserDict.pyc

Binary file added.

File lib/env/lib/python2.7/_abcoll.py

+/usr/lib/python2.7/_abcoll.py

File lib/env/lib/python2.7/_abcoll.pyc

Binary file added.

File lib/env/lib/python2.7/_weakrefset.py

+/usr/lib/python2.7/_weakrefset.py

File lib/env/lib/python2.7/_weakrefset.pyc

Binary file added.

File lib/env/lib/python2.7/abc.py

+/usr/lib/python2.7/abc.py

File lib/env/lib/python2.7/abc.pyc

Binary file added.

File lib/env/lib/python2.7/codecs.py

+/usr/lib/python2.7/codecs.py

File lib/env/lib/python2.7/codecs.pyc

Binary file added.

File lib/env/lib/python2.7/config

+/usr/lib/python2.7/config

File lib/env/lib/python2.7/copy_reg.py

+/usr/lib/python2.7/copy_reg.py

File lib/env/lib/python2.7/copy_reg.pyc

Binary file added.

File lib/env/lib/python2.7/dist-packages/readline-6.2.1-py2.7-linux-i686.egg/readline.so

+/usr/local/lib/python2.7/dist-packages/readline-6.2.1-py2.7-linux-i686.egg/readline.so

File lib/env/lib/python2.7/distutils/__init__.py

+import os
+import sys
+import warnings 
+import opcode # opcode is not a virtualenv module, so we can use it to find the stdlib
+              # Important! To work on pypy, this must be a module that resides in the
+              # lib-python/modified-x.y.z directory
+
+dirname = os.path.dirname
+
+distutils_path = os.path.join(os.path.dirname(opcode.__file__), 'distutils')
+if os.path.normpath(distutils_path) == os.path.dirname(os.path.normpath(__file__)):
+    warnings.warn(
+        "The virtualenv distutils package at %s appears to be in the same location as the system distutils?")
+else:
+    __path__.insert(0, distutils_path)
+    exec(open(os.path.join(distutils_path, '__init__.py')).read())
+
+try:
+    import dist
+    import sysconfig
+except ImportError:
+    from distutils import dist, sysconfig
+try:
+    basestring
+except NameError:
+    basestring = str
+
+## patch build_ext (distutils doesn't know how to get the libs directory
+## path on windows - it hardcodes the paths around the patched sys.prefix)
+
+if sys.platform == 'win32':
+    from distutils.command.build_ext import build_ext as old_build_ext
+    class build_ext(old_build_ext):
+        def finalize_options (self):
+            if self.library_dirs is None:
+                self.library_dirs = []
+            elif isinstance(self.library_dirs, basestring):
+                self.library_dirs = self.library_dirs.split(os.pathsep)
+            
+            self.library_dirs.insert(0, os.path.join(sys.real_prefix, "Libs"))
+            old_build_ext.finalize_options(self)
+            
+    from distutils.command import build_ext as build_ext_module 
+    build_ext_module.build_ext = build_ext
+
+## distutils.dist patches:
+
+old_find_config_files = dist.Distribution.find_config_files
+def find_config_files(self):
+    found = old_find_config_files(self)
+    system_distutils = os.path.join(distutils_path, 'distutils.cfg')
+    #if os.path.exists(system_distutils):
+    #    found.insert(0, system_distutils)
+        # What to call the per-user config file
+    if os.name == 'posix':
+        user_filename = ".pydistutils.cfg"
+    else:
+        user_filename = "pydistutils.cfg"
+    user_filename = os.path.join(sys.prefix, user_filename)
+    if os.path.isfile(user_filename):
+        for item in list(found):
+            if item.endswith('pydistutils.cfg'):
+                found.remove(item)
+        found.append(user_filename)
+    return found
+dist.Distribution.find_config_files = find_config_files
+
+## distutils.sysconfig patches:
+
+old_get_python_inc = sysconfig.get_python_inc
+def sysconfig_get_python_inc(plat_specific=0, prefix=None):
+    if prefix is None:
+        prefix = sys.real_prefix
+    return old_get_python_inc(plat_specific, prefix)
+sysconfig_get_python_inc.__doc__ = old_get_python_inc.__doc__
+sysconfig.get_python_inc = sysconfig_get_python_inc
+
+old_get_python_lib = sysconfig.get_python_lib
+def sysconfig_get_python_lib(plat_specific=0, standard_lib=0, prefix=None):
+    if standard_lib and prefix is None:
+        prefix = sys.real_prefix
+    return old_get_python_lib(plat_specific, standard_lib, prefix)
+sysconfig_get_python_lib.__doc__ = old_get_python_lib.__doc__
+sysconfig.get_python_lib = sysconfig_get_python_lib
+
+old_get_config_vars = sysconfig.get_config_vars
+def sysconfig_get_config_vars(*args):
+    real_vars = old_get_config_vars(*args)
+    if sys.platform == 'win32':
+        lib_dir = os.path.join(sys.real_prefix, "libs")
+        if isinstance(real_vars, dict) and 'LIBDIR' not in real_vars:
+            real_vars['LIBDIR'] = lib_dir # asked for all
+        elif isinstance(real_vars, list) and 'LIBDIR' in args:
+            real_vars = real_vars + [lib_dir] # asked for list
+    return real_vars
+sysconfig_get_config_vars.__doc__ = old_get_config_vars.__doc__
+sysconfig.get_config_vars = sysconfig_get_config_vars

File lib/env/lib/python2.7/distutils/__init__.pyc

Binary file added.

File lib/env/lib/python2.7/distutils/distutils.cfg

+# This is a config file local to this virtualenv installation
+# You may include options that will be used by all distutils commands,
+# and by easy_install.  For instance:
+#
+#   [easy_install]
+#   find_links = http://mylocalsite

File lib/env/lib/python2.7/encodings

+/usr/lib/python2.7/encodings

File lib/env/lib/python2.7/fnmatch.py

+/usr/lib/python2.7/fnmatch.py

File lib/env/lib/python2.7/fnmatch.pyc

Binary file added.

File lib/env/lib/python2.7/genericpath.py

+/usr/lib/python2.7/genericpath.py

File lib/env/lib/python2.7/genericpath.pyc

Binary file added.

File lib/env/lib/python2.7/lib-dynload

+/usr/lib/python2.7/lib-dynload

File lib/env/lib/python2.7/linecache.py

+/usr/lib/python2.7/linecache.py

File lib/env/lib/python2.7/linecache.pyc

Binary file added.

File lib/env/lib/python2.7/locale.py

+/usr/lib/python2.7/locale.py

File lib/env/lib/python2.7/ntpath.py

+/usr/lib/python2.7/ntpath.py

File lib/env/lib/python2.7/orig-prefix.txt

+/usr

File lib/env/lib/python2.7/os.py

+/usr/lib/python2.7/os.py

File lib/env/lib/python2.7/os.pyc

Binary file added.

File lib/env/lib/python2.7/posixpath.py

+/usr/lib/python2.7/posixpath.py

File lib/env/lib/python2.7/posixpath.pyc

Binary file added.

File lib/env/lib/python2.7/re.py

+/usr/lib/python2.7/re.py

File lib/env/lib/python2.7/re.pyc

Binary file added.

File lib/env/lib/python2.7/site-packages/Flask-0.7.2-py2.7.egg/EGG-INFO/PKG-INFO

+Metadata-Version: 1.0
+Name: Flask
+Version: 0.7.2
+Summary: A microframework based on Werkzeug, Jinja2 and good intentions
+Home-page: http://github.com/mitsuhiko/flask/
+Author: Armin Ronacher
+Author-email: armin.ronacher@active-4.com
+License: BSD
+Description: 
+        Flask
+        -----
+        
+        Flask is a microframework for Python based on Werkzeug, Jinja 2 and good
+        intentions. And before you ask: It's BSD licensed!
+        
+        Flask is Fun
+        ````````````
+        
+        ::
+        
+            from flask import Flask
+            app = Flask(__name__)
+        
+            @app.route("/")
+            def hello():
+                return "Hello World!"
+        
+            if __name__ == "__main__":
+                app.run()
+        
+        And Easy to Setup
+        `````````````````
+        
+        ::
+        
+            $ easy_install Flask
+            $ python hello.py
+             * Running on http://localhost:5000/
+        
+        Links
+        `````
+        
+        * `website <http://flask.pocoo.org/>`_
+        * `documentation <http://flask.pocoo.org/docs/>`_
+        * `development version
+          <http://github.com/mitsuhiko/flask/zipball/master#egg=Flask-dev>`_
+        
+        
+Platform: any
+Classifier: Development Status :: 4 - Beta
+Classifier: Environment :: Web Environment
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
+Classifier: Topic :: Software Development :: Libraries :: Python Modules