Commits

Timo Sulg committed 840b33a

Today I did a lot of work:
* added test cases for CrawlerUrl, NaiveCrawler
* fixed syntax errors
* repaired coding style - that means I started to change the book author's code
* removed many TODOs
But I still need to change some tests; then there will be a fully functional crawler.

  • Parent commits 3ef4b07


Files changed (7)

Chapter6/__init__.py

+'''
+Marks this directory as a Python package.
+'''
+

Chapter6/result_naivecrawler.txt

Empty file added.

Chapter6/simple_crawler.py

 '''
 import threading
 import re
-import urljoin
+import urllib2
 
-class CrawlerUrl(object):
-    ''' encapsulates the URL visited by the crawler'''
-    pass
+import urlparse
+import urllib
+import hashlib
+from Queue import Queue
 
+#TODO: add examples of how to use this class
+#TODO: filter out unused class properties
 class NaiveCrawler(object):
     ''' implements the crawling process
         params:
-            url_queue   - queue of crawlable urls / root of nodes
+            url_queue   - queue of crawlable urls / root of nodes (a Queue.Queue; created if not given)
             max_urls    - total number of crawlable urls
-            max_depth     - how deeply to crawl
+            max_depth     - how deeply to crawl; if max_depth == -1,
+                            then it is unlimited.
     '''
     USER_AGENT = r"User-agent:"
     DISALLOW = "Disallow:"
-    REXEXP_HTTP = r'<a href="http://(.)*">'
+    REGEXP_HTTP = r'<a href="http://(.)*">'
     REGEXP_RELATIVE = r'<a href="(.)*">'
     BOT_NAME = "*"
 
-    def __init__(self, url_queue, max_urls = 20, max_depth = 1, delay = 5,
-            search_pattern = None):
+    def __init__(self, url_queue = None,
+            max_urls = 20, max_depth = 1, delay = 5,
+            search_pattern = "collective.*intelligence"):
         ''' '''
-        self.url_queue = url_queue
+        self.url_queue = url_queue if url_queue != None else Queue()
         self.max_urls   = max_urls
         self.delay      = delay
-        self.max_depth  = depth
-        self.visited    = 0 #TODO: func or property
+        self.max_depth  = max_depth
+
         self.search_pattern = re.compile(search_pattern)
-        self.http_pattern = re.compile(REXEXP_HTTP)
-        self.rel_pattern = re.compile(REGEXP_RELATIVE)
-        self.visited_urls = {} # "string":"url",
-        self.site_permissions = {} # "string":["url", "url2"]
-        self.output = open("result_{0}".format())
+        self.http_pattern = re.compile(self.REGEXP_HTTP)
+        self.rel_pattern = re.compile(self.REGEXP_RELATIVE)
+        self.visited_urls = {} #TODO: maps "string" -> "url" (host:path might be a better key)
+        #self.site_permissions = {} # "string":["url", "url2"] pointless
+        self.disallowed_urls = dict() #keys - hosts, val - set of full urls
+        self.output = open("result_naivecrawler.txt", "w")
         #TODO:parallel filewriting is thread killer
-        self.stat   = open("stat_{0}".format())
+        self.stat   = open("stat_naivecrawler.txt", "w")
+
+    def visited(self):
+        ''' returns how many pages have already been crawled'''
+        return len(self.visited_urls)
 
     def continue_crawling(self):
-        ''' '''
+        ''' decides whether crawling can continue
+            returns: True if there is a url in the queue and we have not
+            hit the max_urls limit (or max_urls == -1)
+        '''
         result = False
         if (not self.url_queue.empty() and
-            len(self.visited_urls) < self.max_depth):
+            (len(self.visited_urls) < self.max_urls or
+             self.max_urls == -1)):
             result = True
         return result
 
     def get_next_url(self):
         ''' returns next crawlable links -
-        thats copied from book, there is better and pythonic way'''
-        url = None
-
+        that's copied from the book; there is a better, more pythonic way
+        ps: urls that don't match the requirements will be dropped
+        '''
+        next_url = None
         while (next_url == None and
                not self.url_queue.empty()):
             #temp
-            temp = self.url_queue.get()
-            if next_url.visited == False and\
-                next_url.is_permitted == True):
-                url = temp
-                continue
+            next_url = self.url_queue.get()
+            if next_url.is_visited == False and\
+                next_url.is_permitted == True:
+                break
+            #skip urls that are already visited or not permitted
+            next_url = None
 
-        return url
+        return next_url
 
-    def calc_permission(self):
-        ''' calc and sets url permissions
-        again, book example = seems i use this phrase
-        everytime i dont like author style :D -
-        i will refine it later
+    def calc_permission(self, crawler_url):
         '''
-        result = false
-        if crawler_url == None:
-            result = false
+        calculates whether the given url is permitted by robots.txt
+        '''
+        result = True #all links are allowed unless they are explicitly denied
+
+        if isinstance(crawler_url, CrawlerUrl):
+            host = crawler_url.parse_url()
+            host = host.netloc
+            #get permissions
+            if not self.disallowed_urls.has_key(host):
+                #if there is not such host,then try to read them
+                disallowed_paths = self.parse_robot_text(host)
+                self.disallowed_urls[host] = disallowed_paths
+            else:
+                disallowed_paths = self.disallowed_urls[host]
+            #calculate permission
+            if crawler_url.url_string in disallowed_paths:
+                result = False
+            else:
+                result = True
         else:
-            if crawler_url.is_permitted == True:
-                result = True
-            else:
-                parsed_url = urlparse.urlsplit(crawler_url.url_string)
-                if parsed_url['path'] in self.disallowed():
-                    result = False
-                else:
-                    result = True
-
+            raise TypeError, "Argument has to be a CrawlerUrl object"
         return result
 
 
     def parse_robot_text(self, host):
         '''builds set of disallowed path
+            locally: fills the cache of disallowed urls per host
+            globally: returns the given host's uncrawlable paths
             Current solution reads in default(*) bot settings
             * if user given bot exists then reads also user bots
             and returns it,
             ps: reads robot_text from beginning to end, for larger files
             thats means performance issues
         '''
-        disallowed_path = set()
-        default_agent = set() #here we will keep default settings
+        disallowed_paths = set()
+        default_agent_paths = set() #here we will keep default settings
         request = urllib2.Request("http://" + host + "/robots.txt")
         request.add_header("user-agent", "CIbot")
 
         if robot_text:
             for line in robot_text.splitlines():
                 #get row of our useragent, st read lines pattern match
-                if re.match(USER_AGENT, line, re.I):
+                if re.match(self.USER_AGENT, line, re.I):
                     bot = line.split()
                     bot = bot[1].strip()
                     if bot == user_bot and bot != "*":
-                        collector = disallowed_path
+                        collector = disallowed_paths
                         start_filling = True
                     elif bot == "*":
-                        collector = default_agent
+                        collector = default_agent_paths
                         start_filling = True
                     else:
                         start_filling = False
                     continue
 
         #if we didnt find user bot, then return default setting
-        if len(disallowed_path) == 0:
-            disallowed_path = default_agent
-        return disallowed_path
+        if len(disallowed_paths) == 0:
+            disallowed_paths = default_agent_paths
+        return disallowed_paths
 
     def get_content(url):
         '''
         '''
         result = ""
 
-        if not isinstance(url, str):
+        if not isinstance(url, basestring):
             crawler_url = CrawlerUrl(url)
         #fetch result
         try:
             request = urllib2.Request(url.url_string)
             request.add_header("user-agent", "chrome") #
             response = urllib2.urlopen(request)
-            result = response.read() #memory critical - hopefully you read only txt
+            result = response.read() #memory critical - hopefully text only
             url.is_visited = True
         except IOError, e:
             if hasattr(e, "reason"):
         urls = re.findall(r'href=[\'"]?([^\'">]+)',text, re.I)
         host = crawler_url.get_host()
         for i,url in enumerate(urls):
-            #if link is relative
+            #if link is relative, then replace this value with full url
             if url.find("http://") < 0:
                 urls[i] = urlparse.urljoin(crawler_url, url)
         #extract_http(url_map, text)
 
 #TODO: validate point of that function
     def is_content_relevant(content, pattern):
-        '''content is relevant when it matched pattern '''
-        result = True
+        '''content is relevant when it matched pattern
+           This functions looks does this url contains searchable pattern,
+           if it does then it returns True
+        '''
+        result = False
         if content:
-            matcher = re.match(content.lower())
-            result = re.find(matcher)
+            matcher = re.findall(pattern, content.lower())
+            if len(matcher) > 0:
+                result = True
         return result
 
     def save_content(crawler_url, content):
     ''' object that helps to manage url permissions
         filters out visited links
         params:
-            url = string of full url
+            url_string = full url string of the crawling start point
             depth = how deeply to follow links - beaware for spider traps
         usage:
             crawler_url = CrawlerUrl("http://wiki.org/", 1)
     '''
     def __init__(self, url_string, depth = 1):
-        self.url_string  = url
+        self.url_string  = url_string
         self.url = None
         self.depth = depth
-        self.is_permitted = None #3states -None, permission not checked, false/true
+        self.is_permitted = True #assume allowed until robots.txt denies it
         self.is_visited = False
 
     def build_url(self, params = None):
             in book example
         '''
         if params != None:
-            url = urllib.urlencode(self.urlstring, params)
+            #append the url-encoded params as a query string
+            url = self.url_string + "?" + urllib.urlencode(params)
+        else:
+            url = self.url_string
 
         return url
     def parse_url(self):
         ''' splits url to different parts
             returns: dict of url parts
         '''
-        return urlparse.urlsplit(self.urlstring)
+        return urlparse.urlparse(self.url_string)
+
+    def get_host(self):
+        ''' returns the url's scheme and host, e.g. "http://en.wikipedia.org" '''
+        parts = self.parse_url()
+        return parts.scheme + "://" + parts.netloc
 
     def hash_code(self):
-        ''' '''
-       return hashlib.md5(self.urlstring)
+       ''' returns hexdigest of md5 of given url '''
+       hasher = hashlib.md5(self.url_string)
+       return hasher.hexdigest()
 
-    def __eq__(crawler_url = None, urlstring = None):
+
+    def __eq__(self, crawler_url = None):
         ''' this methods compares hashed urls.
             ps: encoded urls may be different from original string
         '''
-        if crawler_url:
+        hash_code2 = ""
+        if isinstance(crawler_url, CrawlerUrl):
             hash_code2 = crawler_url.hash_code()
-        else:
-            hash_code2 = hashlib.md5(urlstring)
+        elif isinstance(crawler_url, basestring):
+            hash_code2 = hashlib.md5(crawler_url).hexdigest()
 
         return self.hash_code() == hash_code2
 
 if __name__ == "__main__":
     url = "http://en.wikipedia.org/wiki/Collective_intelligence"
     reg_exp = "collective.*intelligence"
-    url_queue.add( CrawlerUrl(url, -1))
+    url_queue = Queue()
+    url_queue.put( CrawlerUrl(url, 1))
     crawler = NaiveCrawler(url_queue, 2000, 5, 1000, reg_exp)
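
The parse_robot_text hunk above builds a urllib2 request for robots.txt, but the lines shown never read the response into robot_text. A minimal sketch of how that fetch could look, assuming Python 2 and the urllib2 import already present; the helper name fetch_robot_text and the 5-second timeout are illustrative, not part of this commit:

import urllib2

def fetch_robot_text(host, timeout = 5):
    '''sketch: fetch http://<host>/robots.txt and return its text, or "" on failure'''
    request = urllib2.Request("http://" + host + "/robots.txt")
    request.add_header("user-agent", "CIbot")
    try:
        response = urllib2.urlopen(request, timeout = timeout)
        return response.read()
    except IOError:
        #no robots.txt or unreachable host: treat it as "nothing disallowed"
        return ""

parse_robot_text could call something like this once per host and feed the returned text into its existing splitlines() loop.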
 

Chapter6/simple_crawler.pyc

Binary file added.

Chapter6/stat_naivecrawler.txt

Empty file added.

Chapter6/test_naivecrawler.py

+'''
+Testcases for NaiveCrawler.
+
+'''
+import unittest
+from simple_crawler import *
+from urlparse import urlparse
+from Queue import Queue
+
+class TestNaiveCrawler(unittest.TestCase):
+    '''
+        Testcases for NaiveCrawler.
+    '''
+    url_string = "http://en.wikipedia.org/wiki/Collective_Intelligence"
+    search_pattern = "collective.*intelligence"
+    demo_urls = ["http://bit.ly/",
+            "http://www.neti.ee/",
+            "http://www.w3.org/"
+                ]
+
+    def setUp(self):
+        self.crawler = NaiveCrawler()
+        self.url_queue = Queue()
+
+    def tearDown(self):
+        del self.crawler
+#--TEST INITIALIZING ------------------
+
+    def test_default_initializing(self):
+        ''' Can we use the crawler without specializing it?
+            Useful to check that setUp works correctly.
+        '''
+        crawler = NaiveCrawler()
+        self.assertIsNotNone(crawler)
+
+#--TEST CONTINUE_CRAWLING() ------------------
+    def test_continue_crawling_with_empty_queue(self):
+        ''' what happens when there's no url anymore'''
+        self.assertFalse(self.crawler.continue_crawling())
+
+    def test_continue_crawling_with_depth_url(self):
+        """
+        Test what happens when the number of visited urls is already
+        bigger than max_urls and max_urls is not unlimited
+        """
+        self.crawler.visited_urls = range(1,10)
+        self.crawler.max_urls = 7
+        self.assertFalse(self.crawler.continue_crawling())
+
+    def test_continue_crawling_with_positive_example(self):
+        '''what happens if we have a perfect example:
+        there is a url in the queue and max_urls > number of visited urls
+        '''
+        self.crawler.url_queue.put(CrawlerUrl(self.demo_urls[0]))
+        self.crawler.visited_urls = range(1,5)
+        self.crawler.max_urls = 7
+        self.assertTrue(self.crawler.continue_crawling())
+
+#-- TEST GET_NEXT_URL() -------------------------------------------------------
+    def test_get_next_url_with_empty_queue(self):
+        ''' tests what happens if there is no item in queue'''
+        self.assertIsNone(self.crawler.get_next_url())
+
+    def test_get_next_url_with_all_visited_urls(self):
+        '''tests what happens when the queue holds only already visited links'''
+        for url in self.demo_urls:
+            temp = CrawlerUrl(url)
+            temp.is_visited = True
+            temp.is_permitted = True
+            self.url_queue.put(temp)
+        self.crawler.url_queue = self.url_queue
+        next_url = self.crawler.get_next_url()
+        self.assertIsNone(next_url)
+
+    def test_get_next_url_with_correct_queue(self):
+        #initialize urls queue
+        for url in self.demo_urls:
+            self.crawler.url_queue.put(CrawlerUrl(url))
+        self.assertIsNotNone(self.crawler.get_next_url())
+
+#--TEST CALC_PERMISSION -------------------------------------------------------
+    def test_calc_permission_with_empty_object(self):
+        ''' a just-initialized CrawlerUrl whose host has no
+        disallowed paths should be permitted'''
+        urlobj = CrawlerUrl(self.demo_urls[0])
+        host = urlobj.parse_url().netloc
+        self.crawler.disallowed_urls[host] = set()
+        self.assertEqual(self.crawler.calc_permission(urlobj), True)
+
+    def test_calc_permission_with_disallowed_urls(self):
+        '''tests what happens if the link is disallowed'''
+        urlobj = CrawlerUrl(self.demo_urls[0])
+        host = urlobj.parse_url().netloc #calc_permission keys the cache by netloc
+        self.crawler.disallowed_urls[host] = set([self.demo_urls[0]])
+        self.assertEqual(self.crawler.calc_permission(urlobj), False)
+#--TEST PARSE_ROBOT_TEXT ------------------------------------------------------
+    def test_parse_robot_text_if_nofile(self):
+        '''Tests what happens when the host is given as None'''
+        paths = self.crawler.parse_robot_text(None)
+        self.assertEqual(len(paths), 0)
+
+    def test_get_content_with_none_params(self):
+        '''what happens when user given url is None '''
+        self.assertTrue(False)
+
+    def test_extract_urls_with_no_links(self):
+        '''test what happens when there is no link to extract'''
+        self.assertTrue(False)
+
+#-- RUN TEST ------------------------------------------------------------------
+if __name__ == "__main__":
+    unittest.main()
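
The last two tests above are placeholders that always fail. One possible way to make them meaningful without touching the network is to stub the content fetch; the MockedCrawler class, its canned_html, and the test names below are an illustration of that approach, not part of this commit:

import re
import unittest

from simple_crawler import NaiveCrawler

class MockedCrawler(NaiveCrawler):
    '''test double: returns canned HTML instead of opening a connection'''
    canned_html = '<p>no links here</p>'

    def get_content(self, url):
        if url is None:
            return ""
        return self.canned_html

class TestNaiveCrawlerOffline(unittest.TestCase):

    def test_get_content_with_none_url(self):
        '''a None url should yield no content'''
        crawler = MockedCrawler()
        self.assertEqual(crawler.get_content(None), "")

    def test_extract_urls_with_no_links(self):
        '''HTML without href attributes should yield no urls'''
        urls = re.findall(r'href=[\'"]?([^\'">]+)',
                          MockedCrawler.canned_html, re.I)
        self.assertEqual(urls, [])

if __name__ == "__main__":
    unittest.main()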

Chapter6/test_simple_crawlerurl.py

+'''
+Test cases for CrawlerUrl.
+
+Because that code handles a lot of IO tasks,
+there should be unit tests.
+
+'''
+
+import unittest
+import hashlib
+
+from simple_crawler import CrawlerUrl
+
+class TestCrawlUrl(unittest.TestCase):
+    '''
+        This class includes test cases for CrawlerUrl.
+
+    '''
+    url_string = "http://en.wikipedia.org"
+
+
+    def setUp(self):
+        ''' set up objects, connections, db'''
+        pass
+    def tearDown(self):
+        pass
+
+    def test_initializing(self):
+        '''
+        does initializing work correctly?
+        '''
+        url = CrawlerUrl("http://en.wikipedia.org")
+        self.assertEqual(url.url_string, self.url_string)
+        self.assertEqual(url.depth, 1)
+        url2 = CrawlerUrl("http://et.wikipedia.org/wiki/", depth = 2)
+        self.assertEqual(url2.depth, 2)
+
+    def test_hashing(self):
+        ''' test hashing with some critical values'''
+        url =  CrawlerUrl(self.url_string)
+        hasher = hashlib.md5(self.url_string)
+        self.assertEqual(url.hash_code(), hasher.hexdigest())
+
+    def test_url_parser(self):
+        ''' does url parsing work?'''
+        url = CrawlerUrl(self.url_string +"/wiki?q=demo_query")
+        parsed = url.parse_url()
+        self.assertEqual(parsed.query, "q=demo_query")
+        self.assertEqual(parsed.path, "/wiki")
+
+    def test_comparing(self):
+        '''Does comparing work?'''
+        url1 = CrawlerUrl(self.url_string)
+        url2 = CrawlerUrl(self.url_string)
+        self.assertTrue(url1 == url2)
+        #what if one of them is an empty object
+        url2 = CrawlerUrl(None)
+        self.assertTrue(url1 != url2)
+        #and what happens when both are just None values
+        url1 = CrawlerUrl(None)
+        with self.assertRaises(TypeError):
+            url1 == url2
+
+if __name__ == "__main__":
+    unittest.main()