Commits

Mike Ruckman committed 7979f98

Further cleanup and added rudimentary nose tests.

  • Parent commits 7a7aec6

Files changed (2)

File nose_tests.py

+#!/usr/bin/env python
+
+import unittest
+
+import BeautifulSoup as BS
+import scraper
+
+class TestScraper(unittest.TestCase):
+    '''unittest TestCase for scraper.py
+    Tests the core methods of the Scraper class.
+    '''
+    def setUp(self):
+        """Define the defaults for the tests."""
+        self.url = 'http://news.google.com/'
+        self.search_tag = '[0-9]{13}'
+        self.test = scraper.Scraper(self.url, self.search_tag)
+
+    def test_get_regular_expression(self):
+        """Ensure the RE gets either a passed arg or something from 
+        raw_input()."""
+        self.assertEqual(self.test.search_tag, self.search_tag)
+
+    def test_fetch_page(self):
+        """Ensure a parsed page is fetched from the URL."""
+        self.fetched = self.test.fetch_page()
+        self.assertTrue(isinstance(self.fetched, BS.BeautifulSoup))
+
+    def test_scrape_page(self):
+        """Ensure the scrape returns a list (it can be empty)."""
+        self.test.fetch_page()
+        self.results = self.test.scrape_page()
+        self.assertTrue(isinstance(self.results, BS.ResultSet))
+
+if __name__ == '__main__':
+    unittest.main()
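
A quick way to exercise the new test module (assuming nose is installed; the plain unittest runner works too, since nose discovers unittest.TestCase subclasses):

    $ python nose_tests.py
    $ nosetests nose_tests.py
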
File scraper.py

 import sys
 import getopt
 import re
 
 import mechanize
 import BeautifulSoup as BS
                             "Mozilla/5.0 (compatible; IWantYourData/0.1)")]
 
     def get_regular_expression(self):
-        self.search_tag = raw_input("Enter a RE: ")
-        return self.search_tag
+        # Only prompt when no pattern was supplied to the constructor.
+        if self.search_tag == '':
+            self.search_tag = raw_input("Enter a RE: ")
+        return self.search_tag
 
-    def fetch_page(self, url):
+    def fetch_page(self):
         """
        Open a url to pass to the scrape method.
        Returns a BeautifulSoup parse tree (careful, they can be really long).
         """
-        self.data = mechanize.urlopen(url)
-        self.html = data.read()
-        self.page = BS.BeautifulSoup(html)
-        return self.page
+        self.data = mechanize.urlopen(self.page)
+        self.html = self.data.read()
+        self.parsed_page = BS.BeautifulSoup(self.html)
+        return self.parsed_page
 
     def scrape_page(self):
-        """Parse the results of fetch_page(), return a list. """
-        pass
+        """Parse the results of fetch_page(), return a list.
+        Currently only supports searching to html class attributes.
+        More to come.
+        """
+        self.scrapes = self.parsed_page.findAll(attrs={'class' : 
+                                            re.compile(self.search_tag)})
+        return self.scrapes
 
-def main(argv):
+def main():
     """ The defaults presume that they just want to return a scrape 
     from Google News. This is really just for testing to make sure 
     it will actually return some data.
     """
     
-    # Take nifty cmd line args to vary the searches          
-
-    # RE functions to grab titles and snippets from the       
-    # google news page.                                       
-
-    def grab_google_news(text):
-        titles = text.findAll(attrs={'class' : re.compile("[0-9]{13}")})
-        return titles
-
-    # Make the UserAgent of the browser something legitimate  
-    # to get past the robots.txt through a custom opener.     
-
-    opener = mechanize.Browser()
-    opener.addheaders = [("User-agent", 
-                          "Mozilla/5.0 (compatible; TestBrowser/0.1)")]
-
-    # Define the things we want to grab from the page and     
-    # which page to grab from.                                
 
     url = "http://news.google.com/news/search?aq=f&pz=1&cf=all&ned=us&hl=en&q=new+zealand&btnmeta_news_search=Search+News"
 
-    # Actually do the work. Grab the stuff and print it.      
-    # Newlines are meant to ensure prints don't get lost on   
-    # the terminal.                                           
+    url_to_search, re_to_use = url, '[0-9]{13}'
+    search_terms = []
+    out_file = 'summary.html'
 
-    print "Reading Google News search results for 'New Zealand'...\n"
+    # Parse command line options: -p <page url>, -r <regex>,
+    # -o <output file>, -h prints help.
+    try:
+        options, remaining = getopt.getopt(sys.argv[1:], 'hp:r:o:')
+    except getopt.GetoptError:
+        options = []
+
+    for option, argument in options:
+        if option == '-p':
+            url_to_search = argument
+        elif option == '-r':
+            re_to_use = argument
+        elif option == '-o':
+            out_file = argument
+        elif option == '-h':
+            print help_message()
+            sys.exit(0)
 
-    page = mechanize.urlopen(url)
-    html = page.read()
-    soup = BS.BeautifulSoup(html)
+    url_to_search = "http://news.google.com/news/search?aq=f&pz=1&cf=all&ned=us&hl=en&q=new+zealand&btnmeta_news_search=Search+News"
 
-    print "Writing summaries to file.\n"
+    page_scrape = Scraper(url_to_search, re_to_use)
 
-    file = open("summary.html", 'w')
+    fetched = page_scrape.fetch_page()
 
-    htmlOut = grab_google_news(soup)
+    result = page_scrape.scrape_page()
 
-    file.write(str(htmlOut))  # Has to use str() because BS is funky
+    out_handle = open(out_file, 'w')
+    out_handle.write(str(result))  # Has to use str() because BS is funky
+    out_handle.close()
 
     print "Scraping complete! Open the file 'summary.html'"
 
-    return message
 
 if __name__ == '__main__':
-    main(getopt.getopt(sys.argv[1:], 'hp:o:', 're='))
+    main()
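
The Scraper constructor falls outside these hunks, but judging from the test setUp and from fetch_page()'s use of self.page, it presumably stores the URL as self.page and the pattern as self.search_tag while setting up the mechanize browser whose addheaders line appears above. A minimal sketch under those assumptions (not the committed code):

    class Scraper(object):
        def __init__(self, url, search_tag):
            # The URL lands in self.page, which fetch_page() opens;
            # search_tag feeds re.compile() in scrape_page().
            self.page = url
            self.search_tag = search_tag
            # Spoof a browser User-agent to slip past robots.txt,
            # as the addheaders context line above suggests.
            self.opener = mechanize.Browser()
            self.opener.addheaders = [
                ("User-agent",
                 "Mozilla/5.0 (compatible; IWantYourData/0.1)")]

With the corrected option handling, a typical run looks something like:

    $ python scraper.py -p http://news.google.com/ -r '[0-9]{13}' -o results.html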