Commits

Mike Ruckman committed 96dbf9e

Added ability to take command line arguments.
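
With this change the scraper can be driven entirely from the command line, for example: python scraper.py -p <url> -r '[0-9]{13}' -o summary.html. The flag names come from the diff below; the regex and output file shown are simply the parser's defaults, and <url> stands in for whatever page you want to scrape.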

Files changed (2)

         self.search_tag = '[0-9]{13}'
         self.test = scraper.Scraper(self.url, self.search_tag)
 
-    def test_get_regular_expression(self):
-        """Ensure the RE gets either a passed arg or something from 
-        raw_input()."""
-        self.assertEqual(self.test.search_tag, self.search_tag)
+    def test_set_regular_expression(self):
+        """Ensure the RE gets passed an arg."""
+        self.test.set_regular_expression('[a-z]')
+        self.assertEqual(self.test.search_tag, '[a-z]')
 
     def test_fetch_page(self):
         """Ensure a page is fetched from the URL. """
 
         Args: 
             page: the URL you want to scrape (defaults to Google News).
-           
             search_tag: the regular expression used to pull out the correct
             block of HTML (defaults to 13 digits, a Google News story id).
         
         self.addheaders = [("User-agent", 
                             "Mozilla/5.0 (compatible; IWantYourData/0.1)")]
 
-    def get_regular_expression(self):
-        if self.search_tag is '': 
-            self.search_tag = raw_input("Enter a RE: ")
-            return self.search_tag
-        else:
-            return None
+    def set_regular_expression(self, regex=None):
+        self.search_tag = regex
+        return self.search_tag
 
     def fetch_page(self):
         """
     from Google News. This is really just for testing to make sure 
     it will actually return some data.
     """
+
+    default_url = "http://news.google.com/news/search?aq=f&pz=1&cf=all&ned=us&hl=en&q=new+zealand&btnmeta_news_search=Search+News"
+
+    # Set up a parser to handle cmd line args
+    parser = argparse.ArgumentParser(prog=sys.argv[0])
     
+    parser.add_argument('-p', action='store', default=default_url,
+                        help='the page you want to search.',
+                        metavar='page')
+    parser.add_argument('-r', action='store', default='[0-9]{13}',
+                        help='the regular expression to search with.',
+                        metavar='regex')
+    parser.add_argument('-o', action='store', default='summary.html',
+                        help='Specify the output file.', metavar='output')
 
-    url = "http://news.google.com/news/search?aq=f&pz=1&cf=all&ned=us&hl=en&q=new+zealand&btnmeta_news_search=Search+News"
+    args = parser.parse_args(sys.argv[1:])
 
-    url_to_search, re_to_use = url, '[0-9]{13}'
-    search_terms = []
-    out_file = 'summary.html'
-
-    try:
-        for option, argument in argv:
-            if option is '-p':
-                url_to_search = argument
-            elif option is '-r':
-                re_to_use = argument
-            elif option is '-o':
-                out_file = argument
-            elif option is '-h':
-                print help_message()
-                sys.exit(0)
-    except:
-        pass
-
-    url_to_search = "http://news.google.com/news/search?aq=f&pz=1&cf=all&ned=us&hl=en&q=new+zealand&btnmeta_news_search=Search+News"
-
-    page_scrape = Scraper(url_to_search, re_to_use)
+    page_scrape = Scraper(args.p, args.r)
 
     fetched = page_scrape.fetch_page()
 
     result = page_scrape.scrape_page()
 
-    file = open(out_file, 'w')
-
+    file = open(args.o, 'w')
     file.write(str(result))  # Has to use str() because BS is funky
 
-    print "Scraping complete! Open the file 'summary.html'"
+    print "Scraping complete! Open the file '%s'"    % Scraper.o
 
 
-def help_message():
-    message = """Scraper pulls specific sections of text from a url.
-    
-    Usage: python scraper.py "search term" [options]
-
-    Options:
-        -p    -- Expects a valid url. 
-                 Default: Google News.
-        
-        -r    -- Expects a valid re for searching.
-                 Default: story id for Google News. 
-       
-        -o    -- Designate the desired output file.
-                 Default: summary.html
-    """
-    return message
-
 if __name__ == '__main__':
     main()
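
For reference, the option handling this commit introduces can be reduced to a small standalone sketch. The parse_options name and the shortened default URL below are illustrative only; the flag names, the other defaults and the general shape mirror the diff above.

    import argparse
    import sys


    def parse_options(argv):
        """Build a -p/-r/-o parser like the one in main() and return the parsed options."""
        parser = argparse.ArgumentParser(prog=argv[0])
        parser.add_argument('-p', action='store', default='http://news.google.com/',
                            help='the page you want to search.', metavar='page')
        parser.add_argument('-r', action='store', default='[0-9]{13}',
                            help='the regular expression to search with.', metavar='regex')
        parser.add_argument('-o', action='store', default='summary.html',
                            help='Specify the output file.', metavar='output')
        return parser.parse_args(argv[1:])


    if __name__ == '__main__':
        opts = parse_options(sys.argv)
        # A single parenthesised argument keeps this print valid on Python 2 and 3.
        print("page=%s  regex=%s  output=%s" % (opts.p, opts.r, opts.o))

Parsing into a plain namespace and handing the values to Scraper's constructor keeps command-line state out of the class itself, which also makes the defaults easy to exercise from the test suite.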