Commits

Mike Ruckman  committed 7a7aec6

Cleaned code up to meet PEP 8. Reorganized to make more sense.

  • Participants
  • Parent commits c49acd3

Comments (0)

Files changed (1)

 #!/usr/bin/env python
 
-import mechanize, re
-from BeautifulSoup import BeautifulSoup
+import sys
+import getopt
+import re
+
+import mechanize
+import BeautifulSoup as BS
 
 class Scraper(mechanize.Browser):
-    ''' Subclass mechanize.Browser() to create a wrapper for simply scraping sites.
-    Attempt to act as a easy import for simple scraping scripts. 
-    '''
-    def __init__(self, page=None, outFile=None, searchTag=None):
-        '''
-    There are 3 arguments you can pass this constructor: page, outFile and searchTag.
-    
-        page is url you want to scrape (defaults to Google News).
-    
-        outFile is where you want to direct the output (Defaults to 'summary.html').
-    
-        searchTag is what re you want to use to yank the correct block of html 
-        (defaults to 13 numbers for a GNews story id.
+    """ 
+    Subclass mechanize.Browser() to create a wrapper for scraping sites. 
+    """
+    def __init__(self, page, search_tag):
+        """Construct a new Scraper. 
+
+        Args: 
+            page: is url you want to scrape (defaults to Google News).
+           
+            search_tag: is what re you want to use to yank the correct 
+            block of html (defaults to 13 numbers for a GNews story id.
         
-        The defaults presume that they just want to return a scrape from Google News. This is
-        really just for testing to make sure it will actually return some data. 
-        The defaults are soon to be amended.
+        """
+        self.page = page
+        self.search_tag = search_tag
+        self.addheaders = [("User-agent", 
+                            "Mozilla/5.0 (compatible; IWantYourData/0.1)")]
 
-        '''
-        self.page = 'http://news.google.com/'
-        if outFile == None:
-            self.outFile = open("summary.html", 'w')
-        else: 
-            self.outFile = open(str(outFile), 'w')
-        self.searchTag = '[0-9]{13}'
-        self.addheaders = [("User-agent", "Mozilla/5.0 (compatible; IWantYourData/0.1)")]
+    def get_regular_expression(self):
+        """Prompt the user for a regular expression, store it on
+        self.search_tag, and return it."""
+        self.search_tag = raw_input("Enter a RE: ")
+        return self.search_tag
 
-    def getRE(self):
-        self.searchTag = raw_input("Enter a RE: ")
-        return self.searchTag
+    def fetch_page(self, url):
+        """
+        Open a url to pass to the scrape method. 
+        Returns a string (careful, they can be really long.
+        """
+        self.data = mechanize.urlopen(url)
+        self.html = data.read()
+        self.page = BS.BeautifulSoup(html)
+        return self.page
 
-    def openURL(self, url=None):
-        ''' open a url to pass to the scrape method. returns a string (careful, they can be really long. '''
-        if url == None:
-            urlInput = raw_input("Enter a url to scrape: ")
-            if urlInput == "":      # Allow for a default to be used.
-                data = mechanize.urlopen(self.page)
-            else:
-                data = mechanize.urlopen(urlInput)
-        else:
-            data = mechanize.urlopen(url)
-        html = data.read()
-        return html
-
-    def scrape(self):
-        '''Open a URL and pull the data, returns a list. '''
+    def scrape_page(self):
+        """Parse the results of fetch_page() and return a list.
+
+        TODO: not yet implemented — currently a stub.
+        """
         pass
 
-if __name__ == '__main__':
+def main(argv):
+    """ The defaults presume that they just want to return a scrape 
+    from Google News. This is really just for testing to make sure 
+    it will actually return some data.
+    """
+    
+    # NOTE(review): `argv` (the getopt result) is never consulted below,
+    # so the -p/-r/-o options described in help_message() have no effect.
+    # Take nifty cmd line args to vary the searches          
 
-    # RE functions to grab titles and snippets from the       #
-    # google news page.                                       #
+    # RE functions to grab titles and snippets from the       
+    # google news page.                                       
 
-    def grabGNews(text):
+    def grab_google_news(text):
+        # Google News marks each story block with a 13-digit class id.
         titles = text.findAll(attrs={'class' : re.compile("[0-9]{13}")})
         return titles
 
-    # Make the UserAgent of the browser something legitimate  #
-    # to get past the robots.txt through a custom opener.     #
+    # Make the UserAgent of the browser something legitimate  
+    # to get past the robots.txt through a custom opener.     
 
     opener = mechanize.Browser()
-    opener.addheaders = [("User-agent", "Mozilla/5.0 (compatible; TestBrowser/0.1)")]
+    opener.addheaders = [("User-agent", 
+                          "Mozilla/5.0 (compatible; TestBrowser/0.1)")]
 
-    # Define the things we want to grab from the page and     #
-    # which page to grab from.                                #
+    # Define the things we want to grab from the page and     
+    # which page to grab from.                                
 
     url = "http://news.google.com/news/search?aq=f&pz=1&cf=all&ned=us&hl=en&q=new+zealand&btnmeta_news_search=Search+News"
 
-    # Actually do the work. Grab the stuff and print it.      
+    # Actually do the work. Grab the stuff and print it.      
+    # Newlines are meant to ensure prints don't get lost on   
+    # the terminal.                                           
 
     print "Reading Google News search results for 'New Zealand'...\n"
 
     page = mechanize.urlopen(url)
     html = page.read()
-    soup = BeautifulSoup(html)
+    soup = BS.BeautifulSoup(html)
 
     print "Writing summaries to file.\n"
 
+    # NOTE(review): `file` shadows the builtin and is never closed or
+    # flushed — consider renaming and using a close()/with block.
     file = open("summary.html", 'w')
 
-    htmlOut = grabGNews(soup)
+    htmlOut = grab_google_news(soup)
 
-    file.write(str(htmlOut))
+    file.write(str(htmlOut))  # Has to use str() because BS is funky
 
     print "Scraping complete! Open the file 'summary.html'"
+
+
+def help_message():
+    """Return the command-line usage text as a string."""
+    message = """Scraper pulls specific sections of text from a url.
+    
+    Usage: python scraper.py "search term" [options]
+
+    Options:
+        -p    -- Expects a valid url. 
+                 Default: Google News.
+        
+        -r    -- Expects a valid re for searching.
+                 Default: story id for Google News. 
+       
+        -o    -- Designate the desired output file.
+                 Default: summary.html
+    """
+    return message
+
+if __name__ == '__main__':
+    main(getopt.getopt(sys.argv[1:], 'hp:o:', 're='))