Tim Sherratt committed bccc4e8

Some minor bug fixes

Comments (0)

Files changed (1)


         The 'title' filter also ensures that a greater variety of titles are exposed.
         if titles:
-            title = self.titles_by_id[random.choice(titles)]
+            try:
+                title = self.titles_by_id[int(random.choice(titles))]
+            except KeyError:
+                title = self.titles_by_id[random.choice(titles)]
             title_id = title['id']
         elif state and state in STATES:
             if not year:
         page = BeautifulSoup(self.response)
         self.total_results = (page.find('div', attrs={'id': 'newspapers'})
                                         .find('div', 'hdrresult')
-                                        .p.strong.string.strip())
+                                        .p.strong.string.strip().replace(',',''))
         self.results = [self.extract_details(result) 
                         for result in page.find('div', {'id': 'newspapers'})
         article['newspaper_title'] = newspaper_title
         article['newspaper_details'] = newspaper_details
         article['issue_date'] = publication_fields.b.string.strip()
+        article['issue_year'], article['issue_month'], article['issue_day'] = extract_date(article['issue_date'])
         article['page'], article['type'] = (re.search(r'(\d+) (.*)', 
                                             page.find('span', 'about')
         article['issue_date'] = page.find('div', 'issue').strong.string.strip()
+        article['issue_year'], article['issue_month'], article['issue_day'] = extract_date(article['issue_date'])
         article['page'] = (page.find('select', attrs = {'name': 'id'})
                             .find('option', attrs = {'selected': 'selected'}).string.strip())
         page_id = re.search(r'var pageId = \'(\d+)\'', self.response).group(1)
                 if response is not None:
                     success = True
-                    if try_num == 10:
+                    if try_num == self.tries:
                         raise ServerError('Nothing was returned')
                         try_num += 1
             return response
+def extract_date(date_string):
+    '''
+    Extracts year, month and day integers from issue date string.
+
+    The string must begin with a date laid out as weekday name, day of
+    month, month name and four-digit year (the '%A %d %B %Y' format);
+    anything following the year is ignored.
+
+    Returns a (year, month, day) tuple of integers.
+
+    Raises AttributeError if the string does not start with text of that
+    shape (re.match returns None), and ValueError if time.strptime cannot
+    parse the matched text.
+    '''
+    # Keep only the leading "<word> <1-2 digits> <word> <4 digits>" portion.
+    cleaned_date = re.match('^(\w+ \d{1,2} \w+ \d{4})', date_string).group(1)
+    date_tuple = time.strptime(cleaned_date, '%A %d %B %Y')
+    # Fields 0-2 of the parsed struct_time are year, month and day of month.
+    year = date_tuple[0]
+    month = date_tuple[1]
+    day = date_tuple[2]
+    return (year, month, day)
 class ServerError(Exception):
 if __name__ == "__main__":
     np = TroveNewspapersClient()
-    #np.search(q="wragge")
+    #np.search(exactPhrase="inclement wragge")
+    np.search(url="http://trove.nla.gov.au/newspaper/result?q=&exactPhrase=inclement+wragge")
     #np.get_random_articles(year="1880", kw_all="kelly", kw_any="ireland irish")
-    np.get_random_articles(year="1880", filters=['title', 'month'])
-    #np.get_article('41339211')
+    #np.get_random_articles(year="1880", filters=['title', 'month'])
+    #np.get_random_articles(year="1880", titles=['35'])
+    #np.get_article('2806947')
     print np.query
     print np.total_results
     print np.results
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use the ↑ and ↓ arrow keys to navigate, and press Return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.