Commits

Tim Sherratt committed 430cd7e

Oops, removed hardcoded testing URL. Also better handling of 'coming soon' articles.

  • Participants
  • Parent commits bccc4e8

Comments (0)

Files changed (1)

File src/trovenewspapers/harvest.py

         self.csv_file = None
         self.text_zip_file = None
         self.pdf_zip_file = None
+        self.zip_dir = ''
     
     def set_output_files(self, filename, text, pdf):
         '''
             else:
                 self.pdf_zip_file = ZipFile('%s_pdf.zip' % self.path, 'w')
 
-    def harvest(self, query, filename=None, start=0, text=None, pdf=None):
+    def harvest(self, query, filename=None, start=0, text=None, pdf=None, zip_dir='title'):
         '''
         Harvest the results of the supplied query, saving a CSV to the 
         (optional) filename. If no filename is given 
         '''
         self.query = query
+        self.zip_dir = zip_dir
         self.set_output_files(filename, text, pdf)
         if start:
             self.completed = int(start)
                 news.reset()
                 news.tries = 10                
                 page_url = '%s&s=%s' % (self.query, self.completed)
+                print page_url
                 try:
                     news.search(url=page_url)
                 except Exception, error:
                                               news.results['title'])
                     self.csv_file.writerow(news.results)
                     if self.text_zip_file or self.pdf_zip_file:
-                        directory = '%s-%s' % (news.results['newspaper_id'], 
-                                               string.replace(news.results['newspaper_title'], ' ', '-'))
-                        filename = '%s-%s-p%s' % (news.results['id'], 
-                                                  string.replace(news.results['issue_date'], ' ', '-'), 
-                                                  news.results['page'])
+                        if self.zip_dir == 'year':
+                            directory = str(news.results['issue_year'])
+                            filename = '%s-%s-%s-%s-p%s' % (news.results['newspaper_id'], 
+                                                            string.replace(news.results['newspaper_title'], ' ', '-'),
+                                                            news.results['id'], 
+                                                            string.replace(news.results['issue_date'], ' ', '-'), 
+                                                            news.results['page'])
+                        else:
+                            directory = '%s-%s' % (news.results['newspaper_id'], 
+                                                   string.replace(news.results['newspaper_title'], ' ', '-'))
+                            filename = '%s-%s-p%s' % (news.results['id'], 
+                                                      string.replace(news.results['issue_date'], ' ', '-'), 
+                                                      news.results['page'])
                     if self.text_zip_file:
                         self.text_zip_file.writestr(('%s/%s.txt' % 
                                                      #encode added to filename because of problem with Python 2.5
                             self.pdf_zip_file.writestr(('%s/%s.pdf' % 
                                                        (directory, filename)).encode('utf-8'), 
                                                        content.read())
-                    self.completed += 1
                     time.sleep(1)
+            self.completed += 1
+
                     
     def try_url(self, url):
         '''