Commits

Ned Batchelder committed d29b1d4

Multi-threaded downloading. Done quickly, not great.

Comments (0)

Files changed (1)

 # "I need to parse HTML, I'm going to use regexes," and an expert answers, 
 # "You shouldn't parse HTML with regexes," they are both confused.
 
-import glob, json, re, shutil, sys, time, urllib, urllib2, zipfile
+import glob, itertools, json, Queue, re, shutil, sys, threading, time, urllib, urllib2, zipfile
 import os, os.path
 from cStringIO import StringIO
 from contextlib import closing
 
         return ids
 
-    def download_tabblos(self, ids):
+    def download_tabblos_threaded(self, ids):
+        """Download all the tabblos in the `ids` list.
+
+        Multi-threaded variant: ten worker threads pull ids off a shared
+        queue; items that fail with an apparently transient error are
+        re-queued for another attempt.
+        """
+        # todo is a Queue of tuples: (tabblo id, seq_number, num_retries).
+        todo = Queue.Queue()
+        for item in zip(ids, itertools.count(1), itertools.repeat(1)):
+            todo.put(item)
+
+        class DownloaderThread(threading.Thread):
+            # Worker thread: drains `todo` until it sees the queue empty.
+            def __init__(self, tid, harvester):
+                threading.Thread.__init__(self)
+                self.tid = tid          # small integer, used only to tag log lines
+                self.harv = harvester   # the enclosing harvester instance
+
+            def log(self, msg):
+                # Prefix every message with this worker's id.
+                self.harv.log("[%s] %s" % (self.tid, msg))
+
+            def run(self):
+                while True:
+                    try:
+                        # Non-blocking get: an empty queue means this worker quits.
+                        # NOTE(review): a worker can exit while other workers still
+                        # hold items that may be re-queued on failure, shrinking the
+                        # pool near the end of the run -- confirm that is acceptable.
+                        id, i, num_retries = todo.get(False)
+                    except Queue.Empty:
+                        self.log("No more to download.")
+                        return
+                    n_of_m = "[%s] %s of %s, %s to go (try %s)" % (self.tid, i, len(ids), todo.qsize(), num_retries)
+                    try:
+                        self.harv.download_tabblo(id, n_of_m)
+                    except HarvesterException:
+                        # NOTE(review): raising out of run() only terminates this
+                        # worker; the main thread's join() will not see the error.
+                        raise
+                    except urllib2.HTTPError, e:
+                        if e.getcode() in (403, 500):
+                            # Treated as permanent failures: record and move on.
+                            # (list.append is safe to call from multiple threads.)
+                            self.log("Can't download %s, skipped. (%s)" % (id, e))
+                            self.harv.skipped.append(id)
+                            continue
+                        else:
+                            # Other HTTP errors are assumed transient.
+                            self.log("Retry later... (%s)" % e)
+                    except Exception, e:
+                        # Any other failure is also assumed transient.
+                        self.log("Retry later... (%s)" % e)
+                    else:
+                        continue
+
+                    # Didn't finish it, add back into the queue to try again later
+                    todo.put((id, i, num_retries+1))
+
+        # Start ten workers and wait for them all to finish.
+        downloaders = [DownloaderThread(tid, self) for tid in range(10)]
+        for dl in downloaders:
+            dl.start()
+
+        for dl in downloaders:
+            dl.join()
+
+    def download_tabblos_requeue(self, ids):
+        """Download all the tabblos in the `ids` list.
+
+        Single-threaded variant: items that fail with an apparently
+        transient error are pushed back into the work list so other items
+        get a turn before the retry.
+        """
+        # q is a list of tuples: (tabblo id, seq_number, num_retries).
+        q = zip(ids, itertools.count(1), itertools.repeat(1))
+        while q:
+            id, i, num_retries = q.pop(0)
+            n_of_m = "%s of %s, %s to go (try %s)" % (i, len(ids), len(q), num_retries)
+            try:
+                self.download_tabblo(id, n_of_m)
+            except HarvesterException:
+                # Fatal: abort the whole run.
+                raise
+            except urllib2.HTTPError, e:
+                if e.getcode() in (403, 500):
+                    # Treated as permanent failures: record and move on.
+                    self.log("Can't download %s, skipped. (%s)" % (id, e))
+                    self.skipped.append(id)
+                    continue
+                else:
+                    # Other HTTP errors are assumed transient.
+                    self.log("Retry later... (%s)" % e)
+            except Exception, e:
+                # Any other failure is also assumed transient.
+                self.log("Retry later... (%s)" % e)
+            else:
+                continue
+
+            # Didn't finish it, add back into the queue to try again later
+            # (position 40 so roughly 40 other items are tried first;
+            # list.insert clamps past-the-end indexes, so this simply
+            # appends when the remaining queue is shorter than 40).
+            q.insert(40, (id, i, num_retries+1))
+
+    def download_tabblos_old_retry_immediately(self, ids):
         """Download all the tabblos in the `ids` list."""
         for i, id in enumerate(ids, 1):
             n_of_m = "%s of %s" % (i, len(ids))
             if e:
                 raise e
 
+    # Select the active implementation: callers keep using the original
+    # `download_tabblos` name; the threaded version is now the default.
+    download_tabblos = download_tabblos_threaded
+
     def download_tabblo(self, id, n_of_m):
         """Download a single tabblo.
 
 
     def links(self, html):
         """Generate a sequence of relative links in the html."""
+        yield "thumbnail.png"
         for link in re.findall(r"""<img [^>]*src='([^']+)'""", html):
             yield link
         for link in re.findall(r"""onclick='location.href="([^"]+)"'""", html):
             self.log("Skipped %d tabblos: %s." % (len(self.skipped), ", ".join(str(i) for i in self.skipped)))
             self.log("Correct those tabblos (if you haven't already deleted them) and rerun Lifeboat.")
             os.remove(self.ids_txt_fname)
-        else:
-            ids = sorted(set(ids) - set(self.skipped))
-            self.generate_toc(ids)
+
+        ids = sorted(set(ids) - set(self.skipped))
+        self.generate_toc(ids)
 
 
 def escape_html(s):