Commits

Brian Curtin  committed 4884117

Initial checkin

  • Participants

Comments (0)

Files changed (1)

+from urllib.request import urlopen
+from collections import namedtuple
+from datetime import datetime, timedelta
+import sys
+import re
+import os
+import time
+
+# Refuse to run on Python 2: report the problem and stop with a non-zero
+# status instead of printing the message and carrying on regardless.
+# NOTE(review): the `urllib.request` import above already raises ImportError
+# on Python 2, so this guard is only a second line of defence.
+if sys.version_info[0] < 3:
+    print("Use Python 3")
+    sys.exit(1)
+
+# Root of the python.org web statistics; main() requests the monthly pages
+# named url_<YYYYMM>.html below it.
+BASE_URL = "http://www.python.org/webstats/"
+# Directory (relative to the current working directory) holding the
+# downloaded pages, so each month is requested only once.
+CACHE_DIR = "webstats_cache"
+
+# Need to limit the amount we read from each month's stats page.
+# In Jan 2010 we started storing tons of buildbot pages and downloads are
+# usually at the top of the list anyway. Only check roughly the first 1 MB of
+# the file.
+MAX_SIZE = 1024 * 1000 # 1,024,000 bytes, i.e. roughly 1 MB
+
+# Matches one line of the monthly URL report that refers to a Windows
+# installer (.exe or .msi) under /ftp/python/. Raw strings keep the regex
+# escapes (\d, \S, \.) from being read as invalid string escapes.
+pattern = (r"^(?P<count>\d*)"  # Hit count
+           r".*"  # A bunch of percentages we don't care about
+           r"/ftp/python//?"  # The start of the download path
+           # Pick out the version, e.g., 3.2.1, 3.2, or "nt" (1.5.x)
+           r"(?P<nt>(?:nt)?(?:win32)?)?"
+           r"((?P<major>\d)\.(?P<minor>\d)\.?(?P<revision>\d)?)?"
+           # The installer name. The dot before the extension is escaped so
+           # it only matches a literal ".", not any character. The group may
+           # still match the empty string; parse() rejects that case.
+           r"/(?P<filename>(?:\S*\.exe)?(?:\S*\.msi)?)$")
+expr = re.compile(pattern)
+
+# One parsed report line: hit count, release Version, and installer file name.
+Stats = namedtuple("Stats", ["hits", "version", "file"])
+# Plain tuple base for Version; Version only adds a friendlier str().
+_Version = namedtuple("Version", ["major", "minor", "revision"])
+
+class Version(_Version):
+    """A (major, minor, revision) triple that prints as a dotted version.
+
+    ``revision`` may be ``None`` (or an empty string) when the release has no
+    third component; ``str()`` then yields just ``major.minor``.
+    """
+    def __str__(self):
+        # Test for "missing" explicitly rather than by truthiness, so that a
+        # revision of 0 is rendered ("3.2.0") instead of silently dropped.
+        if self.revision is None or self.revision == "":
+            return "{}.{}".format(self.major, self.minor)
+        return "{}.{}.{}".format(self.major, self.minor, self.revision)
+
+def parse(line):
+    """Turn one line of the stats report into a ``Stats`` tuple.
+
+    Returns ``None`` when the line is not usable: the module-level ``expr``
+    does not match it, it has no leading hit count, or no installer file name
+    was captured.
+    """
+    match = expr.match(line)
+    if not match:
+        return None
+
+    # The pattern allows an empty hit count (\d*); int("") would raise
+    # ValueError, so treat such a line as "not a stats row".
+    count = match.group("count")
+    if not count:
+        return None
+
+    # The regex can match with an empty file name; if we don't get a name,
+    # skip the line.
+    filename = match.group("filename")
+    if not filename:
+        return None
+
+    if match.group("nt"):
+        # Downloads from the "nt"/"win32" directories are counted as 1.5.
+        # TODO: Figure out the revision.
+        ver = Version(1, 5, None)
+    else:
+        ver = Version(match.group("major"), match.group("minor"),
+                      match.group("revision"))
+    return Stats(int(count), ver, filename)
+
+def _month_keys(now=None):
+    """Return "YYYYMM" strings from 200601 through the last full month."""
+    if now is None:
+        now = datetime.now()
+    keys = []
+    # Database starts at 200601, and we want to go through the last full month
+    year, month = 2006, 1
+    # Use "<" rather than "==" so a clock set before 2006-01 yields an empty
+    # list instead of looping forever.
+    while (year, month) < (now.year, now.month):
+        keys.append("{0}{1:02}".format(year, month))
+        if month == 12:
+            year, month = year + 1, 1
+        else:
+            month += 1
+    return keys
+
+def _download(date):
+    """Fetch one month's page, reading only a little past MAX_SIZE bytes."""
+    # Strip any trailing "/" from BASE_URL so the path never contains "//".
+    address = "{}/url_{}.html".format(BASE_URL.rstrip("/"), date)
+    chunks = []
+    bytes_read = 0
+    # The context manager closes the HTTP response even if a read fails.
+    with urlopen(address) as response:
+        t = time.time()
+        # Stop once the limit is exceeded; the final chunk is still kept.
+        while bytes_read <= MAX_SIZE:
+            chunk = response.read(4096)
+            if not chunk:
+                break
+            chunks.append(chunk)
+            bytes_read += len(chunk)
+    print("Reading {} bytes read took {}".format(bytes_read,
+                                                 time.time() - t))
+    # Join once instead of growing a bytes object chunk by chunk. latin1 maps
+    # every byte to a character, so decoding cannot fail.
+    return b"".join(chunks).decode("latin1", "replace")
+
+def _load_page(date):
+    """Return the text of one month's page, using the cache when possible."""
+    cached_file = os.path.join(CACHE_DIR, "url_{}.html".format(date))
+    # An explicit latin1 encoding round-trips the downloaded text exactly and
+    # avoids encode errors from the platform's default encoding.
+    # NOTE(review): cache files written by an earlier run with a different
+    # default encoding may show non-ASCII characters differently.
+    if os.path.exists(cached_file):
+        with open(cached_file, "r", encoding="latin1") as f:
+            return f.read()
+
+    print("{} was not cached, requesting...".format(date))
+    text = _download(date)
+    with open(cached_file, "w", encoding="latin1") as f:
+        f.write(text)
+    return text
+
+def _tally(text):
+    """Sum the hits per "major.minor" for the report lines in ``text``."""
+    # The report sits between <PRE> and </PRE>. Either tag may be missing
+    # (the download is truncated at MAX_SIZE), so fall back to the start or
+    # end of the text instead of slicing with find()'s -1.
+    start = text.find("<PRE>")
+    start = 0 if start == -1 else start + len("<PRE>\n")
+    # find() already returns the index where "</PRE>" begins, so nothing more
+    # is trimmed from the end of the report.
+    end = text.find("</PRE>", start)
+    if end == -1:
+        end = len(text)
+
+    major_stats = {}
+    for line in text[start:end].split("\n"):
+        # Drop a stray carriage return so "$" in the pattern can still match.
+        line = line.rstrip("\r")
+        stats = parse(line)
+        if stats is None:
+            continue
+
+        majorminor = "{}.{}".format(stats.version.major, stats.version.minor)
+        if majorminor == "None.None":
+            # No version directory was recognised; show the line for review.
+            print(line)
+        major_stats[majorminor] = major_stats.get(majorminor, 0) + stats.hits
+    return major_stats
+
+def main():
+    """Print download hit totals per "major.minor" version for each month.
+
+    Walks every month from 200601 up to (not including) the current one,
+    loads that month's page from CACHE_DIR or downloads and caches it, and
+    prints the month followed by a dict of hits keyed by "major.minor".
+    """
+    # exist_ok tolerates an existing directory on every platform; other
+    # failures (e.g. permissions) now surface instead of being swallowed.
+    os.makedirs(CACHE_DIR, exist_ok=True)
+
+    for date in _month_keys():
+        major_stats = _tally(_load_page(date))
+
+        print(date)
+        print(major_stats)
+        print("*"*50)
+
+if __name__ == "__main__":
+    try:
+        # sys.exit is always available; the bare exit() helper is injected by
+        # the site module and is missing e.g. under "python -S".
+        sys.exit(main())
+    except KeyboardInterrupt:
+        # Ctrl-C is an expected way to stop the run; leave without a traceback.
+        pass