Commits

Martin von Löwis committed f5797f5

Import from PyPI r764.

  • Parent commits 951f231


Files changed (2)

File pep381client/apache_reader.py

+"""
+Reads apache log files
+"""
+import bz2
+import gzip
+import re
+import os
+
+# recognized user agents, as (compiled pattern, simplified-name template) pairs
+SETUPTOOLS_UA = (re.compile(r'^.* setuptools/(?P<version>[0-9]\..*)$'), 'setuptools/%s')
+URLLIB_UA = (re.compile(r'^Python-urllib/(?P<version>[23]\.[0-9])$'), 'Python-urllib/%s')
+SAFARI_UA = (re.compile(r'^Mozilla.* .* Version/(?P<version>.*) Safari/.*$'), 'Safari/%s')
+GOOGLEBOT = (re.compile(r'Googlebot-Mobile/(?P<version>.*);'), 'Googlebot-Mobile/%s')
+MSNBOT = (re.compile(r'^msnbot/(?P<version>.*) '), 'msnbot/%s')
+FIREFOX_UA = (re.compile(r'^Mozilla.*? Firefox/(?P<version>[23])\..*$'), 'Firefox/%s')
+PLAIN_MOZILLA = (re.compile(r'^Mozilla/(?P<version>.*?) '), 'Mozilla/%s')
+
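+# matches Apache access-log "GET" lines with a 200 status, capturing the
+# timestamp, the requested path and the user agent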
+logre = re.compile(r"\[(?P<day>..)/(?P<month>...)/(?P<year>....):"
+                   r"(?P<hour>..):(?P<min>..):(?P<sec>..) "
+                   r'(?P<zone>.*)\] "GET (?P<path>[^ "]+) HTTP/1.." 200 .*? (?:".*?")? '
+                   r'"(User-Agent: )?(?P<useragent>.*)"$', re.DOTALL)
+
+month_names = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
+               'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
+month_index = {}
+
+for i in range(12):
+    month_index[month_names[i]] = i+1
+
+def month_to_index(month):
+    return month_index[month.lower()]
+
+class ApacheLogReader(object):
+    """provides an iterator over apache logs"""
+
+    def __init__(self, filename, files_url='', mode=None):
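+        # guess the open mode from the file extension ('.bz2' or '.gz')
+        # when no explicit mode is given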
+        if mode is None:
+            ext = os.path.splitext(filename)[-1]
+            if ext in ('.bz2', '.gz'):
+                mode = 'r:%s' % ext[1:]
+            else:
+                mode = 'r'
+        if ':' in mode:
+            mode, compr = mode.split(':')
+        else:
+            mode, compr = mode, None
+        if compr not in ('bz2', 'gz', None):
+            raise ValueError('%s mode not supported' % compr)
+        if compr == 'bz2':
+            self._data = bz2.BZ2File(filename, mode)
+        elif compr == 'gz':
+            self._data = gzip.open(filename, mode)
+        else:
+            self._data = open(filename, mode)
+
+        self.files_url = files_url
+
+    def __iter__(self):
+        return self
+
+    def package_name(self, path):
+        path = [p for p in path.split('/') if p != '']
+        return path[-2]
+
+    def get_simplified_ua(self, user_agent):
+        """returns a simplified version of the user agent""" 
+        for expr, repl in (URLLIB_UA, SETUPTOOLS_UA, SAFARI_UA, GOOGLEBOT, 
+                           MSNBOT, FIREFOX_UA, PLAIN_MOZILLA):
+            res = expr.search(user_agent)
+            if res is not None:
+                return repl % res.group('version')
+        return user_agent
+
+    def next(self):
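+        # skip lines that do not match the expected log format or the
+        # files_url prefix; StopIteration raised by the underlying file
+        # object ends the iteration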
+        while True:
+            line = self._data.next().strip() 
+            m = logre.search(line)
+            if m is None:
+                continue
+            path = m.group('path')
+            filename = os.path.basename(path)
+            filename = filename.split('?')[0]
+            if not path.startswith(self.files_url) or filename == '':
+                continue
+            res = m.groupdict()
+            res['month'] = month_to_index(res['month'])
+            res['useragent'] = self.get_simplified_ua(res['useragent'])
+            res['filename'] = filename
+            res['packagename'] = self.package_name(path)
+            res['day'] = int(res['day'])
+            res['year'] = int(res['year'])
+            res['hour'] = int(res['hour'])
+            res['minute'] = int(res['min'])
+            return res
+
+        raise StopIteration
+
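For reference, a minimal usage sketch of the reader above; the log file name and package prefix are illustrative, not part of the commit, and the import assumes the package layout shown in the file paths:

    # iterate over a (possibly compressed) Apache log and print one line
    # per recognized package download; 'access.log.bz2' is a placeholder
    from pep381client.apache_reader import ApacheLogReader

    reader = ApacheLogReader('access.log.bz2', files_url='/packages')
    for entry in reader:
        print '%(packagename)s %(filename)s %(useragent)s' % entry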

File pep381client/apache_stats.py

+import os
+import csv 
+import bz2
+import gzip
+import re
+import urllib2
+import socket
+
+from apache_reader import ApacheLogReader
+
+class LocalStats(object):
+    """Base class that writes the log file
+    """
+    def _get_logs(self, logfile, file_urls):
+        """Needs to return an iterator. Each entry
+        should be a dictionary"""
+        if callable(logfile):
+            return logfile(file_urls)
+        raise NotImplementedError
+
+    def _get_file_obj(self, path, mode='r', compression=None):
+        """returns a file object"""
+        if compression == 'bz2':
+            return bz2.BZ2File(path, mode)
+        elif compression == 'gz':
+            return gzip.open(path, mode)
+        return open(path, mode)
+    
+    def _build_stats(self, logfile, fileobj, files_url='/packages', 
+                     filter=None, compression=None):
+        """Builds a stats file
+        
+        - logfile: path to the original log file, or callable
+        - fileobj : a file object or a path to create a file
+        - files_url : a filter that define the beginnin of package urls 
+        - filter: if given, a callable that receives the 
+        current line. if the callable returns True, 
+        the line is not included
+        """
+        if isinstance(fileobj, str):
+            fileobj = self._get_file_obj(fileobj, 'w', compression)
+            file_created = True
+        else:
+            file_created = False
+
+        writer = csv.writer(fileobj)
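+        # aggregate download counts per (filename, user agent, package name)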
+        downloads = {}
+        for log in self._get_logs(logfile, files_url):
+            if filter is not None:
+                if filter(log):
+                    continue
+            filename = log['filename']
+            user_agent = log['useragent'] 
+            package_name = log['packagename']
+            key = (filename, user_agent, package_name)
+            count = log.get('count', 1)
+            if key in downloads:
+                downloads[key] += count
+            else:
+                downloads[key] = count
+        filenames = downloads.keys()
+        filenames.sort()
+        for key in filenames:
+            filename, user_agent, package_name = key
+            count = downloads[key]
+            writer.writerow((package_name, filename, user_agent, count))
+        if file_created:
+            fileobj.close()
+
+    def build_daily_stats(self, year, month, day, logfile, fileobj,
+                          files_url='/packages', compression=None):
+        """creates a daily stats file using an apache log file.
+        
+        - year, month, day: values for the day 
+        - logfile : path to the log file, or callable
+        - fileobj : a file object or a path to create a file
+        - files_url : a filter that define the beginning of package urls
+        """
+        def _filter(log):
+            return (day != log['day'] or month != log['month'] or 
+                    year != log['year'])
+
+        self._build_stats(logfile, fileobj, files_url, _filter, compression)
+
+
+    def build_monthly_stats(self, year, month, logfile, fileobj,
+                            files_url='/packages', compression=None):
+        """creates a monthly stats file using an apache log file.
+        
+        - year, month: values for the month
+        - logfile : path to the log file
+        - fileobj : a file object or a path to create a file
+        - files_url : a filter that define the beginnin of package urls
+        """
+        def _filter(log):
+            return (month != log['month'] or year != log['year'])
+
+        self._build_stats(logfile, fileobj, files_url, _filter, compression)
+
+    def read_stats(self, stats_file):
+        """Returns an iterator over a stats file"""
+        if isinstance(stats_file, str):
+            ext = os.path.splitext(stats_file)[-1][1:]
+            stats_file = self._get_file_obj(stats_file, 'r', ext)
+        reader = csv.reader(stats_file)
+        for line in reader:
+            yield {'packagename': line[0],
+                   'filename': line[1],
+                   'useragent': line[2],
+                   'count': int(line[3])}
+
+    def build_local_stats(self, year, month, day, logfile, directory=None):
+        """builds local stats with default values"""
+        filename = '%d-%d-%d.bz2' % (year, month, day)
+        if directory is not None:
+            filename = os.path.join(directory, filename)
+
+        self.build_daily_stats(year, month, day, logfile, filename, 
+                               compression='bz2')
+
+class ApacheLocalStats(LocalStats):
+    """concrete class that uses the ApacheLogReader"""
+    def _get_logs(self, logfile, files_url):
+        return ApacheLogReader(logfile, files_url)
+
+class ApacheDistantLocalStats(ApacheLocalStats):
+    """Concrete class that gets the data from a distant file"""
+    is_url = re.compile(r'^http://')
+
+    def __init__(self, cache_folder='', timeout=5):
+        self.cache_folder = cache_folder
+        if cache_folder and not os.path.exists(cache_folder):
+            os.makedirs(cache_folder)
+        self.timeout = timeout
+
+    def get_and_cache(self, url):
+        """retrieve the distant file and add it in the local 
+        cache"""
+        basename = url.split('/')[-1]
+        filename = os.path.join(self.cache_folder, basename)
+        if os.path.exists(filename):
+            # in cache, let's return it
+            return filename, open(filename)
+        
+        # not in cache, we need to retrieve it
+        # and store it
+        oldtimeout = socket.getdefaulttimeout()
+        socket.setdefaulttimeout(self.timeout)
+        try:
+            try:
+                content = urllib2.urlopen(url).read()
+            except (urllib2.URLError, socket.timeout):
+                return '', None
+        finally:
+            socket.setdefaulttimeout(oldtimeout)
+
+        f = open(filename, 'wb')
+        try:
+            f.write(content)
+        finally:
+            f.close()
+        
+        return filename, open(filename)
+
+    def read_stats(self, stats_file):
+        """retrieves a remote stats file, caches it and iterates over it"""
+        if self.is_url.search(stats_file) is not None:
+            path, fileobj = self.get_and_cache(stats_file)
+            if path == '':
+                return iter([])
+            stats_file = path
+        return ApacheLocalStats.read_stats(self, stats_file)
+
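As a rough end-to-end sketch of how these classes fit together; the paths, date and mirror URL below are hypothetical:

    from pep381client.apache_stats import ApacheLocalStats, ApacheDistantLocalStats

    stats = ApacheLocalStats()
    # build a bz2-compressed CSV of download counts for one day
    stats.build_daily_stats(2008, 11, 18, '/var/log/apache2/access.log',
                            '2008-11-18.bz2', compression='bz2')

    # read the generated stats file back
    for row in stats.read_stats('2008-11-18.bz2'):
        print row['packagename'], row['filename'], row['count']

    # or fetch the same kind of file from a mirror, caching it locally
    remote = ApacheDistantLocalStats(cache_folder='cache')
    for row in remote.read_stats('http://example.org/stats/2008-11-18.bz2'):
        print row['packagename'], row['count']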