Commits

Gregory Petukhov committed 6d10dbf

Most of work is done

Comments (0)

Files changed (18)

-.*\.pyc$
-^update\.sh$
+syntax:glob
+*.pyc
+*.pyo
+*.swp
+*.swo
+*.orig
+*.egg-info/
+dist/
+sape.old/
-Grigoriy Petukhov: http://web-brains.com
+Grigoriy Petukhov - http://lorien.name

INSTALL.txt

-Thanks for using this django application.
-
-Installation instruction:
- * just copy it somewhere and then use :-)
-
-Installation to django project
- * add "sape" to INSTALLED_APPS
- * edit settings.py:
-   * add SAPE_USER = your sape.ru account id
-   * add SAPE_DB_FILE = path to cache file, check it permissions!
-   * optional: add SAPE_VERBOSE = True if you want see error messages when something goes wrong
-===========
-Sape Client
-===========
+Installation
+============
 
-This library allows you to integrate sape.ru service to your python website.
+Just install the `sape` package via easy_install, pip, or from the repository.
 
-For installation instructions see the file "INSTALL.txt" in this
-directory.
+Repository URL: http://bitbucket.org/lorien/sape
 
-For django users:
- * Make all steps from INSTALL.txt then write in templates following:
 
--- begin --
-{% load sape_extras %}
+Usage in django project
+=======================
 
-{% sape_links %}
--- end --
+ * Create a directory where the local links database will be stored
+ * Put 'sape.django' into settings.INSTALLED_APPS
+ * Put 'sape.django.context_processors.sape'
+   into settings.TEMPLATE_CONTEXT_PROCESSORS
+ * Put the path to the local links database into settings.SAPE_DATABASE
+ * Put the sape.ru account ID into settings.SAPE_USER
+ * Put the site hostname into settings.SAPE_HOST
+ * Set up cron to periodically run the command `manage.py sape_refresh`.
+   That command downloads a fresh version of the links database.
+   Sample cron entry: * * * * * cd /web/project; ./manage.py sape_refresh
+ * Put `{{ sape.links|join:", " }}` in the appropriate place in the site's templates
+ 
 
-You can control number of links:
+Usage in arbitrary python project
+=================================
 
--- begin --
-{% sape_links 3 %}
-{% sape_links 2 %}
-{% sape_links %}
--- end --
-
-If you call sape_links several times than do not forget remove number argument from
-last call that will ensures that all links have been displayed.
+ * Write a script that calls the sape.provider.refresh_local_database function and
+   passes it the correct arguments (path to local database, sape.ru account ID, site hostname)
+ * Call that script periodically with cron or anything else
+ * Use a sape.client.Client instance to get the links for the page.

TODO.txt

Empty file removed.
-from util import SapeClient, SapeException
+import itertools
+import logging
+
+from provider import read_database_key
+
+
+class Client(object):
+    """
+    Lazy accessor for the links stored for one page in the local database.
+    """
+
+    def __init__(self, path, url):
+        """
+        Args:
+            path: path to the local database file
+            url: the *escaped* url of the site page whose links should be
+                looked up in the local database
+        """
+
+        self.error = ''
+        self.url = url
+        self.path = path
+
+
+    def links(self):
+        """
+        Return all links for the current page.
+
+        The database is read on the first call only and the result is cached
+        on the instance. If reading fails, an empty list is cached, the
+        message is stored in `self.error` and is also logged.
+        """
+
+        try:
+            return self._links
+        except AttributeError:
+            pass
+
+        try:
+            self._links = read_database_key(self.path, self.url)
+        except Exception, ex:
+            self._links = []
+            message = unicode(ex)
+            self.error = message
+            logging.error(u'sape: %s' % message)
+        return self._links
+
+
+    def next_links(self, number=1):
+        """
+        Return the next `number` links for the current page.
+
+        Successive calls continue where the previous call stopped; once the
+        links are exhausted an empty list is returned.
+        """
+
+        try:
+            remaining = self._iterator
+        except AttributeError:
+            remaining = self._iterator = itertools.chain(self.links())
+        return list(itertools.islice(remaining, number))
+
+
+if __name__ == '__main__':
+    # Manual smoke test: page through the links stored under one key.
+    client = Client('/tmp/links.db', 'five')
+    for count in (1, 2, 10):
+        print client.next_links(count)

sape/django/__init__.py

Empty file added.

sape/django/context_processors.py

+import urllib
+from sape.client import Client
+
+from django.conf import settings
+
+def sape(request):
+    """
+    Template context processor exposing a sape `Client` for the requested page.
+
+    The lookup key is the percent-encoded request path followed by the raw
+    query string (if any). The links database is keyed by request URI, so
+    no scheme or host is included in the key.
+
+    Args:
+        request: the current django request
+
+    Return:
+        dict with the single key `sape` holding the `Client` instance
+    """
+
+    path = urllib.quote(request.path.encode('utf-8'), safe='/')
+    # META['QUERY_STRING'] could be None
+    qs = request.META.get('QUERY_STRING') or ''
+    url = '%s%s%s' % (path, (qs and '?' or ''), qs)
+    client = Client(settings.SAPE_DATABASE, url)
+    return {'sape': client}

sape/django/management/__init__.py

Empty file added.

sape/django/management/commands/__init__.py

Empty file added.

sape/django/management/commands/sape_refresh.py

+import logging
+
+from django.core.management.base import BaseCommand, CommandError
+from django.conf import settings
+
+from sape.provider import refresh_local_database
+
+class Command(BaseCommand):
+    """
+    Management command that refreshes the local sape.ru links database.
+    """
+
+    help = 'Refresh local database containing sape.ru links'
+
+    def handle(self, *args, **kwargs):
+        # Enable DEBUG-level output for the `logging` calls made while
+        # the database is refreshed.
+        logging.basicConfig(level=logging.DEBUG)
+        database = settings.SAPE_DATABASE
+        user = settings.SAPE_USER
+        host = settings.SAPE_HOST
+        refresh_local_database(database, user, host)
+import anydbm
+import httplib
+# NOTE(review): this turns on httplib debug output for every HTTPConnection
+# in the process as soon as the module is imported -- looks like a debugging
+# leftover; confirm before release.
+httplib.HTTPConnection.debuglevel = 1
+import urllib2
+from StringIO import StringIO
+from gzip import GzipFile
+import logging
+import socket
+
+import version
+import phpserialize
+
+# Value of the User-Agent header added to outgoing requests.
+USER_AGENT = 'python-sape/%s http://bitbucket.org/lorien/sape' % version.VERSION
+# Path + query string template of the links file; filled in by
+# build_database_url().
+REMOTE_DATABASE_PATH = '/code.php?user=%(user)s&host=%(host)s&charset=utf-8'
+# Servers tried in order by fetch_database().
+SAPE_SERVERS = ['dispenser-01.sape.ru', 'dispenser-02.sape.ru']
+# Default socket timeout, in seconds.
+DEFAULT_TIMEOUT = 10
+# Separator used to pack a list of links into a single dbm value.
+DB_DELIMITER = '||'
+
+# NOTE(review): this sets the default timeout for all new sockets in the
+# process, not only the ones opened by this module.
+socket.setdefaulttimeout(DEFAULT_TIMEOUT)
+
+def build_database_url(user, client_host, sape_server):
+    """
+    Return the absolute URL of the remote links file on one sape.ru server.
+
+    Args:
+        user: sape.ru account ID
+        client_host: hostname of the client site
+        sape_server: hostname of the sape.ru server to download from
+    """
+
+    query = {'user': user, 'host': client_host}
+    return 'http://%s%s' % (sape_server, REMOTE_DATABASE_PATH % query)
+
+
+def fetch_remote_file(url):
+    """
+    Retrieve a remote file and return its content.
+
+    The request carries the library User-Agent and asks for gzip
+    compression; a gzip-encoded response body is decompressed before it
+    is returned.
+
+    Args:
+        url: absolute URL of the remote file
+
+    Return:
+        Content of the retrieved file or None if something went wrong
+        (the failure is logged).
+    """
+
+    logging.debug(u'Fetching remote file from %s' % url)
+    req = urllib2.Request(url)
+    req.add_header('User-Agent', USER_AGENT)
+    req.add_header('Accept-Encoding', 'gzip')
+    opener = urllib2.build_opener()
+
+    try:
+        # Open the prepared request, not the bare url, so that the
+        # headers set above are actually sent.
+        response = opener.open(req)
+    except Exception, ex:
+        logging.error(u'Error: %s' % ex)
+        return None
+
+    try:
+        if response.code != 200:
+            logging.error(u'Invalid response status: %d' % response.code)
+            return None
+        data = response.read()
+        logging.debug(u'Response headers: %s' % dict(response.headers))
+        if 'gzip' in response.headers.get('Content-Encoding', ''):
+            data = GzipFile(fileobj=StringIO(data)).read()
+        return data
+    except Exception, ex:
+        # A failed read or a corrupt gzip body is reported the same way as
+        # a failed connection, so the caller can try the next server.
+        logging.error(u'Error: %s' % ex)
+        return None
+    finally:
+        response.close()
+
+
+def fetch_database(user, host):
+    """
+    Return the content of the remote database for given `user` and `host`.
+
+    The servers listed in SAPE_SERVERS are tried in order. The first
+    non-empty response that does not start with 'FATAL ERROR' is returned;
+    error responses are logged. None is returned if no server delivered
+    usable data.
+
+    Args:
+        user: sape.ru account ID
+        host: hostname of the client site
+    """
+
+    for server in SAPE_SERVERS:
+        content = fetch_remote_file(build_database_url(user, host, server))
+        if not content:
+            continue
+        if not content.startswith('FATAL ERROR'):
+            return content
+        logging.error(u'Sape.ru error: %s' % content)
+    return None
+
+
+def refresh_local_database(path, user, host):
+    """
+    Fetch the remote database, parse it and save it to the local database.
+
+    Nothing is saved when the download yields no data or when the data
+    can not be parsed (the parse error is logged).
+
+    Args:
+        path: path to the local database file
+        user: sape.ru account ID
+        host: hostname of the client site
+    """
+
+    data = fetch_database(user, host)
+    if not data:
+        return
+
+    try:
+        mapping = parse_database(data)
+    except Exception, ex:
+        logging.error(u'Invalid database structure: %s' % ex)
+        return
+
+    save_database(path, mapping)
+
+
+def parse_database(data):
+    """
+    Parse the raw php-serialized data fetched from the sape.ru server.
+
+    Return:
+        dict mapping every key of the dump to a list: the values of the
+        entry when the entry is itself a dict, otherwise a one-item list
+        holding the entry.
+    """
+
+    def as_list(entry):
+        if isinstance(entry, dict):
+            return entry.values()
+        return [entry]
+
+    dump = phpserialize.loads(data)
+    return dict((key, as_list(entry)) for key, entry in dump.iteritems())
+
+
+def save_database(path, mapping):
+    """
+    Save database in dbm file.
+
+    Each list of values is stored as one string joined with DB_DELIMITER.
+    Any error is logged, not raised.
+
+    Args:
+        path: path to the local database file
+        mapping: str -> (str,)
+    """
+
+    try:
+        # NOTE(review): flag 'c' opens an existing file as is, so keys that
+        # are missing from `mapping` are kept -- confirm this is intended.
+        db = anydbm.open(path, 'c')
+        try:
+            # Iterate over (key, value) pairs; iterating the dict itself
+            # would yield only the keys.
+            for key, links in mapping.items():
+                db[key] = DB_DELIMITER.join(links)
+        finally:
+            db.close()
+    except Exception, ex:
+        logging.error(u'Error while saving database: %s' % ex)
+
+
+def read_database_key(path, key):
+    """
+    Read the links stored under `key` in the local database.
+
+    If the key is not found, fall back to the "__sape_new_url__" key.
+
+    Args:
+        path: path to the local database file
+        key: key to look up
+
+    Return:
+        List of the stored values; an empty list if nothing was found.
+
+    Errors raised by anydbm.open (e.g. the file can not be opened) are
+    passed on to the caller.
+    """
+
+    db = anydbm.open(path)
+    try:
+        try:
+            value = db[key]
+        except KeyError:
+            # Plain indexing instead of .get(): the latter is not
+            # guaranteed to exist on every dbm backend.
+            try:
+                value = db['__sape_new_url__']
+            except KeyError:
+                value = ''
+    finally:
+        db.close()
+
+    # ''.split(DB_DELIMITER) would give [''], i.e. one bogus empty link.
+    if not value:
+        return []
+    return value.split(DB_DELIMITER)
+
+
+if __name__ == '__main__':
+    # Manual smoke test: dump the links stored under one key.
+    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(message)s')
+    #fetch_remote_database('http://a.local')
+    #fetch_database('user', 'host')
+    print read_database_key('/tmp/links.db', 'foo')

sape/templatetags/__init__.py

Empty file removed.

sape/templatetags/sape_extras.py

-"""
-Examples of useage:
-{% sape_links %}
-or
-{% sape_links 1 %}...{% sape_links 2 %}...{% sape_links %}
-or
-<ul class="foo">{% sape_links_list 2 %}</ul>...
-"""
-from django import template
-from django.conf import settings
-
-from sape import SapeClient, SapeException
-
-register = template.Library()
-
-SAPE_VERBOSE = getattr(settings, 'SAPE_VERBOSE', False)
-CLIENT_KEY = 'sape_client_cached'
-
-
-def build_sape_links_node(parser, token, join):
-    parts = token.split_contents()
-    if len(parts) > 1:
-        number = int(parts[1])
-    else:
-        number = 0
-    return SapeLinksNode(number, join=join)
-
-
-@register.tag
-def sape_links(parser, token):
-    return build_sape_links_node(parser, token, join=True) 
-
-
-@register.tag
-def sape_links_list(parser, token):
-    return build_sape_links_node(parser, token, join=False) 
-
-
-class SapeLinksNode(template.Node):
-    def __init__(self, number, join):
-        self.number = number
-        self.join = join
-
-    def render(self, context):
-        if not CLIENT_KEY in context:
-            request = context['request']
-            try:
-                context[CLIENT_KEY] = build_sape_client(request)
-            except SapeException, ex:
-                return SAPE_VERBOSE and unicode(ex) or ''
-        client = context[CLIENT_KEY]
-
-        if not self.join:
-            return ',\n'.join('<li>%s</li>' % x for x in client.return_links(self.number))
-        else:
-            return client.return_links(self.number, join=True)
-
-
-@register.tag
-def sape_debug(parser, token):
-    return SapeDebugNode()
-
-
-class SapeDebugNode(template.Node):
-    def render(self, context):
-        request = context['request']
-        try:
-            client = build_sape_client(request)
-        except SapeException, ex:
-            error = unicode(ex)
-        else:
-            error = ''
-
-        pages_count = 0
-        uris_count = 0
-        for key, items in client.links.iteritems():
-            if not key.startswith('__'):
-                pages_count += 1
-                uris_count += len(items)
-
-        tpl = 'User: %s, host: %s, error: %s, pages in cache: %s, uris in cache: %s,' +\
-              ' cache time: %s, cache updated: %s'
-        args = (client.user, client.host, error, pages_count, uris_count,
-                client.db_file_mtime, client.db_file_updated)
-        return tpl % args
-
-
-def build_sape_client(request):
-    # META['QUERY_STRING'] could be None
-    qs = request.META.get('QUERY_STRING') or ''
-    uri = ''.join([request.path, len(qs) and '?' or '', qs])
-    if hasattr(settings, 'SAPE_HOST'):
-        host = settings.SAPE_HOST
-    else:
-        if 'HTTP_HOST' in request.META:
-            host = request.META['HTTP_HOST']
-        else:
-            host = request.META['SERVER_NAME']
-
-    for key in ('SAPE_USER', 'SAPE_DB_FILE'):
-        if not hasattr(settings, key):
-            raise Exception('settings.%s is undefined' % key)
-
-    client = SapeClient(host=host, user=settings.SAPE_USER,
-                        request_uri=uri, db_file=settings.SAPE_DB_FILE)
-    return client

sape/util.py

-"""
-This library allows to integrate sape.ru service into python website.
-This library is NOT official sape.ru script.
-
-This code uses fcntl library which is not windows compatible.
-This library is port from PHP version 1.0.3.
-I tryed to save original structure (even identifier names) of code to easely comparing with original.
-You can find source of PHP version here: http://dumpz.org/2883/
-
-Example of usage:
-
-sape = SapeClient(host='YOUR SITE DOMAIN',
-                  request_uri='THE URI OF REQUESTED PAGE',
-                  user='SAPE USER ID',
-                  db_file='path where sape links should be saved')
-links = sape.return links()
-
-You can send questions, bugreports and wishes to lizendir@gmail.com
-"""
-
-import random
-import socket
-import fcntl
-import os
-from datetime import datetime, timedelta
-import time
-import urllib2
-import urllib
-
-import phpserialize
-
-# Default settings
-SAPE_VERBOSE = False
-SAPE_CHARSET = None
-SAPE_SERVER_LIST = ['dispenser-01.sape.ru', 'dispenser-02.sape.ru']
-SAPE_CACHE_LIFETIME = 3600
-SAPE_CACHE_RELOADTIME = 600
-SAPE_ERROR = ''
-SAPE_FETCH_REMOTE_TYPE = 'file_get_contents'
-SAPE_SOCKET_TIMEOUT = 6
-SAPE_DB_FILE = ''
-SAPE_USER_AGENT = 'SAPE_Client python'
-SAPE_FORCE_SHOW_CODE = True
-
-class SapeException(Exception):
-    pass
-
-
-class SapeBase(object):
-    def __init__(self, *args, **kwargs):
-        self.host = kwargs['host']
-        if self.host.startswith('www.'):
-            self.host = self.host[4:]
-        self.request_uri = kwargs['request_uri']
-        self.user = kwargs['user']
-        self.db_file = kwargs['db_file']
-
-        keys = ('verbose', 'charset', 'socket_timeout',
-                'cache_lifetime', 'cache_reloadtime',
-                'force_show_code', 'debug', 'server_list')
-        for key in keys:
-            default = globals().get('SAPE_%s' % key.upper())
-            setattr(self, key, kwargs.get(key, default))
-
-        self.dispenser_path = '/code.php?user=%s&host=%s&charset=utf-8' % (self.user, self.host)
-
-        cookies = kwargs.get('cookies', {})
-        self.is_our_bot = cookies.get('sape_cookie') == self.user
-        self.debug = cookies.get('sape_debug') == '1'
-
-        random.shuffle(self.server_list)
-
-    
-    def fetch_remote_file(self, host, path):
-        old_timeout = socket.getdefaulttimeout()
-        socket.setdefaulttimeout(self.socket_timeout)
-
-        url = 'http://%s%s' % (host, path)
-        req = urllib2.Request(url)
-        req.add_header('User-Agent', SAPE_USER_AGENT)
-        try:
-            resp = urllib2.urlopen(req)
-        except urllib2.URLError:
-            socket.setdefaulttimeout(old_timeout)
-            raise SapeException('Network error')
-        else:
-            socket.setdefaulttimeout(old_timeout)
-            return resp.read()
-
-
-
-    def _open(self, fname, mode, lock):
-        """
-        Open file in MODE mode and the lock it with LOCK level.
-        """
-
-        try:
-            fh = file(fname, mode)
-        except IOError:
-            raise SapeError('Could not open %s in %s mode' % (fname, mode))
-        else:
-            try:
-                fcntl.flock(fh, lock)
-            except IOError:
-                raise SapeException('Could not lock file %s' % fname)
-            else:
-                return fh
-
-
-    def _read(self, fname):
-        """
-        Read the file safely.
-        """
-
-        fh = self._open(fname, 'r', fcntl.LOCK_SH)
-        data = fh.read()
-        fcntl.flock(fh, fcntl.LOCK_UN)
-        fh.close()
-        return data
-
-
-    def _write(self, fname, data):
-        """
-        Write to file safely.
-        """
-
-        fh = self._open(fname, 'w', fcntl.LOCK_EX)
-        fh.write(data)
-        fcntl.flock(fh, fcntl.LOCK_UN)
-        fh.close()
-
-
-    def load_data(self):
-        """
-        Load cached links and refresh them from sape.ru site if they is too old.
-        """
-
-        if not os.path.exists(self.db_file):
-            try:
-                file(self.db_file, 'w').write('')
-                os.chmod(self.db_file, 0666)
-            except IOError:
-                raise SapeException('Could not create %s' % self.db_file)
-
-        mtime = datetime.fromtimestamp(os.stat(self.db_file).st_mtime)
-        check_time = datetime.now() - timedelta(seconds=self.cache_lifetime)
-
-        self.db_file_mtime = mtime
-        self.db_file_updated = False
-
-        if mtime < check_time or not os.path.getsize(self.db_file):
-            self.db_file_updated = True
-
-            new_mtime = check_time + timedelta(seconds=self.cache_reloadtime)
-            ts = time.mktime(new_mtime.timetuple())
-            os.utime(self.db_file, (ts, ts))
-
-            for server in self.server_list:
-                data = self.fetch_remote_file(server, self.dispenser_path)
-                if data.startswith('FATAL ERROR'):
-                    raise SapeException(data)
-                else:
-                    try:
-                        # check the integrity of data
-                        phpserialize.loads(data)
-                    except:
-                        raise SapeException('Could not deserialize repsonse from server')
-                    else:
-                        self._write(self.db_file, data)
-
-        data = self._read(self.db_file)
-        return phpserialize.loads(data)
-
-
-class SapeClient(SapeBase):
-    
-    def __init__(self, *args, **kwargs):
-        self.links_delimiter = ''
-        self.links = []
-        self.links_page = []
-        
-        super(SapeClient, self).__init__(self, *args, **kwargs)
-        self.set_data(self.load_data())
-
-
-    def return_links(self, number=None, join=False):
-        """
-        Return links for current URI.
-
-        You can call this function several times with NUMBER argument.
-        Note that last call MUST NOT have NUMBER argument.
-        """
-
-        if isinstance(self.links_page, list):
-            total_page_links = len(self.links_page)
-            if not number or number > total_page_links:
-                number = total_page_links
-            links = self.links_page[:number]
-            self.links_page = self.links_page[number:]
-
-            links = map(self.decode, links)
-            if join: 
-                html = self.links_delimiter.join(links)
-                if self.is_our_bot:
-                    html = '<sape_noindex>%s</sape_noindex>' % html
-                return html 
-            else:
-                return links
-        else:
-            return self.links_page
-
-
-    def set_data(self, data):
-        self.links = data
-
-        if '__sape_delimiter__' in self.links:
-            self.links_delimiter = self.links['__sape_delimiter__']
-
-        uri_links = self.links.get(urllib.quote(
-            self.request_uri.encode('utf-8'), safe='/?&='), None)
-
-        if isinstance(uri_links, dict):
-            self.links_page = uri_links.values()
-        else:
-            new_url = self.links.get('__sape_new_url__', None)
-            if new_url:
-                # Note that default force_show_code value is True
-                # I did this because I'm not sure that every user
-                # of this library will pass cookies dict to SapeClient
-                # constructor (cookies required for detecting sape bot)
-                if self.is_our_bot or self.force_show_code:
-                    self.links_page = [new_url]
-
-
-    def decode(self, data):
-        data = data.decode('utf-8')
-        if self.charset:
-            data = data.encode(self.charset)
-        return data
+# Package version string; used to build the HTTP User-Agent.
+VERSION = '0.1.0'
-#!/usr/bin/env python
-from distutils.core import setup
+import os
+from setuptools import setup
 
-setup_data = {
-      'name': 'sape',
-      'version': '0.1',
-      'author': 'Grigoriy Petukhov',
-      'author_email': 'lizendir@gmail.com',
-      'url': 'http://hg.pydev.ru/sape',
-      'description': 'sape.ru API',
-      'long_description': """Library for communicate with sape.ru""",
-      'packages': ['sape'],
-      'license': "BSD License",
-      'platforms': "All",
-      'classifiers': [
-          'Topic :: Internet :: WWW/HTTP :: Dynamic Content',
-          'License :: OSI Approved :: BSD License',
-          'Operating System :: POSIX',
-          'Programming Language :: Python',
-          'Natural Language :: Russian',
-          'Development Status :: 4 - Beta',
-          'Intended Audience :: End Users/Desktop',
-          'Environment :: Web Environment',
-        ],
-    }
+# Compile the list of packages available, because distutils doesn't have
+# an easy way to do this.
 
-setup(**setup_data)
+packages, data_files = [], []
+# Run from the directory of this script so that the relative walk below works.
+root_dir = os.path.dirname(__file__)
+if root_dir:
+    os.chdir(root_dir)
+
+PACKAGE = 'sape'
+
+for dirpath, dirnames, filenames in os.walk(PACKAGE):
+    # Prune in place (slice assignment) so that os.walk sees the change;
+    # deleting items while enumerating the same list would skip entries.
+    dirnames[:] = [name for name in dirnames if name not in ('.', '..')]
+    if '__init__.py' in filenames:
+        # A directory with __init__.py is a package: path -> dotted name.
+        pkg = dirpath.replace(os.path.sep, '.')
+        if os.path.altsep:
+            pkg = pkg.replace(os.path.altsep, '.')
+        packages.append(pkg)
+    elif filenames:
+        # Anything else is shipped as package data, relative to PACKAGE.
+        prefix = dirpath[len(PACKAGE) + 1:] # Strip package directory + path separator
+        for f in filenames:
+            data_files.append(os.path.join(prefix, f))
+
+# `packages` and `data_files` are collected by the os.walk loop above.
+setup(
+    version = '0.1.0',
+    description = 'Sape.ru client library',
+    author = 'Grigoriy Petukhov',
+    author_email = 'lorien@lorien.name',
+    url = 'http://bitbucket.org/lorien/sape',
+    name = 'sape',
+
+    packages = packages,
+    package_data = {'sape': data_files},
+
+    license = "BSD",
+    keywords = "django application sape.ru",
+    classifiers=[
+        'Development Status :: 4 - Beta',
+        'Environment :: Web Environment',
+        'Framework :: Django',
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: BSD License',
+        'Operating System :: OS Independent',
+        'Programming Language :: Python',
+        'Topic :: Utilities'
+    ],
+)