Commits

Andrew Montalenti committed b0c9ae3

Files changed (10)

+.svn
+.pyc
+Copyright (c) 2009, Simon Willison
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+History
+=======
+This project was forked from http://code.google.com/p/openplatform-python/, an official Python binding produced by the Guardian team, which has not seen any improvements since March 2009.
+
+Forked by Andrew Montalenti <andrew@getparsely.com>, CTO of Parse.ly (http://parse.ly).
+
+Dependencies
+============
+
+* simplejson
+* httplib2 (optional, enables caching)
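+
+If httplib2 is installed, the client caches responses automatically (see
+guardianapi/fetchers.py). A minimal sketch of opting out of caching by
+passing an explicit fetcher:
+
+>>> from guardianapi import Client
+>>> from guardianapi import fetchers
+>>> client = Client('my-api-key-goes-here', fetcher=fetchers.Fetcher())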
+
+Usage
+=====
+
+>>> from guardianapi import Client
+>>> client = Client('my-api-key-goes-here')
+>>> results = client.search(q = 'ocelots')
+>>> results.count()
+36
+>>> for item in results:
+...     print item['headline']
+
+This will return the first ten results.
+
+To access the filters (most popular tags) for a result set:
+
+>>> for filter in results.filters():
+...    print filter
+
+To retrieve everything (by paginating across all pages automatically), use the
+following:
+
+>>> for item in results.all():
+...     print item['headline']
+
+This will complete faster if you ask for 50 results per page:
+
+>>> for item in client.search(q = 'ocelots', count = 50).all():
+...     print item['headline']
+
+By default, this will sleep for one second between requesting each page of 
+results. If you find yourself tripping the API's rate limit, you can increase 
+the sleep duration:
+
+>>> for item in client.search(q = 'ocelots', count = 50).all(sleep = 2):
+...     print item['headline']
+
+Some API responses include URLs to make further requests. Here's how to start
+a request using a URL returned from a previous API call:
+
+>>> first_filter_url = results.filters()[0]['apiUrl']
+>>> new_results = client.request(first_filter_url)
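+
+Individual content items can also be fetched by ID (a sketch; the ID below is
+a placeholder for a real item ID taken from a search result or apiUrl):
+
+>>> item = client.item('1234567')
+>>> print item['headline']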

File guardianapi/__init__.py

+from client import Client

File guardianapi/client.py

+try:
+    import simplejson
+except ImportError:
+    from django.utils import simplejson
+import urllib, urlparse, time, re, cgi
+from errors import APIKeyError, ItemNotFound, URLNotRecognised, HTTPError
+import fetchers
+
+class Client(object):
+    base_url = 'http://api.guardianapis.com/'
+    # Map paths (e.g. /content/search) to their corresponding methods:
+    path_method_lookup = (
+        (re.compile('^/content/search$'), 'search'),
+        (re.compile('^/content/tags$'), 'tags'),
+        (re.compile('^/content/item/(\d+)$'), 'item'),
+    )
+    
+    def __init__(self, api_key, fetcher=None):
+        self.api_key = api_key
+        self.fetcher = fetcher or fetchers.best_fetcher()
+    
+    def _do_call(self, endpoint, **kwargs):
+        url = '%s?%s' % (
+            urlparse.urljoin(self.base_url, endpoint),
+            urllib.urlencode(self.fix_kwargs(kwargs), doseq=True)
+        )
+        try:
+            headers, response = self.fetcher.get(url)
+        except fetchers.HTTPError, e:
+            if e.status_code == 403:
+                raise APIKeyError(self.api_key, e)
+            else:
+                raise
+        return simplejson.loads(response)
+    
+    def fix_kwargs(self, kwargs):
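+        # e.g. fix_kwargs({'start_index': 20}) returns
+        # {'start-index': 20, 'format': 'json', 'api_key': '<your key>'}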
+        kwargs2 = dict([ # underscores become hyphens
+            (key.replace('_', '-'), value)
+            for key, value in kwargs.items()
+        ])
+        kwargs2['format'] = 'json'
+        kwargs2['api_key'] = self.api_key
+        return kwargs2
+    
+    def search(self, **kwargs):
+        json = self._do_call('/content/search', **kwargs)
+        return SearchResults(self, kwargs, json)
+    
+    def tags(self, **kwargs):
+        json = self._do_call('/content/tags', **kwargs)
+        return TagResults(self, kwargs, json)
+    
+    def item(self, item_id):
+        try:
+            json = self._do_call('/content/item/%s' % item_id)
+        except HTTPError, h:
+            if str(h.status_code) == '404':
+                raise ItemNotFound(item_id)
+            else:
+                raise
+        return json['content']
+    
+    def request(self, url):
+        "Start with an already constructed URL e.g. apiUrl from a response"
+        bits = urlparse.urlparse(url)
+        path = bits.path
+        kwargs = cgi.parse_qs(bits.query)
+        found_method = None
+        args = tuple()
+        for r, method in self.path_method_lookup:
+            m = r.match(path)
+            if m:
+                found_method = method
+                args = m.groups()
+        if not found_method:
+            raise URLNotRecognised(url)
+        return getattr(self, found_method)(*args, **kwargs)
+    
+    def __repr__(self):
+        return '<%s: %s>' % (self.__class__.__name__, self.base_url)
+
+class Results(object):
+    client_method = None
+    default_per_page = 10 # Client library currently needs to know this
+    
+    def __init__(self, client, kwargs, json):
+        self.client = client
+        self.kwargs = kwargs
+        self.json = json
+    
+    def all(self, **kwargs):
+        "Iterate over all results, handling pagination transparently"
+        return AllResults(self, **kwargs)
+    
+    def count(self):
+        return 0
+    
+    def start_index(self):
+        return 0
+    
+    def per_page(self):
+        return self.kwargs.get('count', self.default_per_page)
+    
+    def __getitem__(self, key):
+        return self.json[key]
+    
+    def results(self):
+        return []
+    
+    def has_next(self):
+        return self.start_index() + self.per_page() < self.count()
+    
+    def next(self):
+        "Return next Results object in pagination sequence, or None if at end"
+        if not self.has_next():
+            return None
+        method = getattr(self.client, self.client_method)
+        kwargs = dict(self.kwargs)
+        start_index = kwargs.get('start_index', 0)
+        count = kwargs.get('count', self.default_per_page)
+        # Adjust the pagination arguments
+        kwargs['count'] = count
+        kwargs['start_index'] = start_index + count
+        return method(**kwargs)
+    
+    def __iter__(self):
+        for result in self.results():
+            yield result
+    
+    def __repr__(self):
+        return '<%s: %d-%d/%d for %r>' % (
+            self.__class__.__name__,
+            self.start_index(), self.start_index() + len(self.results()), 
+            self.count(), self.kwargs
+        )
+
+class SearchResults(Results):
+    client_method = 'search'
+    default_per_page = 10
+    
+    def count(self):
+        return self.json['search']['count']
+    
+    def start_index(self):
+        return self.json['search']['startIndex']
+    
+    def results(self):
+        return self.json['search']['results']
+    
+    def filters(self):
+        return self.json['search']['filters']
+
+class TagResults(Results):
+    client_method = 'tags'
+    default_per_page = 10
+    
+    def count(self):
+        return self.json['subjects']['count']
+    
+    def start_index(self):
+        return self.json['subjects']['startIndex']
+    
+    def results(self):
+        return self.json['subjects']['tags']
+
+class AllResults(object):
+    "Results wrapper that knows how to auto-paginate a result set"
+    def __init__(self, results, sleep=1, debug=lambda msg: None):
+        self.results = results
+        self.sleep = sleep
+        self.debug = debug
+    
+    def __iter__(self):
+        results = self.results
+        while results:
+            self.debug(results)
+            for result in results.results():
+                yield result
+            time.sleep(self.sleep)
+            results = results.next()
+    
+    def __repr__(self):
+        return '<%s: %d for %r>' % (
+            self.__class__.__name__, self.results.count(), self.results.kwargs
+        )

File guardianapi/errors.py

+class APIError(Exception):
+    pass
+
+class APIKeyError(APIError):
+    def __init__(self, api_key, e):
+        self.api_key = api_key
+        self.wrapped_exception = e
+
+    def __repr__(self):
+        return '<APIKeyError: %s is a bad API key>' % self.api_key
+
+class ItemNotFound(APIError):
+    def __init__(self, item_id):
+        self.item_id = item_id
+    
+    def __repr__(self):
+        return '<ItemNotFoundError: %s>' % self.item_id
+
+class URLNotRecognised(APIError):
+    def __init__(self, url):
+        self.url = url
+
+    def __repr__(self):
+        return '<URLNotRecognised: %s>' % self.url
+
+class HTTPError(APIError):
+    def __init__(self, status_code, info=None):
+        self.status_code = status_code
+        self.info = info
+
+    def __repr__(self):
+        return '<HTTPError: %s>' % self.status_code

File guardianapi/fetchers.py

+import urllib2, pickle
+try:
+    import httplib2
+except ImportError:
+    httplib2 = None
+from errors import HTTPError
+
+def best_fetcher():
+    if httplib2:
+        return CacheFetcher() # Uses an in-memory cache
+    else:
+        return Fetcher()
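+
+# Client() falls back to best_fetcher() when no fetcher is supplied, so simply
+# installing httplib2 is enough to enable response caching.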
+
+class Fetcher(object):
+    "Default implementation, using urllib2"
+    def get(self, url):
+        try:
+            u = urllib2.urlopen(url)
+        except urllib2.HTTPError, e:
+            raise HTTPError(e.code, e)
+        headers = u.headers.dict
+        return headers, u.read()
+
+class InMemoryCache(object):
+    def __init__(self):
+        self._cache = {}
+    
+    def get(self, key):
+        return self._cache.get(key)
+    
+    def set(self, key, value):
+        self._cache[key] = value
+    
+    def delete(self, key):
+        if key in self._cache:
+            del self._cache[key]
+
+class CacheFetcher(object):
+    "Uses httplib2 to cache based on the max-age header. Requires httplib2."
+    def __init__(self, cache=None):
+        if cache is None:
+            cache = InMemoryCache()
+        self.http = httplib2.Http(cache)
+    
+    def get(self, url):
+        headers, response = self.http.request(url)
+        if headers['status'] != '200':
+            raise HTTPError(int(headers['status']), headers)
+        return headers, response
+
+class ForceCacheFetcher(object):
+    "Caches every response forever, ignoring the max-age header"
+    def __init__(self, fetcher=None, cache=None):
+        self.fetcher = fetcher or Fetcher()
+        self.cache = cache or InMemoryCache()
+    
+    def get(self, url):
+        cached_value = self.cache.get(url)
+        if cached_value:
+            return pickle.loads(cached_value)
+        headers, response = self.fetcher.get(url)
+        self.cache.set(url, pickle.dumps((headers, response)))
+        return headers, response

File guardianapi/mockapi.py

+"""
+A fetcher that returns fake replies, for running tests.
+"""
+
+from fetchers import Fetcher
+import urlparse, cgi, simplejson, datetime
+from hashlib import md5
+
+class MockFetcher(Fetcher):
+    def __init__(self):
+        self.reset()
+    
+    def reset(self):
+        self.fetched = [] # (url, kwargs-dict) pairs
+        self.fake_total_results = 101
+    
+    def get(self, url):
+        bits = urlparse.urlparse(url)
+        endpoint = bits.path.split('/')[-1]
+        args = tuple()
+        if endpoint not in ('search', 'tags'):
+            if bits.path.startswith('/content/item'):
+                args = (endpoint,)
+                endpoint = 'item'
+            else:
+                assert False, 'Unrecognised URL: %s' % url
+        
+        kwargs = cgi.parse_qs(bits.query)
+        # foo=bar becomes {'foo': ['bar']} - collapse single values
+        for key in kwargs:
+            if isinstance(kwargs[key], list) and len(kwargs[key]) == 1:
+                kwargs[key] = kwargs[key][0]
+        
+        method = getattr(self, 'do_%s' % endpoint)
+        json = method(*args, **kwargs)
+        
+        self.record(url, kwargs, json)
+        
+        return {}, simplejson.dumps(json, indent=4)
+    
+    def record(self, url, args, json):
+        "Record attempted URL fetches so we can run assertions against them"
+        self.fetched.append((url, args))
+        # print '     ', url
+        # print '     ', args
+        # try:
+        #     print '      Got %s results' % len(json['search']['results'])
+        # except KeyError:
+        #     pass
+    
+    def do_search(self, **kwargs):
+        start_index = int(kwargs.get('start-index', 0))
+        count = int(kwargs.get('count', 10))
+        # How many results should we return?
+        num_results = min(
+            self.fake_total_results - start_index, count
+        )
+        
+        return {
+            "search": {
+                "count": self.fake_total_results,
+                "startIndex": start_index,
+                "results": [
+                    self.fake_article(article_id) 
+                    for article_id in range(
+                        start_index, start_index + num_results
+                    )
+                ],
+                "filters": [{
+                    "name": "Article",
+                    "type": "content-type",
+                    "filter": "/global/article",
+                    "apiUrl": "http://mockgdnapi/content/search?filter=/global/article",
+                    "webUrl": "http://www.guardian.co.uk/global/article",
+                    "count": 989610,
+                    "filterUrl": "http://mockgdnapi/content/search?format=json&filter=/global/article"
+                } for i in range(4)]
+            }
+        }
+    
+    def do_tags(self, **kwargs):
+        start_index = int(kwargs.get('start-index', 0))
+        count = int(kwargs.get('count', 10))
+        # How many results should we return?
+        num_results = min(
+            self.fake_total_results - start_index, count
+        )
+        
+        return {
+            "subjects": {
+                "count": self.fake_total_results,
+                "startIndex": start_index,
+                "tags": [{
+                    "name": "Tag %s" % i,
+                    "section": "Tags",
+                    "filter": "/tag/%s" % i,
+                    "apiUrl": "http://mockgdnapi/content/search?filter=/tag/%s" % i,
+                    "webUrl": "http://www.guardian.co.uk/faketag/%s" % i
+                } for i in range(start_index, start_index + num_results)],
+            }
+        }
+    
+    def do_item(self, rest_of_url, **kwargs):
+        return {'content': self.fake_article(rest_of_url.replace('/', ''))}
+    
+    def fake_article(self, article_id):
+        # All generated publication dates are within 365 days of now
+        delta = datetime.timedelta(days = (
+            hash(md5(str(article_id)).hexdigest()) % 365
+        ))
+        publicationDate = datetime.datetime.now() - delta
+        return {
+            "id": str(article_id),
+            "type": "article",
+            "publication": "The Guardian",
+            "headline": "Mock headline %s" % article_id,
+            "standfirst": "Mock standfirst %s" % article_id,
+            "byline": "Mock Byline",
+            "sectionName": "Mock section",
+            "trailText": "Mock trailText %s" % article_id,
+            "linkText": "Mock linkText %s" % article_id,
+            "webUrl": "http://www.guardian.co.uk/fake-url/%s" % article_id,
+            "apiUrl": "http://mockgdnapi/content/item/%s" % article_id,
+            "publicationDate": publicationDate.strftime('%Y-%m-%dT%H:%M:%S'),
+            "typeSpecific": {
+                "@class": "article",
+                "body": "Mock content for article %s" % article_id,
+            },
+            "tags": self.fake_tags(article_id)
+        }
+    
+    def fake_tags(self, article_id):
+        return [{
+            "name": "Article",
+            "type": "content-type",
+            "filter": "/global/article",
+            "apiUrl": "http://mockgdnapi/content/search?filter=/global/article",
+            "webUrl": "http://www.guardian.co.uk/global/article"
+        } for i in range(4)]
+

File guardianapi/tests.py

+import client, mockapi, fetchers
+import unittest, pickle
+
+class BaseTestCase(unittest.TestCase):
+    api_key = 'fake-api-key'
+    
+    def setUp(self):
+        self.fetcher = mockapi.MockFetcher()
+        self.client = self.make_client(self.api_key)
+    
+    def tearDown(self):
+        self.fetcher.reset()
+    
+    def make_client(self, api_key):
+        return client.Client(api_key, fetcher = self.fetcher)
+    
+    def assertRequestCount(self, count):
+        self.assertEqual(len(self.fetcher.fetched), count, 
+            "Expected %d HTTP requests, got %d" % (
+                count, len(self.fetcher.fetched)
+            )
+        )
+    
+    def assertIn(self, needle, haystack):
+        self.assert_(needle in haystack, "Expected to find '%s' in '%s'" % (
+            needle, haystack
+        ))
+    
+class MockFetcherTestCase(BaseTestCase):
+    
+    def test_mock_fetcher(self):
+        "MockFetcher should intercept and record URL retrieval attempts"
+        search_term = 'hello'
+        self.assertRequestCount(0)
+        results = self.client.search(q = search_term)
+        self.assertRequestCount(1)
+        self.assertEqual(self.fetcher.fetched[0][1]['q'], search_term)
+    
+    def test_mock_fetcher_correct_pagination(self):
+        "start-index=90&count=30 on a 101 sized result set should return 11"
+        self.fetcher.fake_total_results = 101
+        self.assertRequestCount(0)
+        results = self.client.search(start_index = 90, count = 30)
+        self.assertEqual(results.start_index(), 90)
+        self.assertEqual(results.count(), 101)
+        self.assertRequestCount(1)
+        self.assertEqual(len(results.results()), 11)
+
+class ClientTestCase(BaseTestCase):
+    
+    def test_results_has_next(self):
+        "results.has_next() should give the correct answers"
+        class MockResults(client.Results):
+            def __init__(self, total_results, start_index, per_page):
+                self._total_results = total_results
+                self._start_index = start_index
+                self._per_page = per_page
+                self.kwargs = {'count': per_page, 'start_index': start_index}
+            
+            def count(self):
+                return self._total_results
+            
+            def start_index(self):
+                return self._start_index
+        
+        r = MockResults(total_results = 10, start_index = 0, per_page = 20)
+        self.assert_(not r.has_next())
+        
+        r = MockResults(total_results = 10, start_index = 0, per_page = 5)
+        self.assert_(r.has_next())
+        
+        r = MockResults(total_results = 101, start_index = 90, per_page = 10)
+        self.assert_(r.has_next())
+        
+        r = MockResults(total_results = 101, start_index = 100, per_page = 10)
+        self.assert_(not r.has_next())
+    
+    def test_api_key(self):
+        "api_key given to Client constructor should be handled automatically"
+        results = self.make_client(api_key = 'foo').search()
+        self.assertEqual(self.fetcher.fetched[-1][1]['api_key'], 'foo')
+        results = self.make_client(api_key = 'bar').search()
+        self.assertEqual(self.fetcher.fetched[-1][1]['api_key'], 'bar')
+    
+    def test_tags(self):
+        "tags() method should return tags"
+        self.assertRequestCount(0)
+        results = self.client.tags(count = 20)
+        self.assertRequestCount(1)
+        self.assertIn('tags', self.fetcher.fetched[-1][0])
+        self.assertEqual(len(list(results)), 20)
+    
+    def test_search(self):
+        "search() method should return results and filters"
+        results = self.client.search(q = 'foo', count = 20)
+        self.assertEqual(len(results.results()), 20)
+        self.assert_(isinstance(results.filters(), list))
+    
+    def test_all_search(self):
+        "search().all() should magically paginate"
+        self.fetcher.fake_total_results = 101
+        self.assertRequestCount(0)
+        results = self.client.search(q = 'foo', count = 30)
+        self.assertRequestCount(1)
+        self.assertEqual(len(results.results()), 30)
+        all_results = list(results.all(sleep = 0))
+        self.assertRequestCount(4)
+        self.assertEqual(len(all_results), 101)
+    
+    def test_all_tags(self):
+        "tags().all() should magically paginate"
+        self.fetcher.fake_total_results = 301
+        self.assertRequestCount(0)
+        results = self.client.tags(count = 100)
+        self.assertRequestCount(1)
+        self.assertEqual(len(results.results()), 100)
+        all_tags = list(results.all(sleep = 0))
+        self.assertRequestCount(4)
+        self.assertEqual(len(all_tags), 301)
+    
+    def test_request_search(self):
+        "client.request(url-to-search-results) should work correctly"
+        url = 'http://gdn/content/search?q=obama'
+        self.assertRequestCount(0)
+        results = self.client.request(url)
+        self.assertEqual(results.kwargs['q'][0], 'obama')
+        self.assertRequestCount(1)
+        self.assert_(isinstance(results, client.SearchResults))
+    
+    def test_request_content(self):
+        "client.fetch(url-to-content) should work correctly"
+        url = 'http://gdn/content/item/123'
+        self.assertRequestCount(0)
+        results = self.client.request(url)
+        self.assertRequestCount(1)
+        self.assert_(isinstance(results, dict))
+        self.assertEqual(results['id'], '123')
+    
+    def test_force_cache_fetcher(self):
+        "ForceCacheFetcher should always return values from a cache"
+        try:
+            import httplib2
+        except ImportError:
+            print "Warning: test_force_cache_fetcher depends on httplib2"
+            return
+        cache = fetchers.InMemoryCache()
+        force_cache_fetcher = fetchers.ForceCacheFetcher(
+            cache = cache,
+            fetcher = self.fetcher
+        )
+        cache_client = client.Client(
+            self.api_key, fetcher = force_cache_fetcher
+        )
+        self.assertEqual(len(cache._cache), 0)
+        # Run a query
+        self.assertRequestCount(0)
+        results1 = str(cache_client.search(q = 'foo', count = 20))
+        self.assertRequestCount(1)
+        # Run it again
+        results2 = str(cache_client.search(q = 'foo', count = 20))
+        self.assertRequestCount(1)
+        self.assertEqual(
+            results1, results2, 'Cached results should be the same'
+        )
+        # Check that changing the query results in a cache miss
+        results3 = str(cache_client.search(q = 'bar', count = 20))
+        self.assertRequestCount(2)
+
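+# The whole suite runs against MockFetcher, so no API key or network access is
+# needed; e.g. run "python tests.py" from inside the guardianapi/ directory.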
+if __name__ == '__main__':
+    unittest.main()

File guardianapi/utils.py

+# Syntactic sugar: classes that allow attribute-style access to nested dicts and lists
+class AttrDictList(list):
+    def transform(self, value):
+        if isinstance(value, dict):
+            return AttrDict(value)
+        elif isinstance(value, list):
+            return AttrDictList(value)
+        else:
+            return value
+    
+    def __getitem__(self, index):
+        value = super(AttrDictList, self).__getitem__(index)
+        return self.transform(value)
+    
+    def __iter__(self):
+        for value in super(AttrDictList, self).__iter__():
+            yield self.transform(value)
+
+class AttrDict(dict):
+    def __getattr__(self, key):
+        try:
+            value = self[key]
+        except KeyError:
+            raise AttributeError, key
+        if isinstance(value, dict):
+            value = AttrDict(value)
+        if isinstance(value, list):
+            value = AttrDictList(value)
+        return value
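+
+# Example (a sketch): attribute access mirrors key lookup, recursively:
+#   article = AttrDict({'headline': 'Ocelots', 'tags': [{'name': 'Article'}]})
+#   article.headline     -> 'Ocelots'
+#   article.tags[0].name -> 'Article'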