Evgeniy Tatarkin committed 622ddf6

prefix example file names


Files changed (5)

examples/01_pythonnews.py

-"""
-Extract python news from python.org
-"""
-import sys
-import re
-import logging
-from pomp.core.base import BaseCrawler, BasePipeline
-from pomp.core.item import Item, Field
-from pomp.contrib.urllibtools import UrllibDownloader
-
-
-logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
-news_re = re.compile(
-    r'<h2 class="news">(.*?)</h2>([\s\S]*?)<div class="pubdate">(.*?)</div>')
-
-
-class PythonNewsItem(Item):
-    title = Field()
-    published = Field()
-
-    def __repr__(self):
-        return '%s\n\t%s\n' % (
-            self.title,
-            self.published,
-        )
-
-
-class PythonNewsCrawler(BaseCrawler):
-    ENTRY_REQUESTS = 'http://python.org/news/'
-
-    def extract_items(self, response):
-        for i in news_re.findall(response.body.decode('utf-8')):
-            item = PythonNewsItem()
-            item.title, item.published = i[0], i[2]
-            yield item
-
-    def next_requests(self, response):
-        return None  # single-page crawler
-
-
-class PrintPipeline(BasePipeline):
-
-    def process(self, crawler, item):
-        print(item)
-
-
-if __name__ == '__main__':
-    from pomp.core.engine import Pomp
-
-    pomp = Pomp(
-        downloader=UrllibDownloader(),
-        pipelines=[PrintPipeline()],
-    )
-
-    pomp.pump(PythonNewsCrawler())

examples/02_livejournal.py

-# -*- coding: utf-8 -*-
-"""
-    Friends of the LiveJournal user http://grrm.livejournal.com/
-
-    requires: lxml
-
-    stores CSV data to /tmp/friends.csv
-    prints the result dict
-
-"""
-import sys
-import logging
-import pprint
-from collections import defaultdict
-
-from lxml import html
-
-from pomp.core.base import BaseCrawler
-from pomp.contrib.pipelines import CsvPipeline
-from pomp.core.base import BasePipeline, BaseDownloaderMiddleware
-from pomp.contrib.urllibtools import UrllibHttpRequest
-from pomp.core.item import Item, Field
-
-
-logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
-
-
-class LXMLDownloaderMiddleware(BaseDownloaderMiddleware):
-
-    def __init__(self, encoding=None):
-        self.encoding = encoding
-
-    def process_request(self, request):
-        return request
-
-    def process_response(self, response):
-        if self.encoding:
-            response.tree = html.fromstring(
-                response.body.decode(self.encoding))
-        else:
-            response.tree = html.fromstring(response.body)
-        return response
-
-
-class DictCollectPipeline(BasePipeline):
-
-    def start(self, crawler):
-        self.result = defaultdict(lambda: {'friends': []})
-
-    def process(self, crawler, item):
-        self.result[item.friend_to]['friends'].append(item.username)
-        return item
-
-
-class FriendItem(Item):
-    username = Field()
-    friend_to = Field()
-
-    def __repr__(self):
-        return 'username: %s friend to:%s' % (
-            self.username,
-            self.friend_to,
-        )
-
-
-class FriendLevelRequest(UrllibHttpRequest):
-    def __init__(self, *args, **kwargs):
-        self.level = kwargs.get('level', 0)
-        self.username = kwargs['username']
-        # clean up kwargs before passing them to urllib
-        if 'level' in kwargs:
-            del kwargs['level']
-        if 'username' in kwargs:
-            del kwargs['username']
-        super(FriendLevelRequest, self).__init__(*args, **kwargs)
-
-
-class LJFriendSpider(BaseCrawler):
-    QS = '?socconns=pfriends&mode_full_socconns=1'
-    BASE_URL = 'http://{0}.livejournal.com/profile/friendlist' + QS
-
-    ENTRY_REQUESTS = (
-        FriendLevelRequest(
-            BASE_URL.format('grrm'),
-            username='grrm'
-        ),
-    )
-
-    FRIENDS_XPATH = '//dl[contains(@data-widget-options, "socconns")]' \
-                    '/dd[@class="b-profile-group-body"]' \
-                    '/div[@class="b-tabs-content"]/a'
-
-    def __init__(self, max_level=2, friends_limit=2):
-        """LiveJournal spider
-
-            :param max_level: maximum depth of friend pages to dive into
-            :param friends_limit: number of first friends on a page to follow
-        """
-        self.max_level = max_level
-        self.friend_limit = friends_limit
-        self._next_requests = []
-        super(LJFriendSpider, self).__init__()
-
-    def extract_items(self, response):
-        items = []
-        k = 0
-        # find item section
-        for i in response.tree.xpath(self.FRIENDS_XPATH):
-            item = FriendItem()
-            item.username = i.text
-
-            # associate the parsed user with the user from the parent request
-            item.friend_to = response.request.username
-            items.append(item)
-
-            # follow to item.username
-            if response.request.level < self.max_level \
-                    and k < self.friend_limit:
-                # generate new url to follow
-                url = i.get('href') + self.QS
-                self._next_requests.append(FriendLevelRequest(
-                    url,
-                    username=item.username,
-                    level=response.request.level + 1
-                ))
-                k += 1
-        return items
-
-    def next_requests(self, response):
-        # after items are extracted, pomp calls the next_requests
-        # method to get the next targets
-        def _urls():
-            if self._next_requests:
-                yield self._next_requests.pop()
-        return list(_urls())
-
-
-if __name__ == '__main__':
-    from pomp.core.engine import Pomp
-
-    try:
-        from pomp.contrib.concurrenttools import ConcurrentUrllibDownloader \
-            as dnl
-    except ImportError:
-        from pomp.contrib import ThreadedDownloader as dnl
-
-    middlewares = [LXMLDownloaderMiddleware(encoding='utf-8')]
-
-    dict_pipe = DictCollectPipeline()
-    pomp = Pomp(
-        downloader=dnl(middlewares=middlewares, timeout=10),
-        pipelines=[
-            dict_pipe,
-            CsvPipeline('/tmp/friends.csv'),
-        ],
-    )
-
-    pomp.pump(LJFriendSpider())
-
-    print('Result:')
-    pprint.pprint(dict(dict_pipe.result))

examples/e01_pythonnews.py

+"""
+Extract python news from python.org
+"""
+import sys
+import re
+import logging
+from pomp.core.base import BaseCrawler, BasePipeline
+from pomp.core.item import Item, Field
+from pomp.contrib.urllibtools import UrllibDownloader
+
+
+logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
+news_re = re.compile(
+    r'<h2 class="news">(.*?)</h2>([\s\S]*?)<div class="pubdate">(.*?)</div>')
+
+
+class PythonNewsItem(Item):
+    title = Field()
+    published = Field()
+
+    def __repr__(self):
+        return '%s\n\t%s\n' % (
+            self.title,
+            self.published,
+        )
+
+
+class PythonNewsCrawler(BaseCrawler):
+    ENTRY_REQUESTS = 'http://python.org/news/'
+
+    def extract_items(self, response):
+        for i in news_re.findall(response.body.decode('utf-8')):
+            item = PythonNewsItem()
+            item.title, item.published = i[0], i[2]
+            yield item
+
+    def next_requests(self, response):
+        return None  # single-page crawler
+
+
+class PrintPipeline(BasePipeline):
+
+    def process(self, crawler, item):
+        print(item)
+
+
+if __name__ == '__main__':
+    from pomp.core.engine import Pomp
+
+    pomp = Pomp(
+        downloader=UrllibDownloader(),
+        pipelines=[PrintPipeline()],
+    )
+
+    pomp.pump(PythonNewsCrawler())
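
For a quick illustration (not part of the commit), the regex in e01_pythonnews.py expects markup shaped like the fragment below; the sample HTML is invented, a minimal sketch of the old python.org/news layout:

import re

news_re = re.compile(
    r'<h2 class="news">(.*?)</h2>([\s\S]*?)<div class="pubdate">(.*?)</div>')

# hypothetical fragment in the shape the crawler expects
sample = (
    '<h2 class="news">Python 3.3.0 released</h2>'
    '<p>What is new in this release ...</p>'
    '<div class="pubdate">2012-09-29</div>'
)

for title, _, published in news_re.findall(sample):
    print('%s (%s)' % (title, published))  # Python 3.3.0 released (2012-09-29)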

examples/e02_livejournal.py

+# -*- coding: utf-8 -*-
+"""
+    Friends of the LiveJournal user http://grrm.livejournal.com/
+
+    requires: lxml
+
+    stores CSV data to /tmp/friends.csv
+    prints the result dict
+
+"""
+import sys
+import logging
+import pprint
+from collections import defaultdict
+
+from lxml import html
+
+from pomp.core.base import BaseCrawler
+from pomp.contrib.pipelines import CsvPipeline
+from pomp.core.base import BasePipeline, BaseDownloaderMiddleware
+from pomp.contrib.urllibtools import UrllibHttpRequest
+from pomp.core.item import Item, Field
+
+
+logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
+
+
+class LXMLDownloaderMiddleware(BaseDownloaderMiddleware):
+
+    def __init__(self, encoding=None):
+        self.encoding = encoding
+
+    def process_request(self, request):
+        return request
+
+    def process_response(self, response):
+        if self.encoding:
+            response.tree = html.fromstring(
+                response.body.decode(self.encoding))
+        else:
+            response.tree = html.fromstring(response.body)
+        return response
+
+
+class DictCollectPipeline(BasePipeline):
+
+    def start(self, crawler):
+        self.result = defaultdict(lambda: {'friends': []})
+
+    def process(self, crawler, item):
+        self.result[item.friend_to]['friends'].append(item.username)
+        return item
+
+
+class FriendItem(Item):
+    username = Field()
+    friend_to = Field()
+
+    def __repr__(self):
+        return 'username: %s friend to:%s' % (
+            self.username,
+            self.friend_to,
+        )
+
+
+class FriendLevelRequest(UrllibHttpRequest):
+    def __init__(self, *args, **kwargs):
+        self.level = kwargs.get('level', 0)
+        self.username = kwargs['username']
+        # clean up kwargs before passing them to urllib
+        if 'level' in kwargs:
+            del kwargs['level']
+        if 'username' in kwargs:
+            del kwargs['username']
+        super(FriendLevelRequest, self).__init__(*args, **kwargs)
+
+
+class LJFriendSpider(BaseCrawler):
+    QS = '?socconns=pfriends&mode_full_socconns=1'
+    BASE_URL = 'http://{0}.livejournal.com/profile/friendlist' + QS
+
+    ENTRY_REQUESTS = (
+        FriendLevelRequest(
+            BASE_URL.format('grrm'),
+            username='grrm'
+        ),
+    )
+
+    FRIENDS_XPATH = '//dl[contains(@data-widget-options, "socconns")]' \
+                    '/dd[@class="b-profile-group-body"]' \
+                    '/div[@class="b-tabs-content"]/a'
+
+    def __init__(self, max_level=2, friends_limit=2):
+        """LiveJournal spider
+
+            :param max_level: maximum depth of friend pages to dive into
+            :param friends_limit: number of first friends on a page to follow
+        """
+        self.max_level = max_level
+        self.friend_limit = friends_limit
+        self._next_requests = []
+        super(LJFriendSpider, self).__init__()
+
+    def extract_items(self, response):
+        items = []
+        k = 0
+        # find item section
+        for i in response.tree.xpath(self.FRIENDS_XPATH):
+            item = FriendItem()
+            item.username = i.text
+
+            # associate the parsed user with the user from the parent request
+            item.friend_to = response.request.username
+            items.append(item)
+
+            # follow to item.username
+            if response.request.level < self.max_level \
+                    and k < self.friend_limit:
+                # generate new url to follow
+                url = i.get('href') + self.QS
+                self._next_requests.append(FriendLevelRequest(
+                    url,
+                    username=item.username,
+                    level=response.request.level + 1
+                ))
+                k += 1
+        return items
+
+    def next_requests(self, response):
+        # after items are extracted, pomp calls the next_requests
+        # method to get the next targets
+        def _urls():
+            if self._next_requests:
+                yield self._next_requests.pop()
+        return list(_urls())
+
+
+if __name__ == '__main__':
+    from pomp.core.engine import Pomp
+
+    try:
+        from pomp.contrib.concurrenttools import ConcurrentUrllibDownloader \
+            as dnl
+    except ImportError:
+        from pomp.contrib import ThreadedDownloader as dnl
+
+    middlewares = [LXMLDownloaderMiddleware(encoding='utf-8')]
+
+    dict_pipe = DictCollectPipeline()
+    pomp = Pomp(
+        downloader=dnl(middlewares=middlewares, timeout=10),
+        pipelines=[
+            dict_pipe,
+            CsvPipeline('/tmp/friends.csv'),
+        ],
+    )
+
+    pomp.pump(LJFriendSpider())
+
+    print('Result:')
+    pprint.pprint(dict(dict_pipe.result))
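
FriendLevelRequest above shows the pattern this example relies on: crawl state (recursion level and source username) rides along on the request object, so extract_items can read it back from response.request. A minimal standalone sketch of the same idea; StatefulRequest is a hypothetical stand-in for UrllibHttpRequest, and dict.pop is simply a tidier way to strip the extra kwargs than the del calls above:

class StatefulRequest(object):
    """Hypothetical stand-in for UrllibHttpRequest."""

    def __init__(self, url, **kwargs):
        # keep crawl state on the request object itself ...
        self.level = kwargs.pop('level', 0)
        self.username = kwargs.pop('username', None)
        # ... and keep only HTTP-level kwargs for the real request
        self.url = url
        self.kwargs = kwargs


parent = StatefulRequest(
    'http://grrm.livejournal.com/profile/friendlist', username='grrm')
child = StatefulRequest(
    parent.url,  # a real crawler would use the friend's profile URL here
    username='somefriend', level=parent.level + 1)
print('%d %s' % (child.level, child.username))  # 1 somefriend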
tox.ini

 commands=
     python -c"import sys, subprocess; sys.exit(subprocess.call('python docs/examples/minimalapp.py', shell=True, stdout=subprocess.PIPE))"
     python -c"import sys, subprocess; sys.exit(subprocess.call('python docs/examples/customdowloader.py', shell=True, stdout=subprocess.PIPE))"
-    python -c"import sys, subprocess; sys.exit(subprocess.call('python examples/01_pythonnews.py', shell=True, stdout=subprocess.PIPE))"
-    python -c"import sys, subprocess; sys.exit(subprocess.call('python examples/02_livejournal.py', shell=True, stdout=subprocess.PIPE))"
+    python -c"import sys, subprocess; sys.exit(subprocess.call('python examples/e01_pythonnews.py', shell=True, stdout=subprocess.PIPE))"
+    python -c"import sys, subprocess; sys.exit(subprocess.call('python examples/e02_livejournal.py', shell=True, stdout=subprocess.PIPE))"
 
 [testenv:docs]
 basepython=python
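
Each of the tox commands above wraps an example in subprocess.call so that the example's stdout is discarded (stdout=subprocess.PIPE) while a non-zero exit status still fails the build via sys.exit. Unrolled into a plain script, each one-liner is roughly:

import subprocess
import sys

# run the example, swallow its output, propagate its exit status to tox
ret = subprocess.call(
    'python examples/e01_pythonnews.py',
    shell=True, stdout=subprocess.PIPE,
)
sys.exit(ret)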