pomp / examples /

"""Extract Python news items (title and publication date) from a downloaded news page."""
import sys
import re
import logging
from pomp.core.base import BaseCrawler, BasePipeline
from pomp.core.item import Item, Field
from pomp.contrib.urllibtools import UrllibDownloader

# Route all log records, DEBUG and above, to standard output.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

# One match per news entry. The three captured groups are, in order:
# the heading text, everything between the heading and the date block,
# and the publication date text. `[\s\S]*?` is used for the middle group
# because `.` does not match newlines without re.DOTALL.
news_re = re.compile(
    r'<h2 class="news">(.*?)</h2>'
    r'([\s\S]*?)'
    r'<div class="pubdate">(.*?)</div>'
)

class PythonNewsItem(Item):
    """A single news entry with a title and a publication date."""

    title = Field()
    published = Field()

    def __repr__(self):
        """Return the title followed by the publication date on an indented line."""
        # NOTE(review): the argument tuple was cut off in this copy of the
        # file; it is reconstructed from the two `%s` placeholders and the
        # two declared fields — confirm against the upstream example.
        return '%s\n\t%s\n' % (self.title, self.published)

class PythonNewsCrawler(BaseCrawler):
    """Crawler that converts `news_re` matches in a response into news items."""

    # NOTE(review): no entry request / start URL is defined in the visible
    # code — confirm where the crawler's first request comes from.

    def extract_items(self, response):
        """Yield one PythonNewsItem per `news_re` match in the response body.

        The body is decoded as UTF-8 before matching. The middle capture
        group (the text between heading and date) is not stored on the item.
        """
        page = response.body.decode('utf-8')
        for title, _between, published in news_re.findall(page):
            news = PythonNewsItem()
            news.title = title
            news.published = published
            yield news

    def next_requests(self, response):
        """Return None: this crawler never follows up with further requests."""
        return None

class PrintPipeline(BasePipeline):
    """Pipeline stage that writes each extracted item to standard output."""

    def process(self, crawler, item):
        """Print `item` and hand it back unchanged.

        :param crawler: the crawler that produced the item (unused here)
        :param item: the extracted item to print
        :return: the same `item`
        """
        # NOTE(review): the method body was missing in this copy of the file
        # (a syntax error). Printing is inferred from the class name, and the
        # item is returned so later pipeline stages, if any, still receive it
        # — confirm against the upstream example.
        print(item)
        return item

if __name__ == '__main__':
    from pomp.core.engine import Pomp

    # NOTE(review): the `Pomp(...)` call was cut off in this copy of the file
    # (unclosed parenthesis). The arguments are reconstructed from the names
    # this module imports or defines but never otherwise uses
    # (`UrllibDownloader`, `PrintPipeline`, `PythonNewsCrawler`) — confirm
    # against the pomp engine API.
    pomp = Pomp(
        downloader=UrllibDownloader(),
        pipelines=[PrintPipeline()],
    )

    # Run the crawl; extracted items flow through the pipelines above.
    pomp.pump(PythonNewsCrawler())

Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.