Chris Grubbs avatar Chris Grubbs committed f545106

adding scrapy project for recipe gathering

Comments (0)

Files changed (7)

spider/scrapy.cfg

+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# http://doc.scrapy.org/topics/scrapyd.html
+
+[settings]
+default = spider.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = spider
Add a comment to this file

spider/spider/__init__.py

Empty file added.

spider/spider/items.py

+from scrapy.item import Item, Field
+
class RawRecipeItem(Item):
    """A recipe scraped from the web, before any cleaning or parsing.

    NOTE(review): the spider (sk_spider.py) populates every field as a
    single-element list, so downstream consumers should expect lists.
    """
    name = Field()         # recipe title, title-cased by the spider
    url = Field()          # URL of the page the recipe was scraped from
    description = Field()  # prose that precedes the recipe proper
    content = Field()      # the recipe body (ingredients + instructions)

spider/spider/pipelines.py

+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/topics/item-pipeline.html
+
class SpiderPipeline(object):
    """Default no-op pipeline: passes every item through unchanged.

    Scrapy calls ``process_item`` for each scraped item; returning the
    item (rather than raising ``DropItem``) keeps it in the pipeline.
    """

    def process_item(self, item, spider):
        """Return *item* untouched; *spider* is unused."""
        return item

spider/spider/settings.py

# Scrapy settings for the "spider" project.
#
# Only the essential settings are listed here; the full catalogue is
# documented at http://doc.scrapy.org/topics/settings.html

BOT_NAME = 'spider'
BOT_VERSION = '1.0'

SPIDER_MODULES = ['spider.spiders']
NEWSPIDER_MODULE = 'spider.spiders'
# Advertise the bot as "<name>/<version>" in the User-Agent header.
USER_AGENT = '{0}/{1}'.format(BOT_NAME, BOT_VERSION)

# Be polite: pause one second between successive requests.
DOWNLOAD_DELAY = 1

spider/spider/spiders/__init__.py

+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

spider/spider/spiders/sk_spider.py

+from scrapy.contrib.spiders import CrawlSpider, Rule
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.selector import HtmlXPathSelector
+
+from spider.items import RawRecipeItem
+
+import difflib
+
+
class SmittenKitchenSpider(CrawlSpider):
    """Crawl smittenkitchen.com blog posts and emit RawRecipeItems.

    Follows links matching the blog-post URL pattern from the recipe
    index page, skipping the print/email variants of each post.
    """

    name = "smittenkitchen"
    allowed_domains = ["smittenkitchen.com"]
    start_urls = [
        "http://smittenkitchen.com/recipes/"
    ]

    rules = (
        # Raw string so the backslash in \w is unambiguous to the regex engine.
        Rule(SgmlLinkExtractor(allow=r"/blog/[0-9]+/[0-9]+/[\w/-]+$"),
             process_links='filter_links', callback="parse_item"),
    )

    def filter_links(self, urls):
        """Drop print/email variants of each post; keep every other link."""
        return [link for link in urls
                if not (link.url.endswith('print/')
                        or link.url.endswith('email/'))]

    def parse_item(self, response):
        """Build a RawRecipeItem from a single blog-post response.

        Returns the item with each field wrapped in a one-element list
        (the format the rest of the project expects).
        """
        self.log('Found recipe at: {}'.format(response.url))

        hxs = HtmlXPathSelector(response)
        # Guard against pages with no <title>; original code raised
        # IndexError on extract()[0] in that case.
        titles = hxs.select('//head/title/text()').extract()
        title = (titles[0].strip().split('|')[0].strip().title()
                 if titles else '')

        entry = hxs.select('//div[@class="entry"]')
        content_text = ''.join(
            node.extract_unquoted() for node in entry.select('.//text()'))

        description, content = self._split_description_content(
            title, content_text)

        recipe_item = RawRecipeItem()
        recipe_item['name'] = [title]
        recipe_item['url'] = [response.url]
        recipe_item['description'] = [description]
        recipe_item['content'] = [content]
        return recipe_item

    @staticmethod
    def _split_description_content(title, content_text):
        """Split post text into (description, recipe content).

        The first paragraph fuzzily matching *title* (SequenceMatcher
        ratio > 0.5) marks the boundary; everything before it is
        description, everything after is recipe content.  Blank lines
        and "...year ago:" cross-links are dropped; a "see more:" line
        ends the content.  Returns two newline-joined strings.
        """
        description = []
        content = []
        seen_title = False
        for paragraph in content_text.split('\n'):
            lowered = paragraph.lower()
            if not paragraph.strip() or 'year ago:' in lowered:
                continue
            # "see more" links mark the end of the actual post content.
            if 'see more:' in lowered:
                break
            if seen_title:
                content.append(paragraph)
                continue
            matcher = difflib.SequenceMatcher(None, title.lower(), lowered)
            if matcher.ratio() > .5:
                # Title line itself is neither description nor content.
                seen_title = True
                continue
            description.append(paragraph)
        return '\n'.join(description), '\n'.join(content)
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use the ↑ and ↓ arrow keys to navigate between files, and press Return to view the selected file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.