Commits

Lammert Hilarides committed 0c8d1d8 Draft

Scraper configuration abstracted into the DB instead of separate per-site files. A single scrape.py management command reads each scraper's config from the DB.

Comments (0)

Files changed (3)

-from scraper.models import Scraper, ScraperItem
+from scraper.models import Scraper, ScraperItem, ScraperLinkSetting, ScraperDescriptionSetting, ScraperTitleSetting, ScraperItemListSettings
 from django.contrib import admin
 
 
class ScraperItemListInline(admin.TabularInline):
    """Tabular inline for editing a scraper's item-list CSS selectors."""
    model = ScraperItemListSettings
    extra = 0
+
class ScraperTitleInline(ScraperItemListInline):
    """Inline for title selectors; inherits layout (extra=0) from the list inline."""
    model = ScraperTitleSetting
+
class ScraperLinkInline(ScraperItemListInline):
    """Inline for link selectors; inherits layout (extra=0) from the list inline."""
    model = ScraperLinkSetting
+
+
class ScraperDescriptionInline(ScraperItemListInline):
    """Inline for description selectors; inherits layout (extra=0) from the list inline."""
    model = ScraperDescriptionSetting
+
+
 class ScraperAdmin(admin.ModelAdmin):
-    pass
+    inlines = [ScraperItemListInline, ScraperTitleInline, ScraperLinkInline, ScraperDescriptionInline]
 
# Expose Scraper in the admin with its inline selector configuration.
admin.site.register(Scraper, ScraperAdmin)
 
 
 class ScraperItemAdmin(admin.ModelAdmin):

scraper/management/commands/scrape.py

+from django.core.management.base import BaseCommand
+from django.utils import timezone
+from django.utils.encoding import smart_unicode
+
+from scraper.models import *
+
+import requests
+import lxml
+from lxml import html
+from urlparse import urlparse
+
+
class Command(BaseCommand):
    """Scrape every active Scraper's page and store new news items.

    For each active ``Scraper`` the page at ``page_url`` is fetched, the
    configured CSS selectors are applied (item-list selectors to find the
    elements, then link/title/description selectors tried in order until
    one matches per element), and previously unseen items — keyed by their
    link, which doubles as the GUID — are saved as ``ScraperItem`` rows.
    """
    help = 'Scrapes configured site for news items'

    def handle(self, *args, **options):
        for scraper in Scraper.objects.filter(is_active=True):
            self.stdout.write('\nScraping %s started at %s\n'
                              % (scraper.title,
                                 str(timezone.localtime(timezone.now()))))
            r = requests.get(scraper.page_url)
            page = lxml.html.fromstring(r.content)

            # Rebuild "scheme://netloc" for resolving root-relative links.
            # (Fix: the scheme/netloc were previously concatenated without
            # the "://" separator, yielding e.g. "httpwww.example.com".)
            bits = urlparse(scraper.page_url)
            base_url = bits[0] + '://' + bits[1]

            # Loop over the item-list selectors configured for this page.
            for item_list in scraper.scraperitemlistsettings_items.all():
                # Walk matched elements bottom-up because the newest item
                # is usually at the top of the page.  (Fix: sorted(...,
                # reverse=True) on lxml elements gave arbitrary ordering;
                # reversed() reverses document order as intended.)
                for element in reversed(page.cssselect(item_list.selector)):
                    link = self._first_match(
                        scraper.scraperlinksetting_items.all(), element,
                        lambda el: el.get('href'))
                    if link is None:
                        # No link means no GUID — nothing we can store.
                        continue

                    if not link.startswith('http'):
                        # Resolve relative URLs: root-relative against the
                        # site root, otherwise against the page URL.
                        if link.startswith('/'):
                            link = base_url + link
                        else:
                            link = scraper.page_url + link

                    # Missing title/description degrade to empty strings
                    # instead of crashing on None slicing below.
                    title = self._first_match(
                        scraper.scrapertitlesetting_items.all(), element,
                        lambda el: el.text_content().strip()) or u''
                    description = self._first_match(
                        scraper.scraperdescriptionsetting_items.all(), element,
                        lambda el: el.text_content().strip()) or u''

                    # Only store items we have not seen before.
                    if not scraper.items.filter(guid=link).exists():
                        ScraperItem(
                            scraper=scraper, guid=link, link=link,
                            title=title[:255],
                            description=description[:1024],
                            pubdate=timezone.localtime(timezone.now())).save()

    def _first_match(self, settings, element, extract):
        """Try each setting's CSS selector on *element* in priority order.

        Returns the first successfully extracted value as unicode, or
        ``None`` when no selector matched or nothing could be extracted
        (e.g. an anchor without an ``href`` attribute).
        """
        for option in settings:
            try:
                value = extract(element.cssselect(option.selector)[0])
            except IndexError:
                # Selector matched nothing inside this element; try next.
                continue
            if value is not None:
                return smart_unicode(value)
        return None
 
 class Scraper(models.Model):
     title = models.CharField(_("title"), max_length=255, unique=True)
-    
+    is_active = models.BooleanField(_("is active"), default=True, db_index=True)
+    page_url = models.URLField(_("news url"), max_length=255)
+
     class Meta:
         verbose_name = _("scraper")
         verbose_name_plural = _("scrapers")
         return self.title
 
 
class BaseScraperSetting(models.Model):
    """Abstract base for a prioritised CSS selector attached to a Scraper.

    Concrete subclasses hold the selectors for item lists, titles, links
    and descriptions.  Selectors are tried in ascending ``order``.
    """
    # %(class)s gives each concrete subclass its own reverse accessor,
    # e.g. scraper.scraperlinksetting_items.
    scraper = models.ForeignKey(Scraper, related_name="%(class)s_items")
    selector = models.CharField(_("css selector"), max_length=255)
    order = models.PositiveIntegerField(_("Order"), db_index=True, default=0)

    class Meta:
        abstract = True
        # No duplicate priorities or duplicate selectors per scraper.
        unique_together = (
            ("scraper", "order"),
            ("scraper", "selector"),
        )
        ordering = ["order"]

    # Hoisted here from the four concrete subclasses, which each repeated
    # an identical __unicode__; subclasses may still override it.
    def __unicode__(self):
        return u"%s" % self.selector
+
class ScraperItemListSettings(BaseScraperSetting):
    """Selector locating the list of news items on the scraped page."""

    class Meta(BaseScraperSetting.Meta):
        verbose_name = _("scraper item setting")
        verbose_name_plural = _("scraper item settings")

    def __unicode__(self):
        return unicode(self.selector)
+
+
class ScraperTitleSetting(BaseScraperSetting):
    """Selector extracting an item's title text."""

    class Meta(BaseScraperSetting.Meta):
        verbose_name = _("scraper title setting")
        verbose_name_plural = _("scraper title settings")

    def __unicode__(self):
        return unicode(self.selector)
+
+
class ScraperLinkSetting(BaseScraperSetting):
    """Selector extracting an item's link (href)."""

    class Meta(BaseScraperSetting.Meta):
        verbose_name = _("scraper link setting")
        verbose_name_plural = _("scraper link settings")

    def __unicode__(self):
        return unicode(self.selector)
+
+
class ScraperDescriptionSetting(BaseScraperSetting):
    """Selector extracting an item's description text."""

    class Meta(BaseScraperSetting.Meta):
        verbose_name = _("scraper description setting")
        verbose_name_plural = _("scraper description settings")

    def __unicode__(self):
        return unicode(self.selector)
+
+
+
 class ScraperItem(models.Model):
     scraper = models.ForeignKey(Scraper, related_name="items")
-    guid = models.CharField(_("GUID"), max_length=255, db_index=True, 
+    guid = models.CharField(_("GUID"), max_length=255, db_index=True,
                             help_text=_("unique identifier per scraper"))
     title = models.CharField(_("title"), max_length=255, db_index=True)
     description = models.TextField(_("description"))
     link = models.URLField(_("link"), max_length=255)
     pubdate = models.DateTimeField(_("publication date"), db_index=True)
-    
+
     class Meta:
         verbose_name = _("scraper item")
         verbose_name_plural = _("scraper items")