Commits

Lammert Hilarides committed 3240619

Scraped items can now be accessed in a feed (feeds.py added); get_or_create bug squashed in the spiders

  • Parent commits e23549e


Files changed (6)

File alt_scraper/admin.py

-from alt_scraper.models import ScheepsbouwNL, Hme
+from alt_scraper.models import Scraper, ScraperItem
 from django.contrib import admin
 
-admin.site.register(ScheepsbouwNL)
-admin.site.register(Hme)
+admin.site.register(Scraper)
+admin.site.register(ScraperItem)
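
Since both new models are registered with the stock admin, a ModelAdmin could make the item changelist easier to scan. A sketch of one possible refinement (the list_display/list_filter choices are only a suggestion, not something this commit does):

    from django.contrib import admin
    from alt_scraper.models import Scraper, ScraperItem


    class ScraperItemAdmin(admin.ModelAdmin):
        # show the feed-relevant columns and allow filtering per scraper
        list_display = ('title', 'scraper', 'pubdate', 'link')
        list_filter = ('scraper',)

    admin.site.register(Scraper)
    admin.site.register(ScraperItem, ScraperItemAdmin)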

File alt_scraper/feeds.py

+from django.contrib.syndication.views import Feed, FeedDoesNotExist
+from django.shortcuts import get_object_or_404
+from alt_scraper.models import Scraper, ScraperItem
+
+
+class ScraperFeed(Feed):
+    description = 'omschrijvinkje'  # placeholder feed description (Dutch for "little description")
+    link = '/link/'
+
+    def get_object(self, request, scraper_id):
+        return get_object_or_404(Scraper, pk=scraper_id)
+
+    def title(self, obj):
+        return obj.title
+
+    def items(self, obj):
+        return ScraperItem.objects.filter(scraper=obj).order_by('-pubdate')[:10]
+
+    def item_link(self, item):
+        return item.link
+
+    def item_description(self, item):
+        return item.description
+
+    def item_pubdate(self, item):
+        return item.pubdate
+
+    def item_title(self, item):
+        return item.title
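
With the URL pattern added in alt_scraper/urls.py below, the feed can be sanity-checked from a manage.py shell using Django's test client (the pk 1 here is only an example):

    from django.test.client import Client

    client = Client()
    response = client.get('/scraper/1/rss/')   # any existing Scraper pk works here
    print response.status_code                 # 200 if the Scraper exists, 404 otherwise
    print response.content[:200]               # start of the RSS XML rendered by the Feed class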

File alt_scraper/management/commands/scrape_hme.py

 from django.core.management.base import BaseCommand
-from alt_scraper.models import Hme
+from django.utils import timezone
+from django.utils.encoding import smart_unicode
+
+from alt_scraper.models import Scraper, ScraperItem
 
 import re
 import requests
 import lxml
 from lxml import html
-import time, datetime
 
-from django.utils.encoding import smart_unicode
-from pprint import pprint
 
 class Command(BaseCommand):
     help = 'Scrapes the HME site for news items'
 
     def handle(self, *args, **options):
-        self.stdout.write('\nScraping started at %s\n' % str(datetime.datetime.now()))
+
+        scraper, created = Scraper.objects.get_or_create(title='HME')
+        self.stdout.write('\nScraping %s started at %s\n' % (scraper.title, str(timezone.localtime(timezone.now()))))
         news_url = 'http://www.hme.nl/category/news/'
         r = requests.get(news_url)
         page = lxml.html.fromstring(r.content)
+
             # Find the correct html elements via lxml cssselector (or xpath if you prefer)
-        for article in page.cssselect('article'):
+        # walk the articles in reverse document order so the oldest unseen item is saved first;
+        # sorted() on lxml elements has no meaningful order, so use reversed() instead
+        articles = list(reversed(page.cssselect('article')))
+        for article in articles:
                # unicode encode the item url --> (css select 1st <a> and extract href attribute)
-            url = smart_unicode(article.cssselect('a')[0].get('href'))
+            link = smart_unicode(article.cssselect('a')[0].get('href'))
             title = smart_unicode(article.cssselect('a')[0].text_content().strip())
             description = article.cssselect('div p')[0].text_content().strip()
-            if not Hme.objects.filter(url=url):
-                item = Hme(url=url, title=title[:255], description=description[:1024])
+
+            if not ScraperItem.objects.filter(link=link).exists():
+                item = ScraperItem(scraper=scraper, link=link, title=title[:255], description=description[:1024], pubdate=timezone.localtime(timezone.now()))
                 item.save()
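
The command is intended to be run periodically; besides python manage.py scrape_hme, it can be driven from code with Django's call_command, for example from a scheduler hook or a test. A minimal sketch, assuming the project settings are already configured:

    from django.core.management import call_command

    # runs the spider exactly as "python manage.py scrape_hme" would;
    # Scraper.objects.get_or_create ensures the 'HME' row is created once and then reused
    call_command('scrape_hme')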

File alt_scraper/management/commands/scrape_scheepsbouw_nl.py

 from django.core.management.base import BaseCommand
-from alt_scraper.models import ScheepsbouwNL
+from django.utils import timezone
+from django.utils.encoding import smart_unicode
+
+from alt_scraper.models import Scraper, ScraperItem
 
 import requests
 import lxml
 from lxml import html
-import time, datetime
+#import time, datetime
 
-from django.utils.encoding import smart_unicode
-#from pprint import pprint
 
 class Command(BaseCommand):
     help = 'Scrapes the Scheepsbouw NL site for news items'
 
     def handle(self, *args, **options):
+
+        scraper, created = Scraper.objects.get_or_create(title='Scheepsbouw NL')
+        self.stdout.write('\nScraping %s started at %s\n' % (scraper.title, str(timezone.localtime(timezone.now()))))
         base_url = 'http://www.scheepsbouw.nl'
         news_url = 'http://www.scheepsbouw.nl/Nieuws/Nieuws_actueel'
         r = requests.get(news_url)
         page = lxml.html.fromstring(r.content)
+
             # Find the correct html elements via lxml cssselector (or xpath if you prefer)
         for li in page.cssselect('ul li[class=item]'):
                 # concat base_url to unicode encoded relative url --> (css select 1st <a> with class item_title and extract href attribute)
-            url = base_url + smart_unicode(li.cssselect('a[class=item_title]')[0].get('href'))
-            #pprint(url)
+            link = base_url + smart_unicode(li.cssselect('a[class=item_title]')[0].get('href'))
             title = smart_unicode(li.cssselect('a[class=item_title]')[0].text_content().strip())
-            #pprint(title)
             description = smart_unicode(li.cssselect('div[class=item_description]')[0].text_content().strip())
-            #pprint(description)
-            if not ScheepsbouwNL.objects.filter(url=url):
-                item = ScheepsbouwNL(url=url, title=title[:255], description=description[:1024])
+
+            if not ScraperItem.objects.filter(link=link).exists():
+                item = ScraperItem(scraper=scraper, link=link, title=title[:255], description=description[:1024], pubdate=timezone.localtime(timezone.now()))
                 item.save()
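
Both spiders guard against duplicates with a filter() check before save(). A slightly more compact variant would reuse get_or_create on ScraperItem as well, mirroring the Scraper fix; a sketch with the same truncation limits:

    item, created = ScraperItem.objects.get_or_create(
        link=link,
        defaults={
            'scraper': scraper,
            'title': title[:255],
            'description': description[:1024],
            'pubdate': timezone.localtime(timezone.now()),
        })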

File alt_scraper/models.py

 from django.db import models
 
-# Create your models here.
 
-class ScheepsbouwNL(models.Model):
-    title = models.CharField(max_length=255)
-        # We will check for existing articles by comparing URL, so indexed field
-    url = models.URLField(max_length=255, db_index=True)
-    description = models.TextField(max_length=1024)
+class Scraper(models.Model):
+
+    title = models.CharField(max_length=255, unique=True)
     def __unicode__(self):
         return self.title
 
-class Hme(models.Model):
+
+class ScraperItem(models.Model):
+
+    scraper = models.ForeignKey('Scraper')
     title = models.CharField(max_length=255)
-        # We will check for existing articles by comparing URL, so indexed field
-    url = models.URLField(max_length=255, db_index=True)
     description = models.TextField(max_length=1024)
+    link = models.URLField(max_length=255, db_index=True)
+    pubdate = models.DateTimeField()
     def __unicode__(self):
         return self.title
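
With the two per-site models folded into the generic Scraper/ScraperItem pair, the relation can be exercised from the Django shell; the 'HME' title below matches the value the HME spider passes to get_or_create:

    from alt_scraper.models import Scraper, ScraperItem

    scraper = Scraper.objects.get(title='HME')   # assumes the HME spider has run at least once
    latest = ScraperItem.objects.filter(scraper=scraper).order_by('-pubdate')[:10]
    for item in latest:
        print item.pubdate, item.title
    # the reverse relation is also available as scraper.scraperitem_set.all()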

File alt_scraper/urls.py

 from django.conf.urls import patterns, include, url
+from alt_scraper.feeds import ScraperFeed
 
 # Uncomment the next two lines to enable the admin:
 from django.contrib import admin
     # Examples:
     # url(r'^$', 'alt_scraper.views.home', name='home'),
     # url(r'^alt_scraper/', include('alt_scraper.foo.urls')),
+    url(r'^scraper/(?P<scraper_id>\d+)/rss/$', ScraperFeed()),
+    #(r'^scraper/rss/', ScraperFeed()),
 
     # Uncomment the admin/doc line below to enable admin documentation:
     url(r'^admin/doc/', include('django.contrib.admindocs.urls')),
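
If the feed pattern is later given a name (the 'scraper_feed' name below is hypothetical, not in this diff), the feed URL can be built with reverse() instead of hard-coding it:

    from django.core.urlresolvers import reverse

    # 'scraper_feed' is a hypothetical name for the pattern registered above
    feed_url = reverse('scraper_feed', kwargs={'scraper_id': scraper.pk})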