Source: magic_utilities / scraper.py

# coding: utf-8
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import re
import os
import json
import threading
import urllib

from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned

from django.core.management import setup_environ
import settings

setup_environ(settings)
from mtg.models import Card, Edition

class ScrapeEngine():
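    # Scrapes edition and card data from magiccards.info into the mtg Card and
    # Edition models, downloading card artwork along the way.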
    def __init__(self):
        self.base_url = 'http://magiccards.info'
        # Browser
        self.br = mechanize.Browser()

        # Cookie Jar
        self.cj = cookielib.LWPCookieJar()
        self.br.set_cookiejar(self.cj)

        # Browser options
        self.br.set_handle_equiv(True)
        self.br.set_handle_gzip(True)
        self.br.set_handle_redirect(True)
        self.br.set_handle_referer(True)
        self.br.set_handle_robots(False)

        # Follow refresh 0 but don't hang on refresh > 0
        self.br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
        # Set both headers in one list; assigning twice would overwrite the first.
        self.br.addheaders = [
            ('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'),
            ('Referer', 'http://google.com'),
        ]

    def scrape(self):
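        # Sync the edition list from the sitemap, then scrape every edition that
        # has not yet been marked complete.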
        self.categories = self.get_categories()
        for edition_data in self.categories:
            try:
                edition = Edition.objects.get(name=edition_data['name'], url=edition_data['url'], category=edition_data['category'])
                print 'Edition %s found in database.' % edition_data['name']
            except ObjectDoesNotExist:
                edition = Edition(name=edition_data['name'], url=edition_data['url'], category=edition_data['category'])
                edition.save()
                print 'Edition %s is new and will be added to the database.' % edition_data['name']
        editions = Edition.objects.filter(completed=False).all()
        for edition in editions:
            print 'Edition %s is not complete and will be scraped.' % edition.name
            # We partially scrape cards from the edition list.
            self.get_cards_from_edition(edition)
            # Now we fill them in if needed.
            self.update_cards_from_edition(edition)
            edition.completed = True
            edition.save()

        self.join_double_sided_cards()
        self.fix_card_types()

    def grab_url(self, url):
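        # Fetch a page through the mechanize browser and return it as a parsed soup.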
        r = self.br.open(url)
        html = r.read()
        soup = BeautifulSoup(html)
        return soup

    def update_cards_from_edition(self, edition):
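        # Visit each partially scraped card's detail page and fill in rules text,
        # flavor text, rulings, legalities, converted cost and artwork.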
        cards = Card.objects.filter(completed=False, edition=edition).all()
        for card in cards:
            full_url = self.base_url + card.url
            soup = self.grab_url(full_url)
            table = soup.findAll('table')[3]
            try:
                abilities = []
                ability_text = table.find("p", { "class" : "ctext" }).find('b').contents
                for ability in ability_text:
                    try:
                        if hasattr(ability, 'encode'):
                            abilities.append(ability.encode('utf-8'))
                    except TypeError:
                        pass
            except AttributeError:
                # The card page has no rules text (e.g. vanilla creatures).
                abilities = None

            card.abilities = abilities
            card.oracle_text = '\n'.join(abilities) if abilities else ''

            uls = table.findAll('ul')
            flavor_text = table.findAll('i')
            if len(flavor_text):
                card.flavor_text = flavor_text[0].text

            if len(uls) > 1:
                card.rulings = json.dumps([ruling.text for ruling in uls[0].findAll('li')])
                card.legalities = json.dumps([legality.text for legality in uls[1].findAll('li')])
            elif uls:
                card.legalities = json.dumps([legality.text for legality in uls[0].findAll('li')])

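            # Approximate the converted mana cost: every non-numeric symbol except
            # 'X' counts as one, and the generic (numeric) part is added directly.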
            if card.cost:
                cost = re.sub(r'\{[^}]*\}', 'O', card.cost)
                converted_cost = 0
                for char in cost:
                    # Each non-numeric symbol other than 'X' adds one.
                    if not char.isdigit() and char.capitalize() != 'X':
                        converted_cost += 1
                numeric_value = re.sub('[^0-9]', '', cost)
                if numeric_value:
                    converted_cost += int(numeric_value)
                card.converted_cost = converted_cost

            card_image = table.find("img", { 'style': re.compile("solid black") })
            try:
                card.image_url = card_image['src'].replace(self.base_url, '')

                thread = threading.Thread(target=self.save_image, args=(card, edition.name))
                thread.start()
            except TypeError:
                card.image_url = None

            card.completed = True
            card.save()
            print "%s: %s has been completely scraped." % (edition.name, card.name)

    def make_directories_if_needed(self, filename):
        directory = os.path.dirname(filename)
        if not os.path.exists(directory):
            os.makedirs(directory)

    def fix_card_types(self):
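        # Some creature cards were stored with the full type line (including
        # power/toughness) in Card.type; split that back out into type,
        # sub_types, power and toughness.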
        def fix_cards(cards):
            for card in cards:
                sub_type_split_string = card.type.replace(u' — ', ' ').split(' ')
                power_toughness = sub_type_split_string[-1]
                pt_list = power_toughness.split('/')
                power = pt_list[0]
                toughness = pt_list[1]
                if sub_type_split_string[0] == 'Creature':
                    sub_type_list = sub_type_split_string[1:-1]
                    sub_types = ' '.join(sub_type_list)
                else:
                    sub_type_list = sub_type_split_string[:-1]
                    sub_types = ' '.join(sub_type_list).replace('Creature ', '').replace('Summon ', '')
                card.type = 'Creature'
                card.sub_types = sub_types
                card.power = power
                card.toughness = toughness
                card.save()

        fix_cards(Card.objects.filter(type__startswith='Summon ').all())
        fix_cards(Card.objects.filter(type__contains=u'Creature — ').all())

    def join_double_sided_cards(self):
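        # Double-faced cards are scraped as two rows whose mci_ids end in 'a' and
        # 'b'; link each pair through the other_side field.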
        cards = Card.objects.filter(mci_id__endswith='a', other_side=None).all()
        for card in cards:
            # Swap only the trailing 'a' for 'b' so other characters aren't touched.
            other_side = Card.objects.get(mci_id=card.mci_id[:-1] + 'b', edition=card.edition)
            card.other_side = other_side.id
            other_side.other_side = card.id
            card.save()
            other_side.save()
            print "%s is joined with %s" % (card.name, other_side.name)

    def save_image(self, card, edition_name):
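        # Download the card scan next to this script, mirroring the site's image path.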
        base_path = os.path.dirname(os.path.realpath(__file__))
        save_as = os.path.normpath(base_path + card.image_url)
        self.make_directories_if_needed(save_as)
        urllib.urlretrieve(self.base_url + card.image_url, save_as)
        print "Downloaded artwork for: %s: %s" % (edition_name, card.name)

    def get_cards_from_edition(self, edition):
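        # Parse the edition's spoiler-list table and create a partial Card row for
        # every card that isn't already in the database.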
        soup = self.grab_url(edition.url)
        try:
            table = soup.findAll('table')[3]
        except IndexError:
            table = soup.findAll('table')[2]

        card_rows = table.findAll('tr')
        for card_row in card_rows:
            card_columns = card_row.findAll('td')
            if len(card_columns) == 7:
                mci_id = card_columns[0].contents[0]
                name_url = card_columns[1].find('a')
                name = name_url.text
                url = name_url['href']
                type_string = card_columns[2].contents[0]
                sub_types = None
                power = None
                toughness = None
                try:
                    cost = card_columns[3].contents[0]
                except IndexError:
                    cost = None

                rarity = card_columns[4].contents[0]
                illustrator = card_columns[5].contents[0]

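                # Only the plain-hyphen separator is handled here; type lines joined
                # with an em dash are stored whole and split later by fix_card_types().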
                if ' - ' in type_string:
                    # We have to split the string to parse the subtypes
                    split_type = type_string.split(' - ')
                    sub_type_string = split_type[1]
                    card_type = split_type[0]
                    if '/' in sub_type_string:
                        # Creature
                        sub_type_split_string = sub_type_string.split(' ')
                        power_toughness = sub_type_split_string[-1]
                        pt_list = power_toughness.split('/')
                        power = pt_list[0]
                        toughness = pt_list[1]
                        sub_type_list = sub_type_split_string[:-1]
                        sub_types = ' '.join(sub_type_list)
                    else:
                        # Not a creature
                        sub_types = sub_type_string
                else:
                    # Basic type with no subtype.
                    card_type = type_string
                try:
                    card = Card.objects.get(name=name, edition=edition, url=url)
                    print "%s: %s is already added." % (edition.name, card.name)
                except ObjectDoesNotExist:
                    card = Card(mci_id=mci_id, name=name, url=url, type=card_type,
                                sub_types=sub_types, power=power, toughness=toughness,
                                completed=False, cost=cost, rarity=rarity, illustrator=illustrator,
                                edition=edition)
                    card.save()
                    print "%s: %s partially saved." % (edition.name, card.name)


    def get_categories(self):
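        # Scrape the sitemap for every edition, grouped by category, and return a
        # list of {'category', 'name', 'url'} dicts.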
        soup = self.grab_url('http://magiccards.info/sitemap.html')
        table = soup.findAll('table')[1]
        urls = []
        for h3 in table.findAll('h3'):
            category_urls = h3.findNext('ul').findAll('a')
            for a in category_urls:
                urls.append({
                    'category': h3.text,
                    'name': a.text,
                    'url': self.base_url + a['href']
                })
        return urls

if __name__ == '__main__':
    engine = ScrapeEngine()
    engine.scrape()