Source

scatter-share / demos / akara / amara / nyt2skos.py

Full commit
#!/usr/bin/env python

"""
Based on http://inkdroid.org/journal/2009/08/18/new-york-times-topics-as-skos/

Scrape the topics out of New York Times and emit them as SKOS.

The program GETs index pages with URLs like:

  http://topics.nytimes.com/top/reference/timestopics/subjects/[A-Z]/index.html

scrapes the topic links out of them, and writes the SKOS data as RDF/XML to stdout.
"""

import sys
import re
from string import lowercase, uppercase

#import amara
from amara.bindery import html
from amara.namespaces import SKOS_NAMESPACE, RDF_NAMESPACE
from amara.lib.util import first_item
from amara.writers.struct import structwriter, E, NS, ROOT, RAW, E_CURSOR

# Namespace and concept URIs used throughout the generated graph.

TOPICS_NAMESPACE = u'http://topics.nytimes.com/top/reference/timestopics/'
# Parent concepts that person / organization topics point at via skos:broader.
PEOPLE = u''.join((TOPICS_NAMESPACE, u'people#concept'))
ORGANIZATIONS = u''.join((TOPICS_NAMESPACE, u'organizations#concept'))
# Scheme that every emitted concept is attached to via skos:inScheme.
CONCEPT_SCHEME = u'http://topics.nytimes.com/top/reference/timestopics#conceptScheme'

# Build the output XML envelope up front. cofeed() returns a cursor coroutine;
# every element later passed to cursor.send() is inserted at the E_CURSOR
# position, i.e. inside the rdf:RDF element.
#
# NOTE(review): the NS() declarations are arguments of ROOT, i.e. siblings of
# the rdf:RDF element rather than its children -- confirm this is where
# structwriter expects namespace declarations.

w = structwriter(indent=True)
rdf_envelope = ROOT(
    E_CURSOR((RDF_NAMESPACE, u'rdf:RDF')),
    NS(u'skos', SKOS_NAMESPACE),
    NS(u't', TOPICS_NAMESPACE),
)
cursor = w.cofeed(rdf_envelope)

# Do the scrape: fetch the subject index page for every letter A-Z and emit
# one skos:Concept per topic link found on that page.

url_template = u'http://topics.nytimes.com/top/reference/timestopics/subjects/%s/index.html'

# Topic links end in "/index.html"; the concept URI is the link without that
# suffix. The dot is escaped so only a literal ".html" is stripped.
index_suffix = re.compile(r'/index\.html$')

# Every letter is processed. An unconditional `break` used to end this loop
# after the first page, so only the "A" topics were ever emitted.
for letter in uppercase:
    stem_uri = url_template % letter
    hdoc = html.parse(stem_uri)
    for a in hdoc.xml_select(u'//div[@id="columnistColumns"]//a'):
        uri = index_suffix.sub('', a.href)
        # People and organization topics additionally get a skos:broader link
        # to their parent concept; an empty tuple contributes no child.
        cursor.send(
          E((SKOS_NAMESPACE, u'skos:Concept'), {(RDF_NAMESPACE, u'rdf:about'): uri},
            E((SKOS_NAMESPACE, u'skos:prefLabel'), unicode(a)),
            E((SKOS_NAMESPACE, u'skos:inScheme'), {(RDF_NAMESPACE, u'rdf:resource'): CONCEPT_SCHEME}),
            E((SKOS_NAMESPACE, u'skos:broader'), {(RDF_NAMESPACE, u'rdf:resource'): PEOPLE}) if 'timestopics/people' in uri else (),
            E((SKOS_NAMESPACE, u'skos:broader'), {(RDF_NAMESPACE, u'rdf:resource'): ORGANIZATIONS}) if 'timestopics/organizations' in uri else (),
          )
        )

# Shut down the cursor coroutine now that all concepts have been sent.
cursor.close()