News Semantic Search / main.py
#!/usr/bin/env python
#
# Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'support'))

from google.appengine.ext import webapp
from google.appengine.ext.webapp import util
from google.appengine.ext.webapp import template

from models import Entity, News, Result, NewsCategory
import settings
import cgi
import urllib2
import simplejson as json
import logging

from calais.calais import Calais
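
# models.py and settings.py are not shown here. For the code below to work,
# Entity must subclass search.SearchableModel (which is what provides the
# Query.search() call used in MainHandler), and settings must define
# TWEETMEME_URL (already carrying a query string, hence the '&page=' suffix
# below) and CALAIS_API_KEY. A rough, hypothetical sketch of the Entity model:
#
#   from google.appengine.ext import db
#   from google.appengine.ext import search
#
#   class Entity(search.SearchableModel):
#       TYPES = ('Person', 'Company', 'Organization')  # assumed values
#       name = db.StringProperty()
#       type = db.StringProperty()
#       url = db.StringProperty()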

class BaseHandler(webapp.RequestHandler):
    """Request handler base class with a template-rendering helper."""

    def render_template(self, params, name='templates/index.html'):
        path = os.path.join(os.path.dirname(__file__), name)
        self.response.out.write(template.render(path, params))

class MainHandler(BaseHandler):
    """Renders the search form and, for a query, entity-grouped results."""

    def get(self):
        if not self.request.get('q'):
            self.render_template({'types': Entity.TYPES})
        else:
            query = Entity.all()
            q = cgi.escape(self.request.get('q'))
            news_type = cgi.escape(self.request.get('type'))
            query.search(q)
            if news_type:  # only narrow by type when one was selected
                query.filter("type =", news_type)
            entities = query.fetch(100)
            result_data = []
            for entity in entities:
                result_query = Result.all()
                result_query.filter('related_entity =', entity)
                result_query.order('-relevance')
                results = result_query.fetch(100)
                # Group the entity's results by news URL, accumulating a
                # relevancy score per news item.
                entity_news = {}
                entity_relevancy = 0
                for result in results:
                    key = result.news.url
                    if key in entity_news:
                        entity_news[key]['relevancy'] += result.relevance
                        entity_news[key]['results'].append(result)
                    else:
                        entity_news[key] = {'news': result.news,
                                            'relevancy': result.relevance,
                                            'results': [result]}
                # Sort each news item's results by descending relevance and
                # keep the highest per-news relevancy as the entity's score.
                for item in entity_news.values():
                    item['results'].sort(key=lambda el: -el.relevance)
                    entity_relevancy = max(item['relevancy'], entity_relevancy)

                result_data.append({'entity': entity,
                                    'news': sorted(entity_news.values(),
                                                   key=lambda el: -el['relevancy']),
                                    'relevancy': entity_relevancy,
                                   })
            self.render_template({'types': Entity.TYPES,
                                  'entities': sorted(result_data,
                                                     key=lambda el: -el['relevancy']),
                                  'q': q,
                                  'news_type': news_type,
                                 })
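
# Example request (hypothetical values): GET /?q=google&type=Company matches
# indexed entities, fetches up to 100 Result rows per match, and renders
# templates/index.html with news grouped per entity, both levels ordered by
# descending relevancy.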

class CrawlerHandler(BaseHandler):
    """Fetches story URLs from the TweetMeme API and annotates each story
    with entities and categories from the OpenCalais service."""

    calais = None
    
    def get_url_list(self, page=1):
        url = '%s&page=%d' % (settings.TWEETMEME_URL, int(page))
        logging.info('Fetching url: %s' % url)
        response = urllib2.urlopen(url)
        result = json.loads(response.read())
        if result['status'] != 'success':
            raise Exception('Error fetching url list',
                            'Error fetching url: %s' % url)
        return result['stories']
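
    # The TweetMeme API response is assumed to look roughly like this
    # (field names inferred from how info is used in parse() below):
    #   {"status": "success",
    #    "stories": [{"url": "...", "title": "...", "thumbnail": "..."}]}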
        
    def parse(self, info):
        # Reuse one Calais client per handler instead of rebuilding it for
        # every story.
        if self.calais is None:
            self.calais = Calais(settings.CALAIS_API_KEY, submitter="news_search")
        if News.all().filter('url =', info['url']).fetch(1):
            logging.info('Already parsed url: %s' % info['url'])
            return
        logging.info('Parsing url: %s' % info['url'])
        result = self.calais.analyze_url(info['url'])
        title = result.doc['info']['docTitle'] or info['title'] or info['url']
        news = News.get_or_insert(key_name=info['url'], url=info['url'],
                                  body='', title=title,
                                  thumbnail=info.get('thumbnail') or None)
        for topic in result.topics:
            # Key on news + category so a category can attach to more than
            # one story; a bare category key would only ever link to the
            # first story that mentioned it.
            NewsCategory.get_or_insert(key_name='%s|%s' % (news.key(),
                                                           topic['category']),
                                       news=news.key(),
                                       category=topic['categoryName'],
                                       url=topic['category'])
        for entity_dict in result.entities:
            if entity_dict['_type'] not in Entity.TYPES:
                continue
            entity = Entity.get_or_insert(key_name=entity_dict['__reference'],
                                          name=entity_dict['name'],
                                          type=entity_dict['_type'],
                                          url=entity_dict['__reference'])
            # One Result per detected instance. The index keeps the key
            # unique; without it get_or_insert would silently keep only the
            # first instance for a given (news, entity) pair.
            for i, instance in enumerate(entity_dict['instances']):
                Result.get_or_insert(key_name='%s%s%d' % (news.key(),
                                                          entity.key(), i),
                                     news=news.key(),
                                     related_entity=entity.key(),
                                     detection=instance['detection'],
                                     prefix=instance['prefix'],
                                     suffix=instance['suffix'],
                                     relevance=float(entity_dict['relevance']))
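
    # Each entry in result.entities is assumed to carry at least the fields
    # used above, e.g. (values hypothetical):
    #   {'_type': 'Person', 'name': '...', '__reference': '<calais URI>',
    #    'relevance': '0.7',
    #    'instances': [{'detection': '...', 'prefix': '...', 'suffix': '...'}]}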

    def get(self, page):
        for info in self.get_url_list(page):
            try:
                self.parse(info)
                logging.info('Parsed')
            except Exception, ex:
                logging.error('Error parsing: %s' % ex)

        self.render_template({'entities': News.all()}, 'templates/crawler.html')
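
    # The crawler is meant to be hit periodically; a cron.yaml entry along
    # these lines would drive it (path and schedule are assumptions):
    #   cron:
    #   - description: crawl tweetmeme stories
    #     url: /crawler/1/
    #     schedule: every 30 minutes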
        
def main():
    logging.getLogger().setLevel(logging.INFO)
    application = webapp.WSGIApplication([('/', MainHandler),
                                          ('/search/', MainHandler),
                                          (r'/crawler/(?P<page>\d+)/', CrawlerHandler),
                                         ],
                                         debug=True)
    util.run_wsgi_app(application)


if __name__ == '__main__':
    main()