Commits

Anton Agafonov committed a624e62

Initial commit

Comments (0)

Files changed (11)

+support/*
+application: newssemanticsearch
+version: 1
+runtime: python
+api_version: 1
+
+handlers:
+- url: .*
+  script: main.py
+indexes:
+
+# AUTOGENERATED
+
+# This index.yaml is automatically updated whenever the dev_appserver
+# detects that a new type of query is run.  If you want to manage the
+# index.yaml file manually, remove the above marker line (the line
+# saying "# AUTOGENERATED").  If you want to manage some indexes
+# manually, move them above the marker line.  The index.yaml file is
+# automatically uploaded to the admin console when you next deploy
+# your application using appcfg.py.
+
+- kind: Result
+  properties:
+  - name: entity
+  - name: relevance
+    direction: desc
+
+- kind: Result
+  properties:
+  - name: related_entity
+  - name: relevance
+    direction: desc
+indexes:
+- kind: Entity
+  properties:
+  - name: __searchable_name_index
+  - name: name
+  - name: type
+
+- kind: News
+  properties:
+  - name: title
+  - name: date
+
+- kind: Result
+  properties:
+  - name: news
+  - name: related_entity
+  - name: created
+  - name: relevance
+    direction: desc
+#!/usr/bin/env python
+#
+# Copyright 2007 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import os, sys
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'support'))
+
+from google.appengine.ext import webapp
+from google.appengine.ext.webapp import util
+from google.appengine.ext.webapp import template
+from google.appengine.ext import db
+
+from models import Entity, News, Result, NewsCategory
+import settings
+import cgi
+import md5
+import urllib2
+import simplejson as json
+from StringIO import StringIO
+import logging
+
+from calais.calais import Calais
+
+class BaseHandler(webapp.RequestHandler):
+    def render_template(self, params, name='templates/index.html'):
+        path = os.path.join(os.path.dirname(__file__), name)
+        self.response.out.write(template.render(path, params))
+
+class MainHandler(BaseHandler):
+
+    def get(self):
+        if not self.request.get('q'):
+            self.render_template({'types': Entity.TYPES})
+        else:
+            query = Entity.all()
+            q = cgi.escape(self.request.get('q'))
+            query.search(q)
+            query.filter("type =", cgi.escape(self.request.get('type')))
+            entities = query.fetch(100)
+            result_data = []
+            for entity in entities:
+                query = Result.all()
+                query.filter('related_entity =', entity)
+                query.order('-relevance')
+                results = query.fetch(100)
+                entity_news = {}
+                entity_relevancy = 0
+                for result in results:
+                    key = result.news.url
+                    if key in entity_news:
+                        entity_news[key]['relevancy'] += result.relevance
+                        entity_news[key]['results'].append(result)
+                    else:
+                        entity_news[key] = {}
+                        entity_news[key]['news'] = result.news
+                        entity_news[key]['relevancy'] = result.relevance
+                        entity_news[key]['results'] = [result]
+                for k in entity_news.keys():
+                    entity_news[k]['results'] = sorted(entity_news[key]['results'],
+                                                         key=lambda el: -el.relevance
+                                                        )
+                    entity_relevancy = max(entity_news[key]['relevancy'], entity_relevancy)
+
+                result_data.append({'entity': entity,
+                                    'news': sorted(entity_news.values(),
+                                                   key=lambda el: -el['relevancy']
+                                                  ),
+                                    'relevancy': entity_relevancy,
+                                   })
+            self.render_template({'types': Entity.TYPES,
+                                  'entities':sorted(result_data,
+                                                    key=lambda el: -el['relevancy']
+                                                   )
+                                })
+
+class CrawlerHandler(BaseHandler):
+    """docstring for CrawlerHandler"""
+    
+    calais = None
+    
+    def get_url_list(self, limit=5):
+        response = urllib2.urlopen('%s&count=%d' % (settings.TWEETMEME_URL, limit))
+        logging.info('Fetching url: %s&count=%d' % (settings.TWEETMEME_URL, limit))
+        data = response.read()
+        result = json.loads(data)
+        if result['status'] != 'success':
+            raise Exception('Error fetching url list', 'Error fetching url: %s' % settings.TWEETMEME_URL)
+        else:
+            return result['stories']
+#        return ['http://news.bbc.co.uk/sport2/hi/motorsport/formula_one/8766250.stm',
+#                'http://news.bbc.co.uk/sport2/hi/football/world_cup_2010/8765523.stm'
+#               ]
+        
+    def parse(self, info):
+        self.calais = Calais(settings.CALAIS_API_KEY, submitter="news_search")
+        logging.info('Parsing url: %s' % info['url'])
+        result = self.calais.analyze_url(info['url'])
+        title = result.doc['info']['docTitle'] or info['title'] or info['url']
+        news = News.get_or_insert(key_name=info['url'], url=info['url'], body='', title=title, thumbnail='thumbnail' in info and info['thumbnail'] or None)
+        for topic in result.topics:
+            entity = NewsCategory.get_or_insert(key_name=topic['category'],
+                                                news = news.key(),
+                                                category=topic['categoryName'],
+                                                url=topic['category'])
+        for entity_dict in result.entities:
+            if entity_dict['_type'] in Entity.TYPES:
+                entity = Entity.get_or_insert(key_name=entity_dict['__reference'],
+                                              name=entity_dict['name'],
+                                              type=entity_dict['_type'],
+                                              url=entity_dict['__reference'])
+                detection, prefix, suffix = None, None, None
+                for instance in entity_dict['instances']:
+                    detection = instance['detection']
+                    prefix = instance['prefix']
+                    suffix = instance['suffix']
+                    result = Result.get_or_insert(key_name="%s%s" % (news.key(),
+                                                                    entity.key()
+                                                                    ),
+                                news=news.key(),
+                                related_entity=entity.key(),
+                                detection=detection, prefix=prefix, suffix=suffix,
+                                relevance=float(entity_dict['relevance'])
+                             )
+
+    def get(self):
+        for info in self.get_url_list():
+            try:
+                results = self.parse(info)
+                logging.info('Parsed')
+            except Exception, ex:
+                logging.error('Error parsing: %s' % ex)
+                
+        self.render_template({'entities': News.all()}, 'templates/crawler.html')
+        
+def main():
+    logging.getLogger().setLevel(logging.INFO)
+    application = webapp.WSGIApplication([('/', MainHandler),
+                                          ('/search/', MainHandler),
+                                          ('/crawler/', CrawlerHandler),
+                                         ],
+                                         debug=True)
+    util.run_wsgi_app(application)
+
+
+if __name__ == '__main__':
+    main()
+from google.appengine.ext import db
+from django.utils.html import strip_tags
+import re
+from google.appengine.ext import search 
+
+class Entity(search.SearchableModel):
+    """A named entity (person, company, ...) stored as a searchable model."""
+    # Entity kinds accepted by the `type` property below.
+    TYPES = ['Person', 'Company', 'Product', 'Country', 'City']
+    # One of TYPES; other values are rejected through `choices`.
+    type = db.StringProperty(choices=set(TYPES))
+    # Link associated with the entity.
+    url = db.LinkProperty()
+    # Entity name.
+    name = db.StringProperty()
+    # Free-form additional text.
+    extra_info = db.TextProperty()
+
+
+class News(db.Model):
+    """A news item identified by its URL."""
+    # Address of the news item.
+    url = db.LinkProperty()
+    # Body text of the item.
+    body = db.TextProperty()
+    # Headline.
+    title = db.StringProperty()
+    # Optional link to a thumbnail image.
+    thumbnail = db.LinkProperty()
+    # Category name.
+    category = db.StringProperty()
+    # Set automatically when the record is first created.
+    date = db.DateTimeProperty(auto_now_add=True)
+
+
+class Result(db.Model):
+    news = db.ReferenceProperty(News)
+    related_entity = db.ReferenceProperty(Entity)
+    prefix = db.StringProperty()
+    suffix = db.StringProperty()
+    detection = db.StringProperty()
+    relevance = db.FloatProperty()
+    created = db.DateTimeProperty(auto_now_add=True)
+    extra_info = db.TextProperty()
+    
+    @property
+    def formated_detection(self):
+        text = strip_tags(self.detection)
+        return text.replace('[', '<span class="prefix-suffix">').replace(']', '</span>')
+
+
+class NewsCategory(db.Model):
+    """A category attached to a News item."""
+    # Link identifying the category.
+    url = db.LinkProperty()
+    # News item this category record refers to.
+    news = db.ReferenceProperty(News)
+    # Category name.
+    category = db.StringProperty()
+    # Relevance score of the category.
+    relevance = db.FloatProperty()
+    # Free-form additional text.
+    extra_info = db.TextProperty()
+  
Binary file added.
+CALAIS_API_KEY = '7ath9t9nmfdwazf4x3ksfamv'
+TWEETMEME_URL = 'http://api.tweetmeme.com/stories/popular.json?media=news'
Binary file added.

templates/crawler.html

+<h1>Crawled Pages</h1>
+<ul>
+{% for entity in entities %}
+    <li>
+        {{ entity.url }} - 
+        {{ entity.type }}
+    </li>
+{% endfor %}
+</ul>

templates/index.html

+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
+
+    <title>index</title>
+    
+</head>
+
+<body>
+    <div id="container">
+        <div id="logo">
+        </div>
+        <form id="search-form" method="get">
+            <fieldset>
+                <input name="q" value="Type Your search here" />
+                <select name="type">
+                    {% for type in types %}
+                        <option value="{{ type }}">{{ type }}</option>
+                    {% endfor %}
+                </select>
+                <input type="submit" value="Search" />
+            </fieldset>
+        </form>
+        {% if entities %}
+            <h2>Results</h2>
+            <ul id="entities">
+                {% for item in entities %}
+                    <li>
+                        {{ item.entity.name }} ({{ item.relevancy }})
+                        <ul>
+                            {% for news in item.news %}
+                                <li>
+                                    <a href="{{ news.news.url }}">
+                                        {% if news.news.thumbnail %}
+                                            <img src="{{ news.news.thumbnail }}" />
+                                        {% endif %}
+                                        {{ news.news.title}}
+                                     </a> 
+                                     ({{ news.relevancy }})
+                                    <ul>
+                                        {% for result in news.results %}
+                                            <li>
+                                                {{ result.formated_detection }}
+                                                ({{ result.relevance }})
+                                            </li>
+                                        {% endfor %}
+                                    </ul>
+                                </li>
+                            {% endfor %}
+                        </ul>
+                    </li>
+                {% endfor %}
+            </ul>
+        {% endif %}
+    </div>
+</body>
+</html>