Commits

Anton Agafonov committed c4b362f

added cron, made redesign

  • Participants
  • Parent commits a624e62

Comments (0)

Files changed (11)

-support/*
+support/*
+.DS_Store
+.pyc
 api_version: 1
 
 handlers:
+- url: /css
+  static_dir: static/css
+- url: /img
+  static_dir: static/img
+- url: /crawler/(?P<page>\d+)/
+  script: main.py
 - url: .*
   script: main.py
+cron:
+- description: fetch popular news on page 1
+  url: /crawler/1/
+  schedule: every 20 minutes
+- description: fetch popular news on page 2
+  url: /crawler/2/
+  schedule: every 20 minutes
+- description: fetch popular news on page 3
+  url: /crawler/3/
+  schedule: every 20 minutes
+- description: fetch popular news on page 4
+  url: /crawler/4/
+  schedule: every 20 minutes
+- description: fetch popular news on page 5
+  url: /crawler/5/
+  schedule: every 20 minutes
+- description: fetch popular news on page 6
+  url: /crawler/6/
+  schedule: every 20 minutes
+- description: fetch popular news on page 7
+  url: /crawler/7/
+  schedule: every 20 minutes
+- description: fetch popular news on page 8
+  url: /crawler/8/
+  schedule: every 20 minutes
+- description: fetch popular news on page 9
+  url: /crawler/9/
+  schedule: every 20 minutes
+- description: fetch popular news on page 10
+  url: /crawler/10/
+  schedule: every 20 minutes
 indexes:
+- kind: Entity
+  properties:
+  - name: __searchable_name_index
+  - name: name
+  - name: type
 
-# AUTOGENERATED
-
-# This index.yaml is automatically updated whenever the dev_appserver
-# detects that a new type of query is run.  If you want to manage the
-# index.yaml file manually, remove the above marker line (the line
-# saying "# AUTOGENERATED").  If you want to manage some indexes
-# manually, move them above the marker line.  The index.yaml file is
-# automatically uploaded to the admin console when you next deploy
-# your application using appcfg.py.
+- kind: News
+  properties:
+  - name: title
+  - name: date
 
 - kind: Result
   properties:
-  - name: entity
+  - name: news
+  - name: related_entity
+  - name: created
   - name: relevance
-    direction: desc
-
-- kind: Result
-  properties:
-  - name: related_entity
-  - name: relevance
-    direction: desc
+    direction: desc

indexes.yaml

-indexes:
-- kind: Entity
-  properties:
-  - name: __searchable_name_index
-  - name: name
-  - name: type
-
-- kind: News
-  properties:
-  - name: title
-  - name: date
-
-- kind: Result
-  properties:
-  - name: news
-  - name: related_entity
-  - name: created
-  - name: relevance
-    direction: desc
         else:
             query = Entity.all()
             q = cgi.escape(self.request.get('q'))
+            news_type = cgi.escape(self.request.get('type'))
             query.search(q)
-            query.filter("type =", cgi.escape(self.request.get('type')))
+            query.filter("type =", news_type)
             entities = query.fetch(100)
             result_data = []
             for entity in entities:
                         entity_news[key]['relevancy'] = result.relevance
                         entity_news[key]['results'] = [result]
                 for k in entity_news.keys():
-                    entity_news[k]['results'] = sorted(entity_news[key]['results'],
+                    entity_news[k]['results'] = sorted(entity_news[k]['results'],
                                                          key=lambda el: -el.relevance
                                                         )
-                    entity_relevancy = max(entity_news[key]['relevancy'], entity_relevancy)
+                    entity_relevancy = max(entity_news[k]['relevancy'], entity_relevancy)
 
                 result_data.append({'entity': entity,
                                     'news': sorted(entity_news.values(),
             self.render_template({'types': Entity.TYPES,
                                   'entities':sorted(result_data,
                                                     key=lambda el: -el['relevancy']
-                                                   )
+                                                   ),
+                                  'q':q,
+                                  'news_type':news_type,
                                 })
 
 class CrawlerHandler(BaseHandler):
     
     calais = None
     
-    def get_url_list(self, limit=5):
-        response = urllib2.urlopen('%s&count=%d' % (settings.TWEETMEME_URL, limit))
-        logging.info('Fetching url: %s&count=%d' % (settings.TWEETMEME_URL, limit))
+    def get_url_list(self, page=1):
+        response = urllib2.urlopen('%s&page=%d' % (settings.TWEETMEME_URL, int(page)))
+        logging.info('Fetching url: %s&page=%d' % (settings.TWEETMEME_URL, int(page)))
         data = response.read()
         result = json.loads(data)
         if result['status'] != 'success':
             raise Exception('Error fetching url list', 'Error fetching url: %s' % settings.TWEETMEME_URL)
         else:
             return result['stories']
-#        return ['http://news.bbc.co.uk/sport2/hi/motorsport/formula_one/8766250.stm',
-#                'http://news.bbc.co.uk/sport2/hi/football/world_cup_2010/8765523.stm'
-#               ]
         
     def parse(self, info):
         self.calais = Calais(settings.CALAIS_API_KEY, submitter="news_search")
+        if News.all().filter('url =', info['url']).fetch(1):
+            logging.info('Already parsed url: %s' % info['url'])
+            return
         logging.info('Parsing url: %s' % info['url'])
         result = self.calais.analyze_url(info['url'])
         title = result.doc['info']['docTitle'] or info['title'] or info['url']
                                 relevance=float(entity_dict['relevance'])
                              )
 
-    def get(self):
-        for info in self.get_url_list():
+    def get(self, page):
+        for info in self.get_url_list(page):
             try:
                 results = self.parse(info)
                 logging.info('Parsed')
     logging.getLogger().setLevel(logging.INFO)
     application = webapp.WSGIApplication([('/', MainHandler),
                                           ('/search/', MainHandler),
-                                          ('/crawler/', CrawlerHandler),
+                                          (r'/crawler/(?P<page>\d+)/', CrawlerHandler),
                                          ],
                                          debug=True)
     util.run_wsgi_app(application)
 CALAIS_API_KEY = '7ath9t9nmfdwazf4x3ksfamv'
-TWEETMEME_URL = 'http://api.tweetmeme.com/stories/popular.json?media=news'
+TWEETMEME_URL = 'http://api.tweetmeme.com/stories/popular.json?media=news&count=5'

settings.pyc

Binary file removed.

static/css/screen.css

+body { font-family: Helvetica, Arial, Times; font-size: 14px; padding:0; margin:50px 0 50px 0; background: #f8f8f8; }
+a { color:#1F98C7; text-decoration: none; }
+a:hover { text-decoration: underline; }
+fieldset { border:0; }
+h2 { }
+
+#container { width: 800px; margin: 0 auto; border: 1px solid #bbb; background: #fff; padding: 0 10px; border-radius:5px; }
+#footer { width: 800px; margin: 0 auto; padding:20px; text-align: right; }
+
+#header { width: 800px; height: 60px; margin-top:5px;}
+#header #tagline { padding: 5px 0; }
+#header #tagline h1 { font-weight: normal; letter-spacing:-1px; color: #555; font-size:32px; }
+#logo { margin: 20px 10px 15px 10px; width: 80px; height: 50px; background: url(/img/logo.png) no-repeat; float: left; }
+#q { width: 390px; }
+input, select, button { border:1px solid #1F98C7; font-size:18px; padding: 5px; height:30px; }
+button, select { height: 42px; }
+#type { width: 215px; margin-left:10px; }
+button { width: 130px; margin-left:10px; border-radius: 5px; color: #fff; background: #1F98C7;}
+
+#search-form { padding: 30px 0 30px 0; }
+#content { border-top: 1px solid #bbb; padding-top: 10px; }
+#entities { list-style: none; padding:0 10px 0 5px;}
+.entity-item { font-size:18px; list-style:none; }
+.entity-item h3 { padding-left:10px; font-weight: normal; }
+.news-body { margin-left:100px; }
+.news-item:nth-child(2n+1) { background: #fafafa; border-top: 1px solid #eee; border-bottom: 1px solid #eee; color:#000; }
+.news { list-style: none; padding:0; margin: 20px 0 0 0;  }
+.news li.news-item img { float: left; margin:0 10px 10px 10px; }
+.news li.news-item { padding:10px 0px 0 0; margin: 0 0 0 3px; min-height: 90px; font-size:16px; }
+
+.results { padding: 10px 0 0 10px; font-size: 14px; }
+.result-item { margin-left:10px;  padding:0 10px;}
+
+.prefix-suffix { color:#888; font-style: italic; }
+.rel { color: #995555;}

static/img/logo.png

Added
New image

templates/index.html

 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
     <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
-
-    <title>index</title>
+    <link rel="stylesheet" href="/css/screen.css" type="text/css" media="screen" charset="utf-8">
+    <script type="text/javascript" charset="utf-8" src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js"></script>
+    <title>News Semantic Search</title>
     
 </head>
 
 <body>
     <div id="container">
-        <div id="logo">
+        <div id="header">
+            <div id="logo">
+            </div>
+            <div id="tagline">
+                <h1>Top News Search</h1>
+            </div>
         </div>
         <form id="search-form" method="get">
             <fieldset>
-                <input name="q" value="Type Your search here" />
-                <select name="type">
+                <input name="q" value="{{ q }}" id="q" />
+                <select name="type" id="type">
                     {% for type in types %}
-                        <option value="{{ type }}">{{ type }}</option>
+                        <option value="{{ type }}" {% ifequal news_type type %}selected{% endifequal %}>{{ type }}</option>
                     {% endfor %}
                 </select>
-                <input type="submit" value="Search" />
+                <button type="submit">Search</button>
             </fieldset>
         </form>
         {% if entities %}
-            <h2>Results</h2>
+        <div id="content">
             <ul id="entities">
                 {% for item in entities %}
-                    <li>
-                        {{ item.entity.name }} ({{ item.relevancy }})
-                        <ul>
+                    <li class="entity-item">
+                        <h3>{{ item.entity.name }} <span class='rel'>({{ item.relevancy }})</span></h3>
+                        <ul class="news">
                             {% for news in item.news %}
-                                <li>
-                                    <a href="{{ news.news.url }}">
-                                        {% if news.news.thumbnail %}
-                                            <img src="{{ news.news.thumbnail }}" />
-                                        {% endif %}
-                                        {{ news.news.title}}
-                                     </a> 
-                                     ({{ news.relevancy }})
-                                    <ul>
-                                        {% for result in news.results %}
-                                            <li>
-                                                {{ result.formated_detection }}
-                                                ({{ result.relevance }})
-                                            </li>
-                                        {% endfor %}
-                                    </ul>
+                                <li class="news-item">
+                                    {% if news.news.thumbnail %}
+                                        <img src="{{ news.news.thumbnail }}" />
+                                    {% endif %}
+                                    <div class="news-body">
+                                        <a href="{{ news.news.url }}">
+                                            {{ news.news.title}}
+                                         </a> 
+                                         <span class='rel'>({{ news.relevancy }})</span>
+                                        <ul class="results">
+                                            {% for result in news.results %}
+                                                <li class="result-item">
+                                                    {{ result.formated_detection }}
+                                                    <span class='rel'>({{ result.relevance }})</span>
+                                                </li>
+                                            {% endfor %}
+                                        </ul>
+                                    </div>
                                 </li>
                             {% endfor %}
                         </ul>
                     </li>
                 {% endfor %}
             </ul>
+        </div>
         {% endif %}
     </div>
+    <div id="footer">
+        Developed by <a href="mailto:equeny@gmail.com">equeny</a>
+    </div>
+    <script type="text/javascript" charset="utf-8">
+        $('.entity-item h3').append(' <a class="show-hide" href="#">-</a>');
+        $('.show-hide').live('click', function(){
+            $(this).parent().parent().children('.news').toggle('slow');
+            $(this).text($(this).text() == '-' ? '+' : '-');
+            return false;
+        });
+    </script>
 </body>
 </html>