Commits

Anonymous committed cb6578f

inclusion of Xapian search engine and templates, refinements to transform code

Comments (0)

Files changed (12)

 
 
 syntax: regexp
-^production\.db$
+^production\.db$
+syntax: regexp
+^db$

openletters/cli.py

 ''' Command Line Interface for setting up Open Letters stores
-Derived from open shakespeare - needs to include Redis details in model
+Derived from open shakespeare
 '''
+
 import os
 import sys
 
             #fileobj = open('openletters/docs/letter.txt')
             import openletters.main
             openletters.main.load_dickens_letters(fileobj)
-            #fileobj.close()
             
             openletters.main.load_source(file_obj)
-            #file_obj.close()
+
         else:
             print 'Action not recognized'
 
+
+class Index(BaseCommand):
+    '''Index the letters for a Xapian powered search
+    
+    index dickens  - indexes the Dickens letters
+    '''
+    summary = __doc__.split('\n')[0]
+    usage = __doc__
+    max_args = None
+    min_args = 1
+    
+    def command(self):
+        self._load_config()
+        cmd = self.args[0]
+        
+        if cmd == 'dickens':
+            type = 'dickens'
+            fileobj = 'openletters/docs/dickens_letters.xml'
+            import openletters.main
+            
+            openletters.main.index_letters(self, type, fileobj)
+        else:
+            print 'Action not recognized'

openletters/controllers/data.py

         rdf = rdf_transform()
         
         return rdf.create_rdf_end()
+
+    def book (self):
+        #response.headers['Content-Type'] = 'application/json'
+        json = json_transform()
+        query_string = model.Session.query(model.Letter).filter(model.Letter.type == author).all()
+        return json.book_json(query_string)
+    
+    def correspondent(self):
+        req = request.POST('search')
+        corres =  model.Session.query(model.Letter.correspondent).distinct().all()
+        
+        for c in corres:
+            if req in c:
+                b = '<li>%s</li>' % c
+        
+        return b    

openletters/controllers/letters.py

-import logging
-
-import genshi
-
-import urllib
+import logging, genshi, urllib
 
 from pylons import request, response, session, tmpl_context as c
 from pylons.controllers.util import abort, redirect_to
 
 from openletters.model import dbase
 
-from sqlalchemy import join, and_
 
 log = logging.getLogger(__name__)
 
 class LettersController(BaseController):
-    #def index(self):
-    #    c.site_title = "Open Correspondence"
-    #    
-    #    author = request.params.get('author', '')
-    #    if author:
-    #        pass
-    #    
-    #    else:
-    #        c.page_title = "Index of letters"
-    #        c.letters = model.Session.query(model.Letter).all()
 
-        #return render('letters/index.html')
-        #return render('index.html')
     ''' 
       Search for letters 
     '''
         letter_items = letter.items()
         letter_items.sort()
         
-        return letter_items
+        return letter_items
+

openletters/docs/dickens_letters.xml

 [111] An allusion to an unfounded rumour.
 
 [112] Charles Dickens's son, Alfred Tennyson.</letter><correspondent>Mr Alfred Tennyson Dickens</correspondent><salutation>ALFRED</salutation><lettertext>921</lettertext><date>1870-00-20</date>
-</div></openletters>
+</div></openletters>

openletters/main.py

+# -*- coding: latin-1 -*-
+import unicodedata
 '''
 Class to parse the Dickens letters and enter into a store
 '''
-from parseText import parse_text
-from parseText import parse_date
+from parse import parse_text, parse_date
+
 
 from xml.dom import minidom
 
 from openletters import model
 
+import xapian, urllib, os
+
 def getText(nodelist):
     # Join the data of every TEXT_NODE in nodelist into a single string;
     # nodes of any other type are ignored.
     rc = []
     for node in nodelist:
         if node.nodeType == node.TEXT_NODE:
-            rc.append(node.data)
+            # NFKC: compatibility decomposition followed by canonical composition.
+            rc.append(unicodedata.normalize('NFKC', node.data))
     return ''.join(rc)
 
 def handle_elements (elementname, element):
 
     
 def handle_parts (nodename, node):
     # Return the text of node's direct child text nodes (via getText);
     # nodename is not used by this body.
-   # print "<%s>%s</%s>" % (nodename, getText(node.childNodes), nodename)
     return getText(node.childNodes)
     
 
             print('Source %s: \n\t ...' % (title))
             model.Session.remove()
         else:
-            print('Source : SKIPPING')
+            print('Source : SKIPPING')
+
+
+def index_letters(self, type, fileobj):
+
+
+    db_path = 'db'
+    
+    database = xapian.WritableDatabase(db_path, xapian.DB_CREATE_OR_OPEN)
+    indexer = xapian.TermGenerator()
+    indexer.set_stemmer(xapian.Stem('english'))
+    
+    xapian_file_name = 0
+    count = 0
+    text = minidom.parse(fileobj)
+    #split the body into individual letters
+    letters  = text.getElementsByTagName('div')
+    #open the XML, parse the letter id
+    for letter in letters:
+        count +=1
+        text=unicode(handle_elements("letter", letter))
+        corr=unicode(handle_elements("correspondent", letter))
+            
+        document = xapian.Document()
+        document.set_data(text)
+        #not sure this is going to work - rather than using the filename, use letter ids
+        letter_index = type + "/" + urllib.quote(corr) + "/" + str(count)
+
+        print "indexing %s" ,letter_index
+        document.add_value(xapian_file_name, letter_index)
+        
+        indexer.set_document(document)
+        indexer.index_text(text)
+        database.add_document(document)
+        
+    database.flush()

openletters/templates/letters/search.html

 <html xmlns:py="http://genshi.edgewall.org/"
   xmlns:xi="http://www.w3.org/2001/XInclude"
   py:strip = "True">
-
+    
   <py:def function="page_title">Letters - Search</py:def>
 
   <div py:def="content">
                <option value="${corr.correspondent}">${corr.correspondent}</option>
             </div>
        </select>
-       
+
     <input type="submit" name="Search Letters" />
     </form>
   </div>

openletters/tests/functional/test_timeline.py

-from openletters.tests import *
-
-class TestTimelineController(TestController):
-
-    def test_index(self):
-        response = self.app.get(url(controller='timeline', action='index'))
-        # Test response...

openletters/transform/ReadMe.txt

-This folder will contain tools to transform the letters into XML and RDF.
-
-17 April 2010 - IE to convert current PHP scripts into Python

openletters/transform/transform_json.py

             
         return self.jsonify(dict)
     
-        '''
+    '''
     Function to return the text as json
     '''
     def corr_json (self, author, letterobj):
             
         return self.jsonify(dict)
 
+    '''
+    Function to return the book graph
+    '''
+    def book_json (self, book_query):
+        
+        dict = '{'
+
+        for b in book_query:
+            dict +=  str(b.author) + ': [ '
+            dict += '"correspondent": "' + author
+            dict += '", "nick: "' + l
+        
+            dict += '"]'
+        
+        dict += '}'
+        
+        return self.jsonify(dict)
+    
     def jsonify (self, output):
         '''Serialise output as JSON text with sorted keys and a 4-space indent.'''
         encoded = json.dumps(output, indent=4, sort_keys=True)
         return encoded

pip-requirements.txt

 ## usage: pip -E ../pyenv-openletters install -r pip-requirements.txt
 # 1.12 is weird ...
-Routes<=1.11.99
-Pylons<=0.9.7.99
-Genshi>=0.5,<=0.5.99
-SQLAlchemy>=0.5,<=0.5.99
+Routes
+Pylons
+Genshi
+SQLAlchemy >=0.5
 -e .
 # -e hg+https://knowledgeforge.net/letters/hg#egg=openletters
     db = openletters.cli:ManageDb
     load = openletters.cli:Load
     fixtures = openletters.cli:Fixtures
+    index = openletters.cli:Index
     """,
 )