Whoosh integrated with VisTrails

Anonymous created an issue

VisTrails is a general-purpose scientific workflow framework. When I make a Whoosh plug-in, there appear to be problems when I do wildcard searches on the Whoosh indexer. A stand-alone program works (outside of VisTrails). The problem appears to be codec related. Here are the details... I am using a Windows 7 box with Python 2.7.3 and Whoosh 0.3.18. VisTrails complains about a codec when you try to use wildcards. Below is the crash dump.

=======================================================

Traceback (most recent call last):
File "C:\Program Files (x86)\VisTrails\vistrails\core\modules\vistrails_module.py", line 305, in update
self.compute()
File "C:\Users\markp\.vistrails\userpackages\..\userpackages\FindFile.py", line 2594, in compute
results = s.search(q)
File "c:\Python27\Lib\site-packages\whoosh\searching.py", line 280, in search
for docnum, score in query.doc_scores(self)),
File "c:\Python27\Lib\site-packages\whoosh\query.py", line 202, in doc_scores
return iter(self.scorer(searcher, exclude_docs=exclude_docs))
File "c:\Python27\Lib\site-packages\whoosh\query.py", line 386, in scorer
for word in self._words(searcher.reader()):
File "c:\Python27\Lib\site-packages\whoosh\query.py", line 762, in _words
for text in candidates:
File "c:\Python27\Lib\site-packages\whoosh\reading.py", line 207, in expand_prefix
for fn, t, _, _ in self.iter_from(fieldid, prefix):
File "c:\Python27\Lib\site-packages\whoosh\reading.py", line 393, in _merge_iters
fnum, text, docfreq, termcount = it.next()
File "c:\Python27\Lib\site-packages\whoosh\filedb\filereading.py", line 169, in iter_from
for (fn, t), (totalfreq, _, postcount) in tt.items_from((fieldnum, text)):
File "c:\Python27\Lib\site-packages\whoosh\filedb\filetables.py", line 416, in items_from
yield (kd(key), vd(value))
File "c:\Python27\Lib\site-packages\whoosh\filedb\filetables.py", line 73, in decode_termkey
return unpackushort(key[:_USHORT_SIZE]), utf8decode(key[_USHORT_SIZE:])[0]
File "C:\Program Files (x86)\VisTrails\vistrails\Python27\lib\encodings\utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
AttributeError: 'NoneType' object has no attribute 'utf_8_decode'

================= stand-alone code ======================

import sys
import os
import codecs
import whoosh
import whoosh.index as index
from whoosh.index import *
from whoosh.query import *
from whoosh.fields import *
from whoosh.qparser import QueryParser

def main():
    print "...Start"
    schema = Schema(path=TEXT(stored=True),file=TEXT(stored=True),ext=TEXT(store
d=True),content=TEXT(stored=True),md5=TEXT(stored=True))
    print "S1 ",schema

    ix = index.open_dir('c:\\enron_mail_index')
    print "S2 ", ix

    qp = QueryParser("file", schema=ix.schema)
    print "S3 ", qp

    uq = u'email_*1294'
    q = qp.parse(uq)
    print "S4 ", q

    s = ix.searcher()
    print "S5 ", s

    results = s.search(q)
    print "S5 ", results

    count = 0
    for result in results:
        if count < 30:
            for key,item in result.items():
                if key == 'path':
                    print item
                    count = count + 1
    print "...End"

# Run the reproduction only when this file is executed as a script.
if __name__ == "__main__":
    main()

===================== plug-in code =============================

#!/bin/env python
import sys
import os
import whoosh
import whoosh.index as index
from whoosh.index import *
from whoosh.query import *
from whoosh.fields import *
from whoosh.qparser import QueryParser
import core.modules.module_registry
from core.modules.vistrails_module import Module

# Module-level package metadata.
# NOTE(review): presumably read by the VisTrails package loader -- confirm.
identifier = "Stuff.ToolKit"
name = "Stuff"
version = "0.0.9"

class FindFile(Module):

    #------------------------------------------------------
    # compute
    # Entry point for the FindFile VisTrails module
    def compute(self):
        token = unicode(self.getInputFromPort("token"))
        token = token.lower()
        instr = self.getInputFromPort("featspec")

        schema = Schema(path=TEXT(stored=True),file=TEXT(stored=True),ext=TEXT(stored=True),content=TEXT(stored=True),md5=TEXT(stored=True))

        ix = index.open_dir('c:\\enron_mail_index')
        qp = QueryParser("file", schema=ix.schema)
        uq = u'email_*1294'
        q = qp.parse(uq)
         s = ix.searcher()
         results = s.search(q)
        print "S5 ", results

        count = 0
        for result in results:
            if count < 30:
                for key,item in result.items():
                    if key == 'path':
                        print item
                        count = count + 1

        self.setResult("results", 'This is output')


def initialize(*args, **keywords):
    """Register the FindFile module and its ports with the module registry.

    Adds FindFile under the 'Stuff' namespace with two String input ports
    ("featspec" and "token") and one String output port ("results").
    Positional and keyword arguments are accepted but not used.
    """
    reg = core.modules.module_registry.registry

    reg.add_module(FindFile, namespace='Stuff')
    reg.add_input_port(FindFile, "featspec",
                       (core.modules.basic_modules.String, 'Index Location'))
    reg.add_input_port(FindFile, "token",
                       (core.modules.basic_modules.String, 'Search token'))
    reg.add_output_port(FindFile, "results",
                        (core.modules.basic_modules.String, 'Filespec Results'))

================================================================

Thanks. BTW, Whoosh is great... keep up the good work! --mark

Comments (3)

  1. Matt Chaput

    Hi, sorry for taking so long to get to this. I'm afraid I'm no longer able to fix bugs in such an old version. Please try with the latest version on PyPI or (better yet) Bitbucket.

  2. Log in to comment
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.