Commits

coady  committed 8c08b45

PyLucene 4.0 supported.

  • Participants
  • Parent commits caf76b3

Comments (0)

Files changed (11)

 Lupyne should run anywhere PyLucene does, though its primary testing is on the popular unix variants.
 
  * Python 2.6.6+, 2.7
- * PyLucene 3.2, 3.3, 3.4, 3.5, 3.6
+ * PyLucene 3.2, 3.3, 3.4, 3.5, 3.6, 4.0
  * CherryPy 3.1.2+, 3.2 (only required for server)
 
 Usage

File examples/indexers.py

 """
-Basic indexing and searching example adapted from http://lucene.apache.org/java/3_5_0/api/core/index.html
+Basic indexing and searching example adapted from http://lucene.apache.org/java/3_6_2/api/core/index.html
 """
 
 import lucene
 lucene.initVM()
 try:
-    from org.apache.lucene import document, index, queryParser, search, store, util
+    from org.apache.lucene import document, index, search, store, util
+    try:
+        from org.apache.lucene.queryparser import classic as queryParser
+    except ImportError:
+        from org.apache.lucene import queryParser
     from org.apache.lucene.analysis import standard
 except ImportError:
     document = index = queryParser = search = store = util = standard = lucene
 directory = store.RAMDirectory()
 # To store an index on disk, use this instead:
 #Directory directory = FSDirectory.open(File("/tmp/testindex"))
-iwriter = index.IndexWriter(directory, analyzer, True, index.IndexWriter.MaxFieldLength(25000))
+config = index.IndexWriterConfig(util.Version.LUCENE_CURRENT, analyzer)
+iwriter = index.IndexWriter(directory, config)
 doc = document.Document()
 text = "This is the text to be indexed."
 doc.add(document.Field("fieldname", text, document.Field.Store.YES, document.Field.Index.ANALYZED))
 for hit in hits:
     hitDoc = isearcher.doc(hit.doc)
     assert hitDoc['fieldname'] == text
-isearcher.close()
 ireader.close()
 directory.close()
 

File examples/sorting.py

 import lucene
 lucene.initVM()
 try:
-    from org.apache.lucene import search
+    from org.apache.lucene import search, util
     from org.apache.pylucene.search import PythonFieldComparator, PythonFieldComparatorSource
 except ImportError:
-    search = lucene
+    search = util = lucene
     from lucene import PythonFieldComparator, PythonFieldComparatorSource
 from lupyne import engine
 
 
 ### lucene ###
 
-searcher = search.IndexSearcher(indexer.directory)
-topdocs = searcher.search(search.MatchAllDocsQuery(), None, 10, search.Sort(search.SortField('color', search.SortField.STRING)))
+searcher = search.IndexSearcher(indexer.indexReader)
+sorttype = getattr(search.SortField, 'Type', search.SortField).STRING
+topdocs = searcher.search(search.MatchAllDocsQuery(), None, 10, search.Sort(search.SortField('color', sorttype)))
 assert [searcher.doc(scoredoc.doc)['color'] for scoredoc in topdocs.scoreDocs] == sorted(colors)
 
 class ComparatorSource(PythonFieldComparatorSource):
             PythonFieldComparator.__init__(self)
             self.name = name
             self.values = [None] * numHits
-        def setNextReader(self, reader, base):
-            self.comparator = search.FieldCache.DEFAULT.getStrings(reader, self.name)
+        def setNextReader(self, reader, *args):
+            if not args:
+                reader = reader.reader()
+            if hasattr(search.FieldCache, 'getStrings'):
+                self.comparator = search.FieldCache.DEFAULT.getStrings(reader, self.name)
+            else:
+                br = util.BytesRef()
+                comparator = search.FieldCache.DEFAULT.getTerms(reader, self.name)
+                self.comparator = [comparator.getTerm(index, br).utf8ToString() for index in range(comparator.size())]
+            return self
         def compare(self, slot1, slot2):
             return cmp(self.values[slot1], self.values[slot2])
         def setBottom(self, slot):

File lupyne/engine/documents.py

 from future_builtins import map
 import datetime, calendar
 import operator
-import collections
 import warnings
 import lucene
 try:
     from java.lang import Double, Float, Long, Number, Object
+    from java.util import Arrays, HashSet
     from org.apache.lucene import document, search, util
     from org.apache.lucene.search import grouping
 except ImportError:
-    from lucene import Double, Float, Long, Number, Object
+    from lucene import Double, Float, Long, Number, Object, Arrays, HashSet
     document = search = util = grouping = lucene
 from .queries import Query
 
         Field.__init__(self, name, store)
         self.step = step or util.NumericUtils.PRECISION_STEP_DEFAULT
         self.index = index
+        if hasattr(document, 'FieldType'):
+            self.type = document.FieldType()
+            self.type.setNumericPrecisionStep(self.step)
+            self.type.setStored(self.store == document.Field.Store.YES)
+            self.type.setIndexed(index)
     def items(self, *values):
         "Generate lucene NumericFields suitable for adding to a document."
         for value in values:
-            field = document.NumericField(self.name, self.step, self.store, self.index)
-            if isinstance(value, float):
-                field.doubleValue = value
+            if hasattr(self, 'type'):
+                if isinstance(value, float):
+                    self.type.setNumericType(document.FieldType.NumericType.DOUBLE)
+                    field = document.DoubleField(self.name, value, self.type)
+                else:
+                    self.type.setNumericType(document.FieldType.NumericType.LONG)
+                    field = document.LongField(self.name, long(value), self.type)
             else:
-                field.longValue = long(value)
+                field = document.NumericField(self.name, self.step, self.store, self.index)
+                if isinstance(value, float):
+                    field.doubleValue = value
+                else:
+                    field.longValue = long(value)
             yield field
     def numeric(self, cls, start, stop, lower, upper):
         if isinstance(start, float) or isinstance(stop, float):
     "Multimapping of field names to values, but default getters return the first value."
     def __init__(self, doc):
         for field in doc.getFields():
-            self.setdefault(field.name(), []).append(field.binaryValue.string_ if field.binary else field.stringValue())
+            try:
+                value = field.binaryValue.string_ if field.binary else field.stringValue()
+            except AttributeError:
+                value = field.stringValue() or field.binaryValue() or field.numericValue().toString()
+            self.setdefault(field.name(), []).append(value)
     def __getitem__(self, name):
         return dict.__getitem__(self, name)[0]
     def get(self, name, default=None):
 
 def convert(value):
     "Return python object from java Object."
+    if hasattr(util, 'BytesRef') and util.BytesRef.instance_(value):
+        return util.BytesRef.cast_(value).utf8ToString()
     if not Number.instance_(value):
         return value.toString() if Object.instance_(value) else value
     value = Number.cast_(value)
         self.fields = fields
     def select(self, *fields):
         "Only load selected fields."
-        self.fields = document.MapFieldSelector(fields)
+        self.fields = getattr(document, 'MapFieldSelector', HashSet)(Arrays.asList(fields))
     def __len__(self):
         return len(self.scoredocs)
     def __getitem__(self, index):
             return type(self)(self.searcher, scoredocs, self.count, self.maxscore, self.fields)
         scoredoc = self.scoredocs[index]
         keys = search.FieldDoc.cast_(scoredoc).fields if search.FieldDoc.instance_(scoredoc) else ()
-        return Hit(self.searcher.doc(scoredoc.doc, self.fields), scoredoc.doc, scoredoc.score, keys)
+        if self.fields is None:
+            doc = self.searcher.doc(scoredoc.doc)
+        elif hasattr(search.IndexSearcher, 'document'):
+            doc = self.searcher.document(scoredoc.doc, self.fields)
+        else:
+            doc = self.searcher.doc(scoredoc.doc, self.fields)
+        return Hit(doc, scoredoc.doc, scoredoc.score, keys)
     @property
     def ids(self):
         return map(operator.attrgetter('doc'), self.scoredocs)

File lupyne/engine/indexers.py

 The final `Indexer`_ classes exposes a high-level Searcher and Writer.
 """
 
-from future_builtins import map, zip
+from future_builtins import filter, map, zip
 import os
 import itertools, operator
 import contextlib
 try:
     from java.io import File, StringReader
     from java.lang import Float
-    from java.util import HashMap
-    from org.apache.lucene import analysis, document, index, queryParser, search, store, util
+    from java.util import Arrays, HashMap, HashSet
+    from org.apache.lucene import analysis, document, index, search, store, util
     from org.apache.lucene.analysis import standard, tokenattributes
     from org.apache.lucene.index import memory
-    from org.apache.lucene.search import similar, spans
+    from org.apache.lucene.search import spans
     from org.apache.pylucene.analysis import PythonAnalyzer, PythonTokenFilter
-    from org.apache.pylucene.queryParser import PythonQueryParser
+    try:
+        from org.apache.lucene.queryparser import classic as queryParser
+        from org.apache.lucene.queries import mlt as similar
+        from org.apache.pylucene.queryparser.classic import PythonQueryParser
+    except ImportError:
+        from org.apache.lucene import queryParser
+        from org.apache.lucene.search import similar
+        from org.apache.pylucene.queryParser import PythonQueryParser
 except ImportError:
-    from lucene import File, StringReader, Float, HashMap, PythonAnalyzer, PythonTokenFilter, PythonQueryParser
+    from lucene import File, StringReader, Float, Arrays, HashMap, HashSet, PythonAnalyzer, PythonTokenFilter, PythonQueryParser
     analysis = document = index = queryParser = search = store = util = \
     standard = tokenattributes = memory = similar = spans = lucene
 from .queries import Query, TermsFilter, SortField, Highlighter, FastVectorHighlighter, SpellChecker, SpellParser
         reader = self.directory(reader)
         if isinstance(reader, index.IndexReader):
             reader.incRef()
+        elif isinstance(reader, index.IndexWriter):
+            reader = index.IndexReader.open(reader, True)
         else:
-            reader = index.IndexReader.open(reader, True)
+            reader = index.IndexReader.open(reader)
         return reader
 
 def copy(commit, dest):
     Optimized to use hard links if the destination is a file system path.
     """
     if isinstance(dest, store.Directory):
+        args = [store.IOContext.DEFAULT] if hasattr(store, 'IOContext') else []
         for filename in commit.fileNames:
-            commit.directory.copy(dest, filename, filename)
+            commit.directory.copy(dest, filename, filename, *args)
     else:
         src = IndexSearcher.path.fget(commit)
         os.path.isdir(dest) or os.makedirs(dest)
 
 class TokenStream(analysis.TokenStream):
     "TokenStream mixin with support for iteration and attributes cached as properties."
+    bytes = lucene.VERSION >= '4'
     def __iter__(self):
         return self
     def next(self):
     def payload(self):
         "Payload bytes."
         payload = self.Payload.payload
-        return payload and getattr(payload.data, 'string_', None)
+        return payload and (payload.utf8ToString() if self.bytes else getattr(payload.data, 'string_', None))
     @payload.setter
     def payload(self, data):
-        self.Payload.payload = index.Payload(lucene.JArray_byte(data.encode('utf8') if isinstance(data, unicode) else data))
+        data = lucene.JArray_byte(data.encode('utf8') if isinstance(data, unicode) else data)
+        self.Payload.payload = (util.BytesRef if self.bytes else index.Payload)(data)
     @property
     def positionIncrement(self):
         "Position relative to the previous token."
     @property
     def term(self):
         "Term text."
-        return self.Term.term()
+        return self.CharTerm.toString() if self.bytes else self.Term.term()
     @term.setter
     def term(self, text):
-        self.Term.setTermBuffer(text)
+        if self.bytes:
+            self.CharTerm.setEmpty()
+            self.CharTerm.append(text)
+        else:
+            self.Term.setTermBuffer(text)
     @property
     def type(self):
         "Lexical type."
 class Analyzer(PythonAnalyzer):
     """Return a lucene Analyzer which chains together a tokenizer and filters.
     
-    :param tokenizer: lucene Tokenizer or Analyzer
+    :param tokenizer: lucene Analyzer or Tokenizer factory
     :param filters: lucene TokenFilters
     """
     def __init__(self, tokenizer, *filters):
         PythonAnalyzer.__init__(self)
         self.tokenizer, self.filters = tokenizer, filters
-    def tokenStream(self, field, reader):
-        tokens = self.tokenizer.tokenStream(field, reader) if isinstance(self.tokenizer, analysis.Analyzer) else self.tokenizer(reader)
+    def components(self, field, reader):
+        source = tokens = self.tokenizer.tokenStream(field, reader) if isinstance(self.tokenizer, analysis.Analyzer) else self.tokenizer(reader)
         for filter in self.filters:
             tokens = filter(tokens)
-        return tokens
+        return source, tokens
+    def tokenStream(self, field, reader):
+        return self.components(field, reader)[1]
+    def createComponents(self, field, reader):
+        return analysis.Analyzer.TokenStreamComponents(*self.components(field, reader))
     def tokens(self, text, field=None):
         "Return lucene TokenStream from text."
         return self.tokenStream(field, StringReader(text))
         for name, value in attrs.items():
             setattr(parser, name, value)
         if isinstance(parser, queryParser.MultiFieldQueryParser):
-            return queryParser.MultiFieldQueryParser.parse(parser, query)
+            return parser.parse(parser, query)
         try:
             return parser.parse(query)
         finally:
     def __getattr__(self, name):
         if name == 'indexReader':
             raise AttributeError(name)
-        return getattr(self.indexReader, name)
+        cls = getattr(index, 'DirectoryReader', index.IndexReader)
+        return getattr(cls.cast_(self.indexReader), name)
     def __len__(self):
         return self.numDocs()
     def __contains__(self, id):
-        return 0 <= id < self.maxDoc() and not self.isDeleted(id)
+        if 0 <= id < self.maxDoc():
+            if hasattr(self, 'isDeleted'):
+                return not self.isDeleted(id)
+            bits = index.MultiFields.getLiveDocs(self.indexReader)
+            return bits is None or bits.get(id)
+        return False
     def __iter__(self):
-        return itertools.ifilterfalse(self.isDeleted, xrange(self.maxDoc()))
+        ids = xrange(self.maxDoc())
+        if not self.hasDeletions():
+            return iter(ids)
+        if hasattr(self, 'isDeleted'):
+            return itertools.ifilterfalse(self.isDeleted, ids)
+        return filter(index.MultiFields.getLiveDocs(self.indexReader).get, ids)
     def __getitem__(self, id):
         return Document(self.document(id))
     @property
     def directory(self):
         "reader's lucene Directory"
-        return self.indexReader.directory()
+        return self.__getattr__('directory')()
     @property
     def path(self):
         "FSDirectory path"
     @property
     def timestamp(self):
         "timestamp of reader's last commit"
-        return self.indexCommit.timestamp / 1000.0
+        commit = self.indexCommit
+        try:
+            modified = commit.timestamp
+        except AttributeError:
+            modified = store.FSDirectory.fileModified(store.FSDirectory.cast_(commit.directory).directory, commit.segmentsFileName)
+        return modified * 0.001
     @property
     def readers(self):
         "segment readers"
-        return map(index.SegmentReader.cast_, self.sequentialSubReaders)
+        readers = (context.reader() for context in self.leaves()) if hasattr(self, 'leaves') else self.sequentialSubReaders
+        return map(index.SegmentReader.cast_, readers)
     @property
     def segments(self):
         "segment filenames with document counts"
         """
         if hasattr(index.IndexReader, 'getFieldNames'):
             return list(self.getFieldNames(getattr(self.FieldOption, option.upper())))
-        fieldinfos = util.ReaderUtil.getMergedFieldInfos(self.indexReader).iterator()
+        module = index.MultiFields if hasattr(index, 'MultiFields') else util.ReaderUtil
+        fieldinfos = module.getMergedFieldInfos(self.indexReader).iterator()
         return [fieldinfo.name for fieldinfo in fieldinfos if all(getattr(fieldinfo, name) == attrs[name] for name in attrs)]
     def terms(self, name, value='', stop=None, counts=False, **fuzzy):
         """Generate a slice of term values, optionally with frequency counts.
         :param fuzzy: optional keyword arguments for fuzzy terms
         """
         term = index.Term(name, value)
-        if fuzzy:
-            args = fuzzy.pop('minSimilarity', 0.5), fuzzy.pop('prefixLength', 0)
-            termenum = search.FuzzyTermEnum(self.indexReader, term, *args, **fuzzy)
-        elif '*' in value or '?' in value:
-            value = value.rstrip('*')
-            if '*' in value or '?' in value:
-                warnings.warn('Wildcard term enumeration has been removed from lucene 4; use a prefix instead.', DeprecationWarning)
-            termenum = search.WildcardTermEnum(self.indexReader, term)
+        args = fuzzy.get('minSimilarity', 0.5), fuzzy.get('prefixLength', 0)
+        if hasattr(index, 'MultiFields'):
+            terms = index.MultiFields.getTerms(self.indexReader, name)
+            if terms:
+                if fuzzy:
+                    termenum = search.FuzzyTermsEnum(terms, util.AttributeSource(), term, args[0], args[1], False)
+                elif value.endswith('*'): 
+                    termenum = search.PrefixTermsEnum(terms.iterator(None), util.BytesRef(value.rstrip('*')))
+                else:
+                    termenum = search.TermRangeTermsEnum(terms.iterator(None), util.BytesRef(value), stop and util.BytesRef(stop), True, False)
+                for bytesref in util.BytesRefIterator.cast_(termenum):
+                    text = bytesref.utf8ToString()
+                    yield (text, termenum.docFreq()) if counts else text
         else:
-            termenum = search.TermRangeTermEnum(self.indexReader, name, value, stop, True, False, None)
-        with contextlib.closing(termenum):
-            term = termenum.term()
-            while term:
-                text = term.text()
-                yield (text, termenum.docFreq()) if counts else text
-                term = termenum.next() and termenum.term()
+            if fuzzy:
+                termenum = search.FuzzyTermEnum(self.indexReader, term, *args, **fuzzy)
+            elif '*' in value or '?' in value:
+                value = value.rstrip('*')
+                if '*' in value or '?' in value:
+                    warnings.warn('Wildcard term enumeration has been removed from lucene 4; use a prefix instead.', DeprecationWarning)
+                termenum = search.WildcardTermEnum(self.indexReader, term)
+            else:
+                termenum = search.TermRangeTermEnum(self.indexReader, name, value, stop, True, False, None)
+            with contextlib.closing(termenum):
+                term = termenum.term()
+                while term:
+                    text = term.text()
+                    yield (text, termenum.docFreq()) if counts else text
+                    term = termenum.next() and termenum.term()
     def numbers(self, name, step=0, type=int, counts=False):
         """Generate decoded numeric term values, optionally with frequency counts.
         
         term = index.Term(name, chr(ord(' ') + step))
         decode = util.NumericUtils.prefixCodedToLong
         convert = util.NumericUtils.sortableLongToDouble if issubclass(type, float) else int
-        with contextlib.closing(search.PrefixTermEnum(self.indexReader, term)) as termenum:
-            term = termenum.term()
-            while term:
-                value = convert(decode(term.text()))
+        if hasattr(index, 'MultiFields'):
+            terms = index.MultiFields.getTerms(self.indexReader, name)
+            termenum = search.PrefixTermsEnum(terms.iterator(None), util.BytesRef(term.text()))
+            for bytesref in util.BytesRefIterator.cast_(termenum):
+                value = convert(decode(bytesref))
                 yield (value, termenum.docFreq()) if counts else value
-                term = termenum.next() and termenum.term()
+        else:
+            with contextlib.closing(search.PrefixTermEnum(self.indexReader, term)) as termenum:
+                term = termenum.term()
+                while term:
+                    value = convert(decode(term.text()))
+                    yield (value, termenum.docFreq()) if counts else value
+                    term = termenum.next() and termenum.term()
     def docs(self, name, value, counts=False):
         "Generate doc ids which contain given term, optionally with frequency counts."
-        with contextlib.closing(self.termDocs(index.Term(name, value))) as termdocs:
-            while termdocs.next():
-                doc = termdocs.doc()
-                yield (doc, termdocs.freq()) if counts else doc
+        if hasattr(index, 'MultiFields'):
+            docsenum = index.MultiFields.getTermDocsEnum(self.indexReader, index.MultiFields.getLiveDocs(self.indexReader), name, util.BytesRef(value))
+            if docsenum:
+                for doc in iter(docsenum.nextDoc, index.DocsEnum.NO_MORE_DOCS):
+                    yield (doc, docsenum.freq()) if counts else doc
+        else:
+            with contextlib.closing(self.termDocs(index.Term(name, value))) as termdocs:
+                while termdocs.next():
+                    doc = termdocs.doc()
+                    yield (doc, termdocs.freq()) if counts else doc
     def positions(self, name, value, payloads=False):
         "Generate doc ids and positions which contain given term, optionally only with payloads."
-        array = lucene.JArray_byte('')
-        with contextlib.closing(self.termPositions(index.Term(name, value))) as termpositions:
-            while termpositions.next():
-                doc = termpositions.doc()
-                positions = (termpositions.nextPosition() for n in xrange(termpositions.freq()))
-                if payloads:
-                    yield doc, [(position, termpositions.getPayload(array, 0).string_) for position in positions if termpositions.payloadAvailable]
-                else:
-                    yield doc, list(positions)
+        if hasattr(index, 'MultiFields'):
+            docsenum = index.MultiFields.getTermPositionsEnum(self.indexReader, index.MultiFields.getLiveDocs(self.indexReader), name, util.BytesRef(value))
+            if docsenum:
+                for doc in iter(docsenum.nextDoc, index.DocsEnum.NO_MORE_DOCS):
+                    positions = (docsenum.nextPosition() for n in xrange(docsenum.freq()))
+                    if payloads:
+                        yield doc, [(position, docsenum.payload.utf8ToString()) for position in positions if docsenum.payload]
+                    else:
+                        yield doc, list(positions)
+        else:
+            array = lucene.JArray_byte('')
+            with contextlib.closing(self.termPositions(index.Term(name, value))) as termpositions:
+                while termpositions.next():
+                    doc = termpositions.doc()
+                    positions = (termpositions.nextPosition() for n in xrange(termpositions.freq()))
+                    if payloads:
+                        yield doc, [(position, termpositions.getPayload(array, 0).string_) for position in positions if termpositions.payloadAvailable]
+                    else:
+                        yield doc, list(positions)
     def comparator(self, name, type='string', parser=None):
         """Return cache of field values suitable for sorting.
         Parsing values into an array is memory optimized.
         :param positions: optionally include slice positions instead of counts
         :param payloads: optionally only include slice positions with payloads
         """
-        spans_ = itertools.takewhile(spans.Spans.next, itertools.repeat(query.getSpans(self.indexReader)))
-        for doc, spans_ in itertools.groupby(spans_, key=spans.Spans.doc):
-            if payloads:
-                yield doc, [(span.start(), span.end(), [lucene.JArray_byte.cast_(data).string_ for data in span.payload]) \
-                    for span in spans_ if span.payloadAvailable]
-            elif positions:
-                yield doc, [(span.start(), span.end()) for span in spans_]
-            else:
-                yield doc, sum(1 for span in spans_)
+        offset = 0
+        for reader in self.readers:
+            spans_ = query.getSpans(reader.context, reader.liveDocs, HashMap()) if hasattr(reader, 'context') else query.getSpans(reader)
+            for doc, spans_ in itertools.groupby(itertools.takewhile(spans.Spans.next, itertools.repeat(spans_)), key=spans.Spans.doc):
+                doc += offset
+                if payloads:
+                    yield doc, [(span.start(), span.end(), [lucene.JArray_byte.cast_(data).string_ for data in span.payload]) \
+                        for span in spans_ if span.payloadAvailable]
+                elif positions:
+                    yield doc, [(span.start(), span.end()) for span in spans_]
+                else:
+                    yield doc, sum(1 for span in spans_)
+            offset += reader.maxDoc()
     def termvector(self, id, field, counts=False):
         "Generate terms for given doc id and field, optionally with frequency counts."
-        tfv = self.getTermFreqVector(id, field) or search.QueryTermVector([])
-        return zip(tfv.terms, tfv.termFrequencies) if counts else iter(tfv.terms)
+        if hasattr(index.IndexReader, 'getTermFreqVector'):
+            tfv = self.getTermFreqVector(id, field) or search.QueryTermVector([])
+            for item in zip(tfv.terms, tfv.termFrequencies) if counts else tfv.terms:
+                yield item
+        else:
+            terms = self.getTermVector(id, field)
+            if terms:
+                termenum = terms.iterator(None)
+                for bytesref in util.BytesRefIterator.cast_(termenum):
+                    term = bytesref.utf8ToString()
+                    yield (term, termenum.totalTermFreq()) if counts else term
     def positionvector(self, id, field, offsets=False):
         "Generate terms and positions for given doc id and field, optionally with character offsets."
-        tpv = index.TermPositionVector.cast_(self.getTermFreqVector(id, field))
-        for idx, term in enumerate(tpv.terms):
-            if offsets:
-                yield term, list(map(operator.attrgetter('startOffset', 'endOffset'), tpv.getOffsets(idx)))
-            else:
-                yield term, list(tpv.getTermPositions(idx))
+        if hasattr(index.IndexReader, 'getTermFreqVector'):
+            tpv = index.TermPositionVector.cast_(self.getTermFreqVector(id, field))
+            for idx, term in enumerate(tpv.terms):
+                if offsets:
+                    yield term, list(map(operator.attrgetter('startOffset', 'endOffset'), tpv.getOffsets(idx)))
+                else:
+                    yield term, list(tpv.getTermPositions(idx))
+        else:
+            termenum = self.getTermVector(id, field).iterator(None)
+            for bytesref in util.BytesRefIterator.cast_(termenum):
+                term = bytesref.utf8ToString()
+                docsenum = termenum.docsAndPositions(None, None)
+                positions = (docsenum.nextPosition() for n in xrange(docsenum.freq()))
+                if offsets:
+                    yield term, [(docsenum.startOffset(), docsenum.endOffset()) for position in positions]
+                else:
+                    yield term, list(positions)
     def morelikethis(self, doc, *fields, **attrs):
         """Return MoreLikeThis query for document.
         
         mlt.fieldNames = fields or None
         for name, value in attrs.items():
             setattr(mlt, name, value)
-        return mlt.like(StringReader(doc) if isinstance(doc, basestring) else doc)
+        try:
+            return mlt.like(StringReader(doc), '') if isinstance(doc, basestring) else mlt.like(doc)
+        except lucene.InvalidArgsError:
+            return mlt.like(StringReader(doc))
     def overlap(self, left, right):
         "Return intersection count of cached filters."
         count, bitset = 0, getattr(util, 'FixedBitSet', util.OpenBitSet)
         for reader in self.readers:
-            docsets = left.getDocIdSet(reader), right.getDocIdSet(reader)
-            if search.DocIdSet.EMPTY_DOCIDSET not in docsets:
-                bits = [bitset.cast_(docset).bits for docset in docsets]
+            if hasattr(reader, 'liveDocs'):
+                docsets = [filter.getDocIdSet(reader.context, reader.liveDocs).bits() for filter in (left, right)]
+            else:
+                docsets = left.getDocIdSet(reader), right.getDocIdSet(reader)
+            if all(map(bitset.instance_, docsets)):
+                bits = [bitset.cast_(docset).getBits() for docset in docsets]
                 count += util.BitUtil.pop_intersect(bits[0], bits[1], 0, min(map(len, bits)))
         return int(count)
 
     def load(cls, directory, analyzer=None):
         "Open `IndexSearcher`_ with a lucene RAMDirectory, loading index into memory."
         ref = closing()
-        self = cls(store.RAMDirectory(ref.directory(directory)), analyzer)
+        directory = ref.directory(directory)
+        try:
+            directory = store.RAMDirectory(directory)
+        except lucene.InvalidArgsError:
+            directory = store.RAMDirectory(directory, store.IOContext.DEFAULT)
+        self = cls(directory, analyzer)
         self.shared.add(self.directory)
         return self
     def __del__(self):
         :param sorters: refresh cached :attr:`sorters` with associated parsers
         :param spellcheckers: refresh cached :attr:`spellcheckers`
         """
-        if self.current:
+        try:
+            reader = index.DirectoryReader.openIfChanged(index.DirectoryReader.cast_(self.indexReader))
+        except AttributeError:
+            reader = None if self.current else self.indexReader.reopen()
+        except TypeError:
+            readers = [index.DirectoryReader.openIfChanged(index.DirectoryReader.cast_(reader)) for reader in self.sequentialSubReaders]
+            reader = index.MultiReader([new or old for new, old in zip(readers, self.sequentialSubReaders)]) if any(readers) else None
+        if reader is None:
             return self
-        other = type(self)(self.indexReader.reopen(), self.analyzer)
+        other = type(self)(reader, self.analyzer)
         other.decRef()
         other.shared = self.shared
         other.filters.update((key, value if isinstance(value, search.Filter) else dict(value)) for key, value in self.filters.items())
         return Document(self.doc(id))
     def get(self, id, *fields):
         "Return `Document`_ with only selected fields loaded."
-        return Document(self.doc(id, document.MapFieldSelector(fields)))
+        return Document(self.document(id, getattr(document, 'MapFieldSelector', HashSet)(Arrays.asList(fields))))
     def parse(self, query, spellcheck=False, **kwargs):
         if isinstance(query, search.Query):
             return query
     def highlighter(self, query, field, **kwargs):
         "Return `Highlighter`_ or if applicable `FastVectorHighlighter`_ specific to searcher and query."
         query = self.parse(query, field=field)
-        vector = field in self.names('termvector_with_position_offset', storeTermVector=True)
+        if hasattr(index, 'MultiFields'):
+            fieldinfo = index.MultiFields.getMergedFieldInfos(self.indexReader).fieldInfo(field)
+            vector = fieldinfo and fieldinfo.hasVectors()
+        else:
+            vector = field in self.names('termvector_with_position_offset', storeTermVector=True)
         return (FastVectorHighlighter if vector else Highlighter)(self, query, field, **kwargs)
     def count(self, *query, **options):
         """Return number of hits for given query or term.
         if isinstance(query, search.Query):
             query = search.QueryWrapperFilter(query)
         if not isinstance(query, search.CachingWrapperFilter):
-            query = search.CachingWrapperFilter(query)
+            flag = search.CachingWrapperFilter.DeletesMode.RECACHE if hasattr(search.CachingWrapperFilter, 'DeletesMode') else True
+            query = search.CachingWrapperFilter(query, flag)
         for key in keys:
             filters = self.filters.get(key)
             if key in self.groupings:
         IndexSearcher.__init__(self, reader, analyzer)
         self.shared.update(shared)
         shared.clear()
-        self.version = sum(reader.version for reader in self.sequentialSubReaders)
+        if not hasattr(self, 'sequentialSubReaders'):
+            self.sequentialSubReaders = [context.reader() for context in self.context.children()]
+        self.version = sum(IndexReader(reader).version for reader in self.sequentialSubReaders)
+    def __getattr__(self, name):
+        return getattr(index.MultiReader.cast_(self.indexReader), name)
     @property
     def readers(self):
         return itertools.chain.from_iterable(IndexReader(reader).readers for reader in self.sequentialSubReaders)

File lupyne/engine/queries.py

     from org.apache.lucene import document, index, search, util
     from org.apache.lucene.search import highlight, spans, vectorhighlight
     from org.apache.pylucene import search as search_
-    from org.apache.pylucene.queryParser import PythonQueryParser
+    try:
+        from org.apache.lucene import queries
+        from org.apache.pylucene.queryparser.classic import PythonQueryParser
+    except ImportError:
+        queries = search
+        from org.apache.pylucene.queryParser import PythonQueryParser
 except ImportError:
     from lucene import Integer, Arrays, HashSet, PythonQueryParser
-    document = index = search = util = highlight = spans = vectorhighlight = search_ = lucene
+    document = index = search = util = highlight = queries = spans = vectorhighlight = search_ = lucene
 
 class Query(object):
     """Inherited lucene Query, with dynamic base class acquisition.
         elif isinstance(self, search.TermRangeQuery):
             filter = search.TermRangeFilter(self.field, self.lowerTerm, self.upperTerm, self.includesLower(), self.includesUpper())
         elif isinstance(self, search.TermQuery):
-            filter = search.TermsFilter()
+            filter = queries.TermsFilter()
             filter.addTerm(self.getTerm())
         else:
             filter = search.QueryWrapperFilter(self)
-        return search.CachingWrapperFilter(filter) if cache else filter
+        if not cache:
+            return filter
+        flag = search.CachingWrapperFilter.DeletesMode.RECACHE if hasattr(search.CachingWrapperFilter, 'DeletesMode') else True
+        return search.CachingWrapperFilter(filter, flag)
     def terms(self):
         "Generate set of query term items."
         terms = HashSet().of_(index.Term)
     @classmethod
     def range(cls, name, start, stop, lower=True, upper=False):
         "Return lucene RangeQuery, by default with a half-open interval."
+        try:
+            return cls(search.TermRangeQuery, name, start, stop, lower, upper)
+        except lucene.InvalidArgsError:
+            start, stop = (value if value is None else util.BytesRef(value) for value in (start, stop))
         return cls(search.TermRangeQuery, name, start, stop, lower, upper)
     @classmethod
     def phrase(cls, name, *values):
         "Return lucene WildcardQuery."
         return cls(search.WildcardQuery, index.Term(name, value))
     @classmethod
-    def fuzzy(cls, name, value, minimumSimilarity=0.5, prefixLength=0):
+    def fuzzy(cls, name, value, minimumSimilarity=None, prefixLength=0):
         "Return lucene FuzzyQuery."
+        if minimumSimilarity is None:
+            minimumSimilarity = getattr(search.FuzzyQuery, 'defaultMaxEdits', 0.5)
         return cls(search.FuzzyQuery, index.Term(name, value), minimumSimilarity, prefixLength)
     def __pos__(self):
         return Query.all(self)
     "Inherited lucene SpanQuery with additional span constructors."
     def filter(self, cache=True):
         "Return lucene CachingSpanFilter, optionally just SpanQueryFilter."
+        if not hasattr(search, 'SpanQueryFilter'):
+            return Query.filter(self, cache)
         filter = search.SpanQueryFilter(self)
         return search.CachingSpanFilter(filter) if cache else filter
     def __getitem__(self, slc):
     ops = {'or': 'update', 'and': 'intersection_update', 'andNot': 'difference_update'}
     def __init__(self, field, values=()):
         assert lucene.VERSION >= '3.5', 'requires FixedBitSet set operations introduced in lucene 3.5'
-        search.CachingWrapperFilter.__init__(self, search.TermsFilter())
+        args = [True] if lucene.VERSION >= '4' else []
+        search.CachingWrapperFilter.__init__(self, queries.TermsFilter(), *args)
         self.field = field
         self.values = set(values)
         self.readers = set()
         "Return lucene TermsFilter, optionally using the FieldCache."
         if cache:
             return search.FieldCacheTermsFilter(self.field, tuple(values))
-        filter = search.TermsFilter()
+        filter = queries.TermsFilter()
         for value in values:
             filter.addTerm(index.Term(self.field, value))
         return filter
     def apply(self, filter, op, readers):
         for reader in readers:
-            bitset = util.FixedBitSet.cast_(self.getDocIdSet(reader))
-            getattr(bitset, op)(filter.getDocIdSet(reader).iterator())
+            try:
+                args = [reader.context, reader.liveDocs] if hasattr(index.IndexReader, 'context') else [reader]
+                bitset = util.FixedBitSet.cast_(self.getDocIdSet(*args))
+                getattr(bitset, op)(filter.getDocIdSet(*args).iterator())
+            except lucene.JavaError as exc:
+                assert not reader.refCount, exc
     def update(self, values, op='or', cache=True):
         """Update allowed values and corresponding cached bitsets.
         
         self.arrays = list(arrays)
         self.offsets = [0]
         for array in self.arrays:
-            self.offsets.append(len(self) + len(array))
+            self.offsets.append(len(self) + getattr(type(array), 'size', len)(array))
     def __len__(self):
         return self.offsets[-1]
-    def __iter__(self):
-        return itertools.chain(*self.arrays)
     def __getitem__(self, index):
         point = bisect.bisect_right(self.offsets, index) - 1
-        return self.arrays[point][index - self.offsets[point]]
+        index -= self.offsets[point]
+        array = self.arrays[point]
+        return array.getTerm(index, util.BytesRef()).utf8ToString() if hasattr(array, 'getTerm') else array[index]
 
 class SortField(search.SortField):
     """Inherited lucene SortField used for caching FieldCache parsers.
     def __init__(self, name, type='string', parser=None, reverse=False):
         type = self.typename = getattr(type, '__name__', type).capitalize()
         if parser is None:
-            parser = getattr(search.SortField, type.upper())
+            parser = getattr(getattr(self, 'Type', self), type.upper())
         elif not search.FieldCache.Parser.instance_(parser):
             base = getattr(search_, 'Python{0}Parser'.format(type))
             namespace = {'parse' + type: staticmethod(parser)}
             parser = object.__class__(base.__name__, (base,), namespace)()
         search.SortField.__init__(self, name, parser, reverse)
     def array(self, reader):
-        method = getattr(search.FieldCache.DEFAULT, 'get{0}s'.format(self.typename))
-        return method(reader, self.field, *[self.parser][:bool(self.parser)])
+        try:
+            method = getattr(search.FieldCache.DEFAULT, 'get{0}s'.format(self.typename))
+        except AttributeError:
+            if self.typename != 'String':
+                raise
+            return search.FieldCache.DEFAULT.getTermsIndex(reader, self.field)
+        args = []
+        if self.parser:
+            args.append(self.parser)
+        if lucene.VERSION >= '4':
+            args.append(False)
+        return method(reader, self.field, *args)
     def comparator(self, searcher):
         "Return indexed values from default FieldCache using the given searcher."
         arrays = list(map(self.array, searcher.readers))
-        return arrays[0] if len(arrays) <= 1 else Comparator(arrays)
+        return arrays[0] if len(arrays) <= 1 and not hasattr(arrays[0], 'getTerm') else Comparator(arrays)
     def filter(self, start, stop, lower=True, upper=False):
         "Return lucene FieldCacheRangeFilter based on field and type."
         method = getattr(search.FieldCacheRangeFilter, 'new{0}Range'.format(self.typename))
     def terms(self, filter, *readers):
         "Generate field cache terms from docs which match filter from all segments."
         for reader in readers:
-            array, docset = self.array(reader), filter.getDocIdSet(reader)
-            for id in iter(docset.iterator().nextDoc, search.DocIdSetIterator.NO_MORE_DOCS):
-                yield array[id]
+            args = [reader.context, reader.liveDocs] if hasattr(reader, 'liveDocs') else [reader]
+            array, docset = self.array(reader), filter.getDocIdSet(*args)
+            ids = iter(docset.iterator().nextDoc, search.DocIdSetIterator.NO_MORE_DOCS)
+            if hasattr(array, 'getTerm'):
+                br = util.BytesRef()
+                for id in ids:
+                    yield array.getTerm(id, br).utf8ToString()
+            else:
+                for id in ids:
+                    yield array[id]
 
 class Highlighter(highlight.Highlighter):
     """Inherited lucene Highlighter with stored analysis options.
         scorer = (highlight.QueryTermScorer if terms else highlight.QueryScorer)(query, *(searcher.indexReader, field) * (not fields))
         highlight.Highlighter.__init__(self, *filter(None, [formatter, encoder, scorer]))
         self.searcher, self.field = searcher, field
-        self.selector = document.MapFieldSelector([field])
+        self.selector = getattr(document, 'MapFieldSelector', HashSet)(Arrays.asList([field]))
     def fragments(self, doc, count=1):
         """Return highlighted text fragments.
         
         :param count: maximum number of fragments
         """
         if not isinstance(doc, basestring):
-            doc = self.searcher.doc(doc, self.selector)[self.field]
+            doc = getattr(search.IndexSearcher, 'document', search.IndexSearcher.doc)(self.searcher, doc, self.selector)[self.field]
         return doc and list(self.getBestFragments(self.searcher.analyzer, self.field, doc, count))
 
 class FastVectorHighlighter(vectorhighlight.FastVectorHighlighter):

File lupyne/server.py

 import warnings
 import lucene
 try:
-    from org.apache.lucene import index, search
+    from org.apache.lucene import search
 except ImportError:
-    index = search = lucene
+    search = lucene
 import cherrypy
 try:
     from . import engine, client
         if cherrypy.request.method == 'POST':
             self.sync(host, path)
             cherrypy.response.status = httplib.ACCEPTED
-        reader = self.searcher.indexReader
-        readers = reader.sequentialSubReaders if index.MultiReader.instance_(reader) else [reader]
-        return dict((unicode(reader.directory()), reader.numDocs()) for reader in readers)
+        if isinstance(self.searcher, engine.MultiSearcher):
+            readers = map(engine.indexers.IndexReader, self.searcher.sequentialSubReaders)
+            return dict((reader.directory.toString(), reader.numDocs()) for reader in readers)
+        return {self.searcher.directory.toString(): len(self.searcher)}
     @cherrypy.expose
     @cherrypy.tools.json_in(process_body=dict)
     @cherrypy.tools.allow(methods=['POST'])
             mltfields = options.pop('mlt.fields', ())
             with HTTPError(httplib.BAD_REQUEST, ValueError):
                 attrs = dict((key.partition('.')[-1], json.loads(options[key])) for key in options if key.startswith('mlt.'))
-            q = searcher.morelikethis(mlt, *mltfields, **attrs)
+            q = searcher.morelikethis(mlt, *mltfields, analyzer=searcher.analyzer, **attrs)
         if count is not None:
             count += start
         if count == 0:
 def init(vmargs='-Xrs', **kwargs):
     "Callback to initialize VM and app roots after daemonizing."
     lucene.initVM(vmargs=vmargs, **kwargs)
+    try:
+        from org.apache.lucene import codecs
+    except ImportError:
+        pass
+    else:  # initialize codecs in main thread to avoid ExceptionInInitializerError
+        codecs.Codec.getDefault()
     for app in cherrypy.tree.apps.values():
         if isinstance(app.root, WebSearcher):
             app.root.__init__(*app.root.__dict__.pop('args'), **app.root.__dict__.pop('kwargs'))
 ==================
  * Engine:
    
-   - PyLucene 3.3 and 3.4 deprecated
+   - PyLucene 4.0 supported
+   - PyLucene 3.2, 3.3, and 3.4 deprecated
    - Optimized searching and sorting with unlimited count
    - Support for contrib grouping collectors and faceting
    - FieldCache comparators optimized for memory and real-time searching

File test/distributed.py

     
     def testInterface(self):
         "Distributed reading and writing."
-        self.servers += map(self.start, self.ports)
+        for port in self.ports:
+            self.start(port)
         resources = client.Resources(self.hosts, limit=1)
         assert resources.unicast('GET', '/')
         assert not resources.unicast('POST', '/terms')
             docs += result['docs']
         assert len(docs) == len(resources) + 1
         assert len(set(doc['__id__'] for doc in docs)) == 2
-        self.stop(self.servers.pop(0))
+        self.stop(self.ports[0])
         self.assertRaises(socket.error, resources.broadcast, 'GET', '/')
         assert resources.unicast('GET', '/')()
         del resources[self.hosts[0]]
     
     def testSharding(self):
         "Sharding of indices across servers."
-        self.servers += map(self.start, self.ports)
+        for port in self.ports:
+            self.start(port)
         keys = range(len(self.hosts))
         shards = client.Shards(zip(self.hosts * 2, heapq.merge(keys, keys)), limit=1)
         shards.resources.broadcast('PUT', '/fields/zone', {'store': 'yes'})
             assert len(docs) == 2
             zones.update(doc['zone'] for doc in docs)
         assert zones == set('012')
-        self.stop(self.servers.pop(0))
+        self.stop(self.ports[0])
         self.assertRaises(socket.error, shards.broadcast, 0, 'GET', '/')
         responses = shards.multicast([0, 1, 2], 'GET', '/')
         assert len(responses) == 2 and all(response() for response in responses)
         "Replication from indexer to searcher."
         directory = os.path.join(self.tempdir, 'backup')
         sync, update = '--autosync=' + self.hosts[0], '--autoupdate=1'
-        self.servers += (
-            self.start(self.ports[0], self.tempdir),
-            self.start(self.ports[1], '-r', directory, sync, update),
-            self.start(self.ports[2], '-r', directory),
-        )
+        self.start(self.ports[0], self.tempdir),
+        self.start(self.ports[1], '-r', directory, sync, update),
+        self.start(self.ports[2], '-r', directory),
         for args in [('-r', self.tempdir), (update, self.tempdir), (update, self.tempdir, self.tempdir)]:
             assert subprocess.call((sys.executable, '-m', 'lupyne.server', sync) + args, stderr=subprocess.PIPE)
         replicas = client.Replicas(self.hosts[:2], limit=1)
         resource = client.Resource(self.hosts[1])
         time.sleep(1.1)
         assert sum(resource.get('/').values()) == 2
-        self.stop(self.servers.pop())
+        self.stop(self.ports[-1])
         root = server.WebSearcher(directory, hosts=self.hosts[:2])
         app = server.mount(root)
         root.fields = {}
         assert root.update() == 2
         assert len(root.hosts) == 2
-        self.stop(self.servers.pop(0))
+        self.stop(self.ports[0])
         assert replicas.get('/docs')
         assert replicas.call('POST', '/docs', []).status == httplib.METHOD_NOT_ALLOWED
         assert replicas.get('/terms', option='indexed') == []
         assert replicas.call('POST', '/docs', [], retry=True).status == httplib.METHOD_NOT_ALLOWED
         assert root.update() == 2
         assert len(root.hosts) == 1
-        self.stop(self.servers.pop(0))
+        self.stop(self.ports[1])
         assert root.update() == 2
         assert len(root.hosts) == 0 and isinstance(app.root, server.WebIndexer)
         app.root.close()

File test/local.py

     analysis = document = search = store = util = miscellaneous = standard = grouping = highlight = vectorhighlight = lucene
 from lupyne import engine
 from . import fixture
+if not hasattr(analysis, 'PorterStemFilter'):
+    analysis.PorterStemFilter = analysis.en.PorterStemFilter
+if hasattr(analysis, 'core'):
+    analysis.WhitespaceAnalyzer, analysis.WhitespaceTokenizer = analysis.core.WhitespaceAnalyzer, analysis.core.WhitespaceTokenizer
 
 class typeAsPayload(engine.TokenFilter):
     "Custom implementation of lucene TypeAsPayloadTokenFilter."
 
 class Filter(PythonFilter):
     "Broken filter to test errors are raised."
-    def getDocIdSet(self, indexReader):
+    def getDocIdSet(self, *args):
         assert False
 
 class BaseTest(unittest.TestCase):
     def testInterface(self):
         "Indexer and document interfaces."
         self.assertRaises(TypeError, engine.IndexSearcher)
-        analyzer = standard.StandardAnalyzer(util.Version.values()[-1])
+        analyzer = lambda reader: standard.StandardTokenizer(util.Version.values()[-1], reader)
         stemmer = engine.Analyzer(analyzer, analysis.PorterStemFilter, typeAsPayload)
         for token in stemmer.tokens('hello'):
             assert token.positionIncrement == 1
         assert str(stemmer.parse('hellos', field=['body', 'title'])) == 'body:hello title:hello'
         assert str(stemmer.parse('hellos', field={'body': 1.0, 'title': 2.0})) == 'body:hello title:hello^2.0'
         indexer = engine.Indexer(analyzer=stemmer, version=util.Version.LUCENE_30, writeLockTimeout=100L)
-        assert indexer.writeLockTimeout == 100
+        assert indexer.config.writeLockTimeout == 100
         self.assertRaises(lucene.JavaError, engine.Indexer, indexer.directory)
         indexer.set('text')
-        indexer.set('name', store=True, index=False, boost=2.0)
-        for field in indexer.fields['name'].items('sample'):
-            assert isinstance(field, document.Field) and field.boost == 2.0
-        indexer.set('tag', store=True, index=True)
+        indexer.set('name', store=True, index=False)
+        indexer.set('tag', store=True, index=True, boost=2.0)
+        for field in indexer.fields['tag'].items('sample'):
+            assert isinstance(field, document.Field) and getattr(field, 'getBoost', field.boost)() == 2.0
         searcher = indexer.indexSearcher
         indexer.commit()
         assert searcher is indexer.indexSearcher
         assert not searcher.search(count=1)
         indexer.add(text='hello worlds', name='sample', tag=['python', 'search'])
         assert len(indexer) == 1 and list(indexer) == []
-        assert not indexer.optimized
         indexer.commit()
         assert searcher is not indexer.indexSearcher
         assert list(indexer) == [0]
-        assert indexer.current and indexer.optimized
+        assert indexer.current
         assert 0 in indexer and 1 not in indexer
         doc = indexer[0]
         assert doc == {'tag': ['python', 'search'], 'name': ['sample']}
         assert not list(indexer.termvector(0, 'tag'))
         assert indexer.count('text', 'hello') == indexer.count('text:hello') == 1
         assert sorted(indexer.names()) == ['name', 'tag', 'text']
-        assert sorted(indexer.names('indexed', isIndexed=True)) == ['tag', 'text']
-        assert indexer.names('unindexed', isIndexed=False) == ['name']
+        try:
+            names = indexer.names(indexed=True)
+        except AttributeError:
+            names = indexer.names('indexed', isIndexed=True)
+        assert sorted(names)[-2:] == ['tag', 'text']
+        try:
+            names = indexer.names(indexed=False)
+        except AttributeError:
+            names = indexer.names('unindexed', isIndexed=False)
+        assert 'name' in names
         assert list(indexer.terms('text')) == ['hello', 'world']
         assert list(indexer.terms('text', 'h', 'v')) == ['hello']
         assert dict(indexer.terms('text', 'w', counts=True)) == {'world': 1}
         assert str(search.MatchAllDocsQuery() | query) == '*:* text:*'
         assert str(search.MatchAllDocsQuery() - query) == '*:* -text:*'
         query = +query
-        query &= engine.Query.fuzzy('text', 'hello')
-        query |= engine.Query.fuzzy('text', 'hello', 0.1)
-        assert str(query) == '+text:* +text:hello~0.5 text:hello~0.1'
+        if hasattr(search.FuzzyQuery, 'defaultMaxEdits'):
+            query &= engine.Query.fuzzy('text', 'hello')
+            query |= engine.Query.fuzzy('text', 'hello', 1)
+            assert str(query) == '+text:* +text:hello~2 text:hello~1'
+        else:
+            query &= engine.Query.fuzzy('text', 'hello')
+            query |= engine.Query.fuzzy('text', 'hello', 0.1)
+            assert str(query) == '+text:* +text:hello~0.5 text:hello~0.1'
         query = engine.Query.span('text', 'world')
         assert str(query.mask('name')) == 'mask(text:world) as name'
         assert str(query.payload()) == 'spanPayCheck(text:world, payloadRef: )'
-        assert isinstance(query.filter(cache=False), search.SpanQueryFilter) and isinstance(query.filter(), search.CachingSpanFilter)
+        assert isinstance(query.filter(cache=False), getattr(search, 'SpanQueryFilter', search.QueryWrapperFilter))
+        assert isinstance(query.filter(), getattr(search, 'CachingSpanFilter', search.CachingWrapperFilter))
         query = engine.Query.disjunct(0.1, query, name='sample')
         assert str(query) == '(text:world | name:sample)~0.1'
         query = engine.Query.near('text', 'hello', ('tag', 'python'), slop=-1, inOrder=False)
         indexer.commit()
         assert 0 not in indexer and len(indexer) == 0 and sum(indexer.segments.values()) == 0
         indexer.add(tag='test', name='old')
-        with assertWarns(DeprecationWarning):
-            indexer.update('tag', boost=2.0, tag='test')
+        if hasattr(document.Document, 'boost'):
+            with assertWarns(DeprecationWarning):
+                indexer.update('tag', boost=2.0, tag='test')
+        else:
+            indexer.update('tag', tag='test')
         indexer.commit()
         assert [indexer[id].dict() for id in indexer] == [{'tag': 'test'}]
         indexer.update('tag', 'test', {'name': 'new'})
         indexer += temp.directory
         indexer += self.tempdir
         assert len(indexer) == 3
-        indexer.add(text=analysis.WhitespaceTokenizer(StringReader('?')), name=lucene.JArray_byte('{}'))
+        indexer.add(text=analysis.WhitespaceTokenizer(util.Version.LUCENE_CURRENT, StringReader('?')), name=lucene.JArray_byte('{}'))
         indexer.commit()
-        assert indexer[next(indexer.docs('text', '?'))] == {'name': ['{}']}
+        value = indexer[next(indexer.docs('text', '?'))]['name']
+        assert value == '{}' or value.utf8ToString() == '{}'
         reader = engine.indexers.IndexReader(indexer.indexReader)
         assert reader[0].dict() == {} and reader.count('text', '?') == 1
         assert len(reader.comparator('text')) == 4
         indexer.commit(merge=True)
         assert not indexer.hasDeletions()
         indexer.commit(merge=1)
-        assert indexer.optimized
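+        # the optimized flag is gone in Lucene 4.0; a fully merged index is simply one with a single reader/segment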
+        assert len(list(indexer.readers)) == 1
         del reader.indexReader
         self.assertRaises(AttributeError, getattr, reader, 'maxDoc')
         del indexer.indexSearcher
         indexer.commit(filters=True, spellcheckers=True)
         assert reader.refCount == 0
         assert list(indexer.filters) == list(indexer.spellcheckers) == ['amendment']
-        doc['amendment'] = engine.Analyzer(analysis.WhitespaceTokenizer).tokens(doc['amendment'])
-        doc['date'] = engine.Analyzer(analysis.WhitespaceTokenizer).tokens(doc['date']), 2.0
+        tokenizer = lambda reader: analysis.WhitespaceTokenizer(util.Version.LUCENE_CURRENT, reader)
+        doc['amendment'] = engine.Analyzer(tokenizer).tokens(doc['amendment'])
+        doc['date'] = engine.Analyzer(tokenizer).tokens(doc['date']), 2.0
         scores = list(searcher.match(doc, 'text:congress', 'text:law', 'amendment:27', 'date:19*'))
         assert 0.0 == scores[0] < scores[1] < scores[2] < scores[3] == 1.0
         searcher = engine.MultiSearcher([indexer.indexReader, self.tempdir])
         assert searcher.facets(search.MatchAllDocsQuery(), 'amendment')['amendment'] == dict.fromkeys(map(str, range(1, 28)), 2)
         reader = searcher.indexReader
         del searcher
-        self.assertRaises(lucene.JavaError, reader.isCurrent)
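+        # once the multi-searcher is released, the shared reader's reference count should drop to zero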
+        assert not reader.refCount
         assert len(indexer) == len(indexer.search()) == 35
         assert sorted(indexer.names()) == ['amendment', 'article', 'date', 'text']
         articles = list(indexer.terms('article'))
         assert sorted(map(int, indexer.terms('amendment'))) == range(1, 28)
         assert list(itertools.islice(indexer.terms('text', 'right'), 2)) == ['right', 'rights']
         assert list(indexer.terms('text', 'right*')) == ['right', 'rights']
-        with assertWarns(DeprecationWarning):
-            assert list(indexer.terms('text', 'right?')) == ['rights']
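+        # WildcardTermEnum (and its deprecated usage) only exists through Lucene 3.x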
+        if hasattr(search, 'WildcardTermEnum'):
+            with assertWarns(DeprecationWarning):
+                assert list(indexer.terms('text', 'right?')) == ['rights']
         assert list(indexer.terms('text', 'right', minSimilarity=0.5)) == ['eight', 'right', 'rights']
         word, count = next(indexer.terms('text', 'people', counts=True))
         assert word == 'people' and count == 8
         assert math.isnan(hits.maxscore)
         hits = indexer.search('text:right', count=2, sort=sort, maxscore=True)
         assert hits.maxscore > max(hits.scores)
-        comparator = indexer.comparator('amendment', type=int, parser=lambda value: int(value or -1))
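+        # FieldCache parsers receive BytesRef terms under Lucene 4.0, so decode before parsing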
+        parser = lambda value: int((value.utf8ToString() if hasattr(value, 'utf8ToString') else value) or -1)
+        comparator = indexer.comparator('amendment', type=int, parser=parser)
         with assertWarns(DeprecationWarning):
             hits = indexer.search('text:people', sort=comparator.__getitem__)
         assert sorted(hits.ids) == list(hits.ids) and list(hits.ids) != ids
         query = engine.Query.term('text', 'right', boost=2.0)
         assert query.boost == 2.0
         assert indexer.facets(str(query), 'amendment', 'article') == {'amendment': 12, 'article': 1}
-        self.assertRaises(TypeError, indexer.overlap, query.filter(), search.QueryWrapperFilter(query))
         hits = indexer.search('text:people', filter=query.filter())
         assert len(hits) == 4
         hit, = indexer.search('date:192*')
         assert dict(indexer.termvector(id, 'text', counts=True))['persons'] == 2
         assert dict(indexer.positionvector(id, 'text'))['persons'] in ([3, 26], [10, 48])
         assert dict(indexer.positionvector(id, 'text', offsets=True))['persons'] == [(46, 53), (301, 308)]
-        query = indexer.morelikethis(0)
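+        # MoreLikeThis has no default analyzer under Lucene 4.0, so pass one explicitly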
+        analyzer = analysis.WhitespaceAnalyzer(util.Version.LUCENE_CURRENT)
+        query = indexer.morelikethis(0, analyzer=analyzer)
         assert str(query) == 'text:united text:states'
         hits = indexer.search(query & engine.Query.prefix('article', ''))
         assert len(hits) == 8 and hits[0]['article'] == 'Preamble'
-        assert str(indexer.morelikethis(0, 'article')) == ''
-        assert str(indexer.morelikethis(0, minDocFreq=3)) == 'text:establish text:united text:states'
-        assert str(indexer.morelikethis('jury', 'text', minDocFreq=4, minTermFreq=1)) == 'text:jury'
-        assert str(indexer.morelikethis('jury', 'article')) == ''
-        self.assertRaises(lucene.JavaError, indexer.morelikethis, 'jury')
+        assert str(indexer.morelikethis(0, 'article', analyzer=analyzer)) == ''
+        assert str(indexer.morelikethis(0, minDocFreq=3, analyzer=analyzer)) == 'text:establish text:united text:states'
+        assert str(indexer.morelikethis('jury', 'text', minDocFreq=4, minTermFreq=1, analyzer=analyzer)) == 'text:jury'
+        assert str(indexer.morelikethis('jury', 'article', analyzer=analyzer)) == ''
+        try:
+            query = indexer.morelikethis('jury')
+        except lucene.JavaError:
+            pass
+        else:
+            assert str(query) == ''
         assert indexer.suggest('missing', '') == list(indexer.correct('missing', '')) == []
         assert indexer.suggest('text', '')[:8] == ['shall', 'states', 'any', 'have', 'united', 'congress', 'state', 'constitution']
         assert indexer.suggest('text', 'con')[:2] == ['congress', 'constitution']
                 location = '.'.join(doc[name] for name in ['state', 'county', 'city'])
                 indexer.add(doc, latitude=lat, longitude=lng, location=location)
         indexer.commit()
-        assert set(['state', 'zipcode']) < set(indexer.names('indexed', isIndexed=True))
-        assert set(['latitude', 'longitude', 'county', 'city']) == set(indexer.names('unindexed', isIndexed=False))
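+        # same fallback as above: prefer the keyword form of names where available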
+        try:
+            names = indexer.names(indexed=True)
+        except AttributeError:
+            names = indexer.names('indexed', isIndexed=True)
+        assert set(['state', 'zipcode']) < set(names)
+        try:
+            names = indexer.names(indexed=False)
+        except AttributeError:
+            names = indexer.names('unindexed', isIndexed=False)
+        assert set(['latitude', 'longitude', 'county', 'city']) <= set(names)
         states = list(indexer.terms('state'))
         assert states[0] == 'AK' and states[-1] == 'WY'
         counties = [term.split('.')[-1] for term in indexer.terms('state.county', 'CA', 'CA~')]
         size = indexer.copy(path, exclude=query, merge=1)
         assert len(searcher) + size == len(indexer)
         searcher = engine.IndexSearcher(path)
-        assert searcher.optimized and 'CA' not in searcher.terms('state')
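+        # with the optimized flag gone, check that the merged copy has a single segment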
+        assert len(searcher.segments) == 1 and 'CA' not in searcher.terms('state')
         directory.close()
     
     def testSpatial(self):
         field = engine.Field('', index=True, analyzed=True, omitNorms=True, termvector=True, withPositions=True, withOffsets=True)
         field, = field.items(' ')
         attrs = 'indexed', 'tokenized', 'termVectorStored', 'storePositionWithTermVector', 'storeOffsetWithTermVector', 'omitNorms'
-        assert all(getattr(field, attr) for attr in attrs)
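+        # Lucene 4.0 moves these flags onto FieldType accessor methods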
+        try:
+            assert all(getattr(field, attr) for attr in attrs)
+        except AttributeError:
+            attrs = 'indexed', 'tokenized', 'storeTermVectors', 'storeTermVectorPositions', 'storeTermVectorOffsets', 'omitNorms'
+            assert all(getattr(field.fieldType(), attr)() for attr in attrs)
         indexer = engine.Indexer(self.tempdir)
         with assertWarns(DeprecationWarning):
             indexer.set('amendment', engine.FormatField, format='{0:02d}', store=True)
         assert list(hits.ids) == ids[:len(hits)]
         query = engine.Query.range('size', None, '1000')
         assert indexer.count(query) == len(sizes) - len(ids)
-        indexer.sorters['year'] = engine.SortField('Y-m-d', type=int, parser=lambda date: int(date.split('-')[0]))
-        assert indexer.comparator('year')[:10] == [1791] * 10
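+        # sort field parsers also receive BytesRef terms under Lucene 4.0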
+        parser = lambda date: int((date.utf8ToString() if hasattr(date, 'utf8ToString') else date).split('-')[0])
+        indexer.sorters['year'] = engine.SortField('Y-m-d', type=int, parser=parser)
+        assert list(indexer.comparator('year'))[:10] == [1791] * 10
         cache = len(search.FieldCache.DEFAULT.cacheEntries)
         hits = indexer.search(count=3, sort='year')
         assert [int(hit['amendment']) for hit in hits] == [1, 2, 3]
         cache = len(search.FieldCache.DEFAULT.cacheEntries)
         assert list(indexer.comparator('year'))[-1] == 0
         assert cache == len(search.FieldCache.DEFAULT.cacheEntries)
+        self.assertRaises(AttributeError, indexer.comparator, 'size', type='score')
     
     def testNumeric(self):
         "Numeric fields."
         assert indexer.count(query) == len(sizes) - len(ids)
         self.assertRaises(OverflowError, list, field.items(-2**64))
         nf, = field.items(0.5)
-        assert nf.numericValue.doubleValue() == 0.5
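+        # NumericField was removed in Lucene 4.0; numericValue is a method on the generic Field there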
+        assert (nf.numericValue if hasattr(document, 'NumericField') else nf.numericValue()).doubleValue() == 0.5
         assert str(field.range(-2**64, 0)) == 'size:[* TO 0}'
         assert str(field.range(0, 2**64)) == 'size:[0 TO *}'
         assert str(field.range(0.5, None, upper=True)) == 'size:[0.5 TO *]'

File test/remote.py

         local.BaseTest.run(self, result)
     def setUp(self):
         local.BaseTest.setUp(self)
-        self.servers = []
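+        # keyed by port so individual servers can be stopped and restarted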
+        self.servers = {}
     def tearDown(self):
-        for server in self.servers:
-            self.stop(server)
+        for port in list(self.servers):
+            self.stop(port)
         local.BaseTest.tearDown(self)
     def start(self, port, *args, **config):
         "Start server in separate process on given port."
         config.update(self.config)
         config['server.socket_port'] = port
         cherrypy.process.servers.wait_for_free_port('localhost', port)
-        server = subprocess.Popen((sys.executable, '-m', 'lupyne.server', '-c', json.dumps(config)) + args)
+        server = self.servers[port] = subprocess.Popen((sys.executable, '-m', 'lupyne.server', '-c', json.dumps(config)) + args)
         cherrypy.process.servers.wait_for_occupied_port('localhost', port)
         assert not server.poll()
-        return server
-    def stop(self, server):
-        "Terminate server."
+    def stop(self, port):
+        "Terminate server on given port."
+        server = self.servers.pop(port)
         server.terminate()
         assert server.wait() == 0
 
     def testInterface(self):
         "Remote reading and writing."
         config = {'tools.json_out.indent': 2, 'tools.validate.last_modified': True, 'tools.validate.expires': 0, 'tools.validate.max_age': 0}
-        self.servers += (
-            self.start(self.ports[0], self.tempdir, '--autoreload=1', **config),
-            self.start(self.ports[1], self.tempdir, self.tempdir, '--autoupdate=2.0'), # concurrent searchers
-        )
+        self.start(self.ports[0], self.tempdir, '--autoreload=1', **config)
+        self.start(self.ports[1], self.tempdir, self.tempdir, '--autoupdate=2.0') # concurrent searchers
         resource = client.Resource('localhost', self.ports[0])
         assert resource.get('/favicon.ico')
         response = resource.call('GET', '/')
         resource = client.Resource('localhost', self.ports[-1] + 1)
         with assertRaises(socket.error, errno.ECONNREFUSED):
             resource.get('/')
-        self.stop(self.servers.pop(0))
+        port = self.ports[0]
+        self.stop(port)
         pidfile = os.path.join(self.tempdir, 'pid')
-        self.start(self.ports[0], '-dp', pidfile)
+        self.start(port, '-dp', pidfile)
         time.sleep(1)
         os.kill(int(open(pidfile).read()), signal.SIGTERM)
+        del self.servers[port]
         filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'lupyne/server.py')
         assert subprocess.call((sys.executable, filepath, '-c', filepath), stderr=subprocess.PIPE)
         assert cherrypy.tree.mount(None)
     
     def testBasic(self):
         "Remote text indexing and searching."
-        self.servers.append(self.start(self.ports[0], self.tempdir))
+        self.start(self.ports[0], self.tempdir)
         resource = client.Resource('localhost', self.ports[0])
         assert resource.get('/fields') == []
         for name, settings in fixture.constitution.fields.items():
         positions = dict(resource.get('/terms/text/people/docs/positions'))
         assert sorted(positions) == docs and list(map(len, positions.values())) == counts.values()
         doc, = resource.get('/search', q='amendment:1', fields='', **{'fields.indexed': 'article,amendment:int'})['docs']
-        assert doc['amendment'] == 1 and doc['article'] is None
+        assert doc['amendment'] == 1 and not doc['article']
         result = resource.get('/search', **{'q.field': 'text', 'q': 'write "hello world"', 'spellcheck': 3})
         terms = result['spellcheck'].pop('text')
         assert result['docs'] == [] and result['spellcheck'] == {}
         assert all(min(group['count'], 2) >= len(group['docs']) for group in result['groups'])
         assert all(doc.get('date') == group['value'] for group in result['groups'] for doc in group['docs'])
         group = result['groups'][0]
-        assert group['value'] == '1791-12-15' and result['groups'][-1]['value'] is None
+        assert group['value'] == '1791-12-15' and not result['groups'][-1]['value']
         assert sorted(group) == ['count', 'docs', 'value'] and group['count'] == 5
         assert len(group['docs']) == 2 and group['docs'][0]['amendment'] == '2'
         assert len(result['groups'][1]['docs']) == 1 and all(group['docs'] == [] for group in result['groups'][2:])
         "Nested and numeric fields."
         writer = engine.IndexWriter(self.tempdir)
         writer.commit()
-        self.servers.append(self.start(self.ports[0], '-r', self.tempdir, **{'tools.validate.etag': False, 'tools.validate.last_modified': False}))
+        self.start(self.ports[0], '-r', self.tempdir, **{'tools.validate.etag': False})
         writer.set('zipcode', engine.NumericField, store=True)
         writer.fields['location'] = engine.NestedField('county.city')
         for doc in fixture.zipcodes.docs():
         "Real Time updating and searching."
         for args in [('-r',), ('--real-time', 'index0', 'index1'), ('-r', '--real-time', 'index')]:
             assert subprocess.call((sys.executable, '-m', 'lupyne.server') + args, stderr=subprocess.PIPE)
-        with contextlib.closing(server.WebIndexer(self.tempdir)) as root:
-            root.indexer.add()
-            assert root.update() == 1
+        root = server.WebIndexer(self.tempdir)
+        root.indexer.add()
+        assert root.update() == 1
+        del root
         port = self.ports[0]
-        self.servers.append(self.start(self.ports[0], '--real-time', **{'tools.validate.expires': 0, 'tools.validate.last_modified': False}))
+        self.start(self.ports[0], '--real-time', **{'tools.validate.expires': 0})
         resource = client.Resource('localhost', port)
         response = resource.call('GET', '/docs')
         version, modified, expires = map(response.getheader, ('etag', 'last-modified', 'expires'))