Commits

Matt Chaput committed 7f1394a

Raise an error on SegmentWriter re-use. Added a guard against make_binary_tree errors on empty Or/And queries.
Fixed delete_by_query to check for already-deleted documents.

Comments (0)

Files changed (6)

src/whoosh/filedb/filewriting.py

         self.ix = ix
         self.storage = ix.storage
         self.indexname = ix.indexname
+        self.is_closed = False
         
         info = ix._read_toc()
         self.schema = info.schema
                 poolclass = TempfilePool
         self.pool = poolclass(self.schema, procs=procs, **poolargs)
     
+    def _check_state(self):
+        if self.is_closed:
+            raise IndexingError("This writer is closed")
+    
     def add_field(self, fieldname, fieldspec):
+        self._check_state()
         if self._added:
             raise Exception("Can't modify schema after adding data to writer")
         super(SegmentWriter, self).add_field(fieldname, fieldspec)
     
     def remove_field(self, fieldname):
+        self._check_state()
         if self._added:
             raise Exception("Can't modify schema after adding data to writer")
         super(SegmentWriter, self).remove_field(fieldname)
         return any(s.has_deletions() for s in self.segments)
 
     def delete_document(self, docnum, delete=True):
+        self._check_state()
         segment, segdocnum = self._segment_and_docnum(docnum)
         segment.delete_document(segdocnum, delete=delete)
 
         return segment.is_deleted(segdocnum)
 
     def searcher(self):
+        self._check_state()
         from whoosh.filedb.fileindex import FileIndex
         return FileIndex(self.storage, indexname=self.indexname).searcher()
     
     def add_reader(self, reader):
+        self._check_state()
         startdoc = self.docnum
         
         has_deletions = reader.has_deletions()
         self._added = True
     
     def add_document(self, **fields):
+        self._check_state()
         schema = self.schema
         
         # Sort the keys
         self.docnum += 1
     
     def update_document(self, **fields):
+        self._check_state()
         _unique_cache = self._unique_cache
         
         # Check which of the supplied fields are unique
         self.vectorindex.add((docnum, fieldname), offset)
     
     def _close_all(self):
+        self.is_closed = True
+        
         self.termsindex.close()
         self.postwriter.close()
         self.storedfields.close()
         :param merge: if False, do not merge small segments.
         """
         
+        self._check_state()
         try:
             if mergetype:
                 pass
             self.writelock.release()
         
     def cancel(self):
+        self._check_state()
         try:
             self.pool.cancel()
             self._close_all()

src/whoosh/query.py

     def _matcher(self, matchercls, searcher, exclude_docs, **kwargs):
         submatchers = self._submatchers(searcher, exclude_docs)
         
+        if len(submatchers) == 1:
+            return submatchers[0]
+        if not submatchers:
+            return NullMatcher()
+        
         tree = make_binary_tree(matchercls, submatchers, **kwargs)
         if self.boost == 1.0:
             return tree

src/whoosh/writing.py

         
         count = 0
         for docnum in q.docs(s):
-            self.delete_document(docnum)
-            count += 1
+            if not self.is_deleted(docnum):
+                self.delete_document(docnum)
+                count += 1
         
         if not searcher:
             s.close()

tests/test_indexing.py

 from whoosh.filedb.filestore import FileStorage, RamStorage
 from whoosh.filedb.filewriting import NO_MERGE
 from whoosh.util import length_to_byte, byte_to_length, permutations
-from whoosh.writing import BatchWriter
+from whoosh.writing import BatchWriter, IndexingError
 
 
 class TestIndexing(unittest.TestCase):
     
     def test_deletion(self):
         s = fields.Schema(key=fields.ID, name=fields.TEXT, value=fields.TEXT)
-        st = RamStorage()
-        ix = st.create_index(s)
+        ix = RamStorage().create_index(s)
         
         w = ix.writer()
         w.add_document(key=u"A", name=u"Yellow brown", value=u"Blue red green purple?")
         w.add_document(key=u"C", name=u"One two", value=u"Three four five.")
         w.commit()
         
-        # This blows up with a ``KeyError: "Document 1 in segment '_MAIN_1' is already deleted"``.
+        # This will match both documents with key == B, one of which is already
+        # deleted. This should not raise an error.
+        w = ix.writer()
         count = w.delete_by_term("key", u"B")
         self.assertEqual(count, 1)
         w.commit()
         
         ix.optimize()
-        self.assertEqual(ix.doc_count_all(), 2)
-        self.assertEqual(ix.doc_count(), 2)
+        self.assertEqual(ix.doc_count_all(), 4)
+        self.assertEqual(ix.doc_count(), 4)
         tr = ix.reader()
         self.assertEqual(list(tr.lexicon("name")), ["brown", "one", "two", "yellow"])
         tr.close()
 
+    def test_writer_reuse(self):
+        s = fields.Schema(key=fields.ID)
+        ix = RamStorage().create_index(s)
+        
+        w = ix.writer()
+        w.add_document(key=u"A")
+        w.add_document(key=u"B")
+        w.add_document(key=u"C")
+        w.commit()
+        
+        # You can't re-use a committed/canceled writer
+        self.assertRaises(IndexingError, w.add_document, key=u"D")
+        self.assertRaises(IndexingError, w.update_document, key=u"B")
+        self.assertRaises(IndexingError, w.delete_document, 0)
+        self.assertRaises(IndexingError, w.add_reader, None)
+        self.assertRaises(IndexingError, w.add_field, "name", fields.ID)
+        self.assertRaises(IndexingError, w.remove_field, "key")
+        self.assertRaises(IndexingError, w.searcher)
+
     def test_update(self):
         # Test update with multiple unique keys
         SAMPLE_DOCS = [{"id": u"test1", "path": u"/test/1", "text": u"Hello"},

tests/test_parsing.py

         self.assertEqual(q.start, teststart)
         self.assertEqual(q.end, testend)
         
-        # Broken from what I'd expect.
-        # Also yields a ``ValueError: Called make_binary_tree with empty list``
-        # when run against a real data set.
-        q = qp.parse("view_count:[1 TO 10]")
-        self.assertEqual(q.__class__, query.NumericRange)
-        self.assertEqual(q.start, 1)
-        self.assertEqual(q.end, 10)
-        self.assertEqual(q.fieldname, "name")
-    
     def test_regressions(self):
-        qp = qparser.QueryParser("content")
+        qp = qparser.QueryParser("f")
         
-        # From 0.3.18, these used to require escaping. Mostly good
-        # for regression testing.
-        q = qp.parse(u"re-inker")
-        q = qp.parse(u"0.7 wire")
-        q = qp.parse(u"daler-rowney pearlescent 'bell bronze'")
-        q = qp.parse(u'22" BX')
+        # From 0.3.18, these used to require escaping. Mostly good for
+        # regression testing.
+        self.assertEqual(qp.parse(u"re-inker"), query.Term("f", u"re-inker"))
+        self.assertEqual(qp.parse(u"0.7 wire"),
+                         query.And([query.Term("f", u"0.7"), query.Term("f", u"wire")]))
+        self.assertEqual(qp.parse(u"daler-rowney pearl 'bell bronze'"),
+                         query.And([query.Term("f", u"daler-rowney"),
+                                    query.Term("f", u"pearl"),
+                                    query.Term("f", u"bell bronze")]))
+        self.assertEqual(qp.parse(u'22" BX'),
+                         query.And([query.Term("f", u'22"'), query.Term("f", "BX")]))
         
-        # Sorry, dunno what these should all look like, so fail loudly.
-        self.fail()
-    
     def test_empty_ranges(self):
         schema = fields.Schema(name=fields.TEXT, num=fields.NUMERIC,
                                date=fields.DATETIME)

tests/test_searching.py

     def test_and(self):
         self._run_query(And([Term("value", u"red"), Term("name", u"yellow")]),
                         [u"A"])
+        # Missing
+        self._run_query(And([Term("value", u"ochre"), Term("name", u"glonk")]),
+                        [])
         
     def test_or(self):
         self._run_query(Or([Term("value", u"red"), Term("name", u"yellow")]),
                         [u"A", u"D", u"E"])
+        # Missing
+        self._run_query(Or([Term("value", u"ochre"), Term("name", u"glonk")]),
+                        [])
+        self._run_query(Or([]), [])
     
     def test_not(self):
         self._run_query(Or([Term("value", u"red"), Term("name", u"yellow"), Not(Term("name", u"quick"))]),
     def test_wildcard(self):
         self._run_query(Or([Wildcard('value', u'*red*'), Wildcard('name', u'*yellow*')]),
                         [u"A", u"C", u"D", u"E"])
+        # Missing
+        self._run_query(Wildcard('value', 'glonk*'), [])
     
     def test_not2(self):
         schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
         qp = qparser.QueryParser("content", schema=schema)
         
         q = qp.parse(u"charlie [delta TO foxtrot]")
-        self.assertEqual(q.__class__.__name__, "And")
-        self.assertEqual(q[0].__class__.__name__, "Term")
-        self.assertEqual(q[1].__class__.__name__, "TermRange")
+        self.assertEqual(q.__class__, query.And)
+        self.assertEqual(q[0].__class__, query.Term)
+        self.assertEqual(q[1].__class__, query.TermRange)
         self.assertEqual(q[1].start, "delta")
         self.assertEqual(q[1].end, "foxtrot")
         self.assertEqual(q[1].startexcl, False)
         self.assertEqual(ids, [u'A', u'B', u'C'])
         
         q = qp.parse(u"foxtrot {echo TO hotel]")
-        self.assertEqual(q.__class__.__name__, "And")
-        self.assertEqual(q[0].__class__.__name__, "Term")
-        self.assertEqual(q[1].__class__.__name__, "TermRange")
+        self.assertEqual(q.__class__, query.And)
+        self.assertEqual(q[0].__class__, query.Term)
+        self.assertEqual(q[1].__class__, query.TermRange)
         self.assertEqual(q[1].start, "echo")
         self.assertEqual(q[1].end, "hotel")
         self.assertEqual(q[1].startexcl, True)
         self.assertEqual(ids, [u'B', u'C', u'D', u'E'])
         
         q = qp.parse(u"{bravo TO delta}")
-        self.assertEqual(q.__class__.__name__, "TermRange")
+        self.assertEqual(q.__class__, query.TermRange)
         self.assertEqual(q.start, "bravo")
         self.assertEqual(q.end, "delta")
         self.assertEqual(q.startexcl, True)
         self.assertEqual(q.endexcl, True)
         ids = sorted([d['id'] for d in s.search(q)])
         self.assertEqual(ids, [u'A', u'B', u'C'])
+        
+        # Shouldn't match anything
+        q = qp.parse(u"[1 to 10]")
+        self.assertEqual(q.__class__, query.TermRange)
+        self.assertEqual(len(s.search(q)), 0)
     
     def test_range_clusiveness(self):
         schema = fields.Schema(id=fields.ID(stored=True))