Matt Chaput committed dd6e077 (merge)

Merging branches.

Files changed (8)

src/whoosh/classify.py

                           self.ixreader.field_length(fieldname))
         self.model = model
 
-        # Cache the collection frequency of every term in this field. This
-        # turns out to be much faster than reading each individual weight
-        # from the term index as we add words.
-        self.collection_freq = dict((word, ti.weight()) for word, ti
-                                    in self.ixreader.iter_field(fieldname))
-
         # Maps words to their weight in the top N documents.
         self.topN_weight = defaultdict(float)
 
         """
 
         model = self.model
+        fieldname = self.fieldname
+        ixreader = self.ixreader
         tlist = []
         maxweight = 0
-        collection_freq = self.collection_freq
+
+        # If no terms have been added, return an empty list
+        if not self.topN_weight:
+            return []
 
         for word, weight in iteritems(self.topN_weight):
-            if word in collection_freq:
-                score = model.score(weight, collection_freq[word],
-                                    self.top_total)
+            if (fieldname, word) in ixreader:
+                cf = ixreader.frequency(fieldname, word)
+                score = model.score(weight, cf, self.top_total)
                 if score > maxweight:
                     maxweight = score
                 tlist.append((score, word))
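
Note: the removed block cached the collection frequency of every term in the field up front; the replacement looks each term up on demand through the reader. A minimal sketch of the new lookup pattern (the function name and signature here are illustrative; the `(fieldname, word) in reader` membership test and `frequency()` are the reader calls used in the diff):

    def scored_terms(ixreader, fieldname, topN_weight, model, top_total):
        tlist = []
        maxweight = 0
        for word, weight in topN_weight.items():
            # Only score terms that still exist in the field's term index
            if (fieldname, word) in ixreader:
                cf = ixreader.frequency(fieldname, word)  # collection frequency
                score = model.score(weight, cf, top_total)
                maxweight = max(maxweight, score)
                tlist.append((score, word))
        return tlist, maxweight

This avoids building a dict over iter_field() when only a handful of expansion terms are ever scored.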

src/whoosh/fields.py

     
     The FieldType object supports the following attributes:
     
-    * format (fields.Format): the storage format for the field's contents.
+    * format (formats.Format): the storage format for the field's contents.
     
     * analyzer (analysis.Analyzer): the analyzer to use to turn text into
       terms.
     
-    * vector (fields.Format): the storage format for the field's vectors
+    * vector (formats.Format): the storage format for the field's vectors
       (forward index), or None if the field should not store vectors.
     
     * scorable (boolean): whether searches against this field may be scored.
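
For reference, these attributes are usually set through the built-in field types rather than on FieldType directly; mirroring the tests in this commit, a vector format is passed straight to the field constructor:

    from whoosh import fields, formats

    # content stores per-document term vectors in Frequency format;
    # TEXT supplies the format and analyzer attributes itself.
    schema = fields.Schema(title=fields.TEXT,
                           content=fields.TEXT(vector=formats.Frequency()))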

src/whoosh/filedb/fieldcache.py

         :param default: the value to use for documents without the field.
         """
 
-        self.order = order or array(self.code)
+        self.order = order or array(typecode)
+        self.typecode = typecode
+
         self.hastexts = hastexts
         self.texts = None
         if hastexts:
             self.texts = texts or [default]
-        self.typecode = typecode
 
     def __eq__(self, other):
         return (other and self.__class__ is other.__class__
...

         return cache
 
     def is_loaded(self, key):
-        if key in self.caches:
-            return True
-
-        with self.sharedlock:
-            return key in self.shared_cache
+        return key in self.caches or key in self.shared_cache
 
     def put(self, key, cache, save=True):
         self.caches[key] = cache
 
...

         if self._file_exists(key):
             try:
-                return self._load(key)
+                fc = self._load(key)
+                self.put(key, fc)
+                return fc
             except (OSError, BadFieldCache):
                 return None
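
With this change, load() memoizes a cache read from disk via put(), so a second fieldcache() call for the same key hits memory. The reader-level effect, essentially what the new test_persistent_cache below checks (`ix` is assumed to be an index with an "id" field):

    with ix.reader() as r:
        assert r.fieldcache_available("id")   # a cache file exists on disk
        assert not r.fieldcache_loaded("id")  # nothing in memory yet
        fc = r.fieldcache("id")               # _load() result is now put() back
        assert r.fieldcache_loaded("id")      # so it is found in self.caches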
 

src/whoosh/filedb/filewriting.py

         vpostwriter = self.vpostwriter
         offset = vpostwriter.start(self.schema[fieldname].vector)
         for text, weight, valuestring in vlist:
-            assert isinstance(text, text_type), "%r is not unicode" % text
+            #assert isinstance(text, text_type), "%r is not unicode" % text
             vpostwriter.write(text, weight, valuestring, 0)
-        vpostwriter.finish()
-
+        vpostwriter.finish(inlinelimit=0)
         self.vectorindex.add((docnum, fieldname), offset)
 
     def _add_vector_reader(self, docnum, fieldname, vreader):
...

             vpostwriter.write(vreader.id(), vreader.weight(), vreader.value(),
                               0)
             vreader.next()
-        vpostwriter.finish()
-
+        vpostwriter.finish(inlinelimit=0)
         self.vectorindex.add((docnum, fieldname), offset)
 
     def _close_all(self):
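
Passing inlinelimit=0 to finish() forces the vector's postings to be written to the postings file rather than inlined, which the new test_single_term below suggests was breaking vectors with a single unique term. A minimal reproduction at the API level (the same scenario as that test, spelled out with imports):

    from whoosh import fields
    from whoosh.compat import u
    from whoosh.filedb.filestore import RamStorage

    schema = fields.Schema(text=fields.TEXT(vector=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(text=u("TEST TEST TEST"))  # a one-term vector
    with ix.searcher() as s:
        v = s.vector(0, "text")  # readable now that postings aren't inlined
        assert v.is_active()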

src/whoosh/writing.py

         # Check which of the supplied fields are unique
         unique_fields = [name for name, field in self.schema.items()
                          if name in fields and field.unique]
-        if not unique_fields:
-            raise IndexingError("None of the fields in %r"
-                                " are unique" % list(fields.keys()))
         return unique_fields
 
     def update_document(self, **fields):
 
...

         # Delete the set of documents matching the unique terms
         unique_fields = self._unique_fields(fields)
-        with self.searcher() as s:
-            for docnum in s._find_unique([(name, fields[name])
-                                          for name in unique_fields]):
-                self.delete_document(docnum)
+        if unique_fields:
+            with self.searcher() as s:
+                for docnum in s._find_unique([(name, fields[name])
+                                              for name in unique_fields]):
+                    self.delete_document(docnum)
 
         # Add the given fields
         self.add_document(**fields)
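
update_document() now degrades gracefully: when none of the supplied fields is unique it skips the delete pass instead of raising IndexingError, so the call behaves like add_document(). A quick sketch of the new behavior (the schema and field names are illustrative):

    from whoosh import fields
    from whoosh.compat import u
    from whoosh.filedb.filestore import RamStorage

    # No field is marked unique=True.
    schema = fields.Schema(body=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)

    with ix.writer() as w:
        # Previously raised IndexingError("None of the fields ... are unique");
        # now simply adds the document.
        w.update_document(body=u("hello world"))

    with ix.searcher() as s:
        assert s.doc_count() == 1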

stress/test_bigfacet.py

 
 tagcount = 100
 doccount = 500000
-dirname = "tagindex"
+dirname = "testindex"
 
 schema = fields.Schema(tags=fields.KEYWORD(stored=True, vector=formats.Existence()))
 

tests/test_sorting.py

 
 # 
 
+def test_persistent_cache():
+    schema = fields.Schema(id=fields.ID(stored=True))
+    st = RamStorage()
+    ix = st.create_index(schema)
+    with ix.writer() as w:
+        for term in u("charlie alfa echo bravo delta").split():
+            w.add_document(id=term)
+
+    ix = st.open_index()
+    with ix.reader() as r:
+        _ = r.fieldcache("id")
+        del _
+
+    ix = st.open_index()
+    with ix.reader() as r:
+        assert r.fieldcache_available("id")
+        assert not r.fieldcache_loaded("id")
+        fc = r.fieldcache("id")
+        assert r.fieldcache_loaded("id")
+        assert_equal(list(fc.order), [3, 1, 5, 2, 4])
+        assert_equal(list(fc.texts), [u('\uffff'), u('alfa'), u('bravo'),
+                                      u('charlie'), u('delta'), u('echo')])
+
 def test_float_cache():
     schema = fields.Schema(id=fields.STORED, num=fields.NUMERIC(type=float))
     with TempIndex(schema, "floatcache") as ix:

tests/test_vectors.py

 from whoosh.support.testing import TempIndex
 
 
+def test_single_term():
+    schema = fields.Schema(text=fields.TEXT(vector=True))
+    ix = RamStorage().create_index(schema)
+    with ix.writer() as w:
+        w.add_document(text=u("TEST TEST TEST"))
+    with ix.searcher() as s:
+        v = s.vector(0, "text")
+        assert v.is_active()
+
 def test_vector_reading():
-    schema = fields.Schema(title = fields.TEXT,
-                           content = fields.TEXT(vector=formats.Frequency()))
-    
+    schema = fields.Schema(title=fields.TEXT,
+                           content=fields.TEXT(vector=formats.Frequency()))
+
     with TempIndex(schema, "vectorreading") as ix:
         writer = ix.writer()
         writer.add_document(title=u("one"),
                             content=u("This is the story of the black hole story"))
         writer.commit()
-        
+
         with ix.reader() as r:
             assert_equal(list(r.vector_as("frequency", 0, "content")),
                              [(u('black'), 1), (u('hole'), 1), (u('story'), 2)])
 
 def test_vector_merge():
-    schema = fields.Schema(title = fields.TEXT,
-                           content = fields.TEXT(vector=formats.Frequency()))
-    
+    schema = fields.Schema(title=fields.TEXT,
+                           content=fields.TEXT(vector=formats.Frequency()))
+
     with TempIndex(schema, "vectormerge") as ix:
         writer = ix.writer()
         writer.add_document(title=u("one"),
                             content=u("This is the story of the black hole story"))
         writer.commit()
-        
+
         writer = ix.writer()
         writer.add_document(title=u("two"),
                             content=u("You can read along in your book"))
         writer.commit()
-        
+
         with ix.searcher() as s:
             r = s.reader()
-        
+
             docnum = s.document_number(title=u("one"))
             vec = list(r.vector_as("frequency", docnum, "content"))
             assert_equal(vec, [(u('black'), 1), (u('hole'), 1), (u('story'), 2)])
-        
+
             docnum = s.document_number(title=u("two"))
-        
+
             vec = list(r.vector_as("frequency", docnum, "content"))
             assert_equal(vec, [(u('along'), 1), (u('book'), 1), (u('read'), 1)])
-        
+
 def test_vector_unicode():
-    schema = fields.Schema(content = fields.TEXT(vector=formats.Frequency()))
+    schema = fields.Schema(content=fields.TEXT(vector=formats.Frequency()))
     ix = RamStorage().create_index(schema)
-    
+
     writer = ix.writer()
     writer.add_document(content=u("\u1234\u2345\u3456 \u4567\u5678\u6789"))
     writer.add_document(content=u("\u0123\u1234\u4567 \u4567\u5678\u6789"))
     writer.commit()
-    
+
     writer = ix.writer()
     writer.add_document(content=u("\u2345\u3456\u4567 \u789a\u789b\u789c"))
     writer.add_document(content=u("\u0123\u1234\u4567 \u2345\u3456\u4567"))
     writer.commit()
-    
+
     with ix.reader() as r:
         vec = list(r.vector_as("frequency", 0, "content"))
         assert_equal(vec, [(u('\u3456\u4567'), 1), (u('\u789a\u789b\u789c'), 1)])