Commits

Thomas Waldmann committed 0e4301c

white-space-only source cosmetics; details below

Changes were made by running a script that performed the cleanups automatically (a sketch of such a script follows this list):
- no trailing blanks
- exactly one linefeed at the end of each file (see PEP 8)
- DOS line endings for .bat and .cmd files, Unix line endings everywhere else
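
For reference, a minimal sketch of such a cleanup script (the actual script is not part of this commit, and the file selection and traversal here are assumptions):

    import os

    DOS_EXTENSIONS = (".bat", ".cmd")

    def cleanup(path):
        # Read raw bytes so line endings stay under our control.
        with open(path, "rb") as f:
            lines = f.read().splitlines()
        # No trailing blanks.
        lines = [line.rstrip() for line in lines]
        # Drop blank lines at the end of the file...
        while lines and not lines[-1]:
            lines.pop()
        # ...and write exactly one linefeed at file end, using DOS line
        # endings for .bat/.cmd files and Unix line endings everywhere else.
        eol = b"\r\n" if path.lower().endswith(DOS_EXTENSIONS) else b"\n"
        with open(path, "wb") as f:
            if lines:
                f.write(eol.join(lines) + eol)

    # Which files to visit is an assumption; the real script's selection is not shown.
    for dirpath, dirnames, filenames in os.walk("."):
        for name in filenames:
            if os.path.splitext(name)[1] in (".py", ".txt", ".rst", ".bat", ".cmd"):
                cleanup(os.path.join(dirpath, name))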

  • Parent commits d9d9aa5


Files changed (120)

File benchmark/dictionary.py

     name = "dictionary"
     filename = "dcvgr10.txt.gz"
     headline_field = "head"
-    
+
     def documents(self):
         path = os.path.join(self.options.dir, self.filename)
         f = gzip.GzipFile(path)
-        
+
         head = body = None
         for line in f:
             line = line.decode("latin1")
                 head, body = line.split(".", 1)
             else:
                 body += line
-                
+
         if head:
             yield {"head": head, "body": head + body}
-    
+
     def whoosh_schema(self):
         ana = analysis.StemmingAnalyzer()
         #ana = analysis.StandardAnalyzer()

File benchmark/enron.py

 
 class Enron(Spec):
     name = "enron"
-    
+
     enron_archive_url = "http://www.cs.cmu.edu/~enron/enron_mail_082109.tar.gz"
     enron_archive_filename = "enron_mail_082109.tar.gz"
     cache_filename = "enron_cache.pickle"
 
     main_field = "body"
     headline_field = "subject"
-    
+
     field_order = ("subject", "date", "from", "to", "cc", "bcc", "body")
-    
+
     cachefile = None
 
     # Functions for downloading and then reading the email archive and caching
     # the messages in an easier-to-digest format
-    
+
     def download_archive(self, archive):
         print("Downloading Enron email archive to %r..." % archive)
         t = now()
         urlretrieve(self.enron_archive_url, archive)
         print("Downloaded in ", now() - t, "seconds")
-    
+
     @staticmethod
     def get_texts(archive):
         archive = tarfile.open(archive, "r:gz")
             if f is not None:
                 text = f.read()
                 yield text
-    
+
     @staticmethod
     def get_messages(archive, headers=True):
         header_to_field = Enron.header_to_field
                     if v:
                         d[fn] = v.decode("latin_1")
             yield d
-    
+
     def cache_messages(self, archive, cache):
         print("Caching messages in %s..." % cache)
-        
+
         if not os.path.exists(archive):
             raise Exception("Archive file %r does not exist" % archive)
-        
+
         t = now()
         f = open(cache, "wb")
         c = 0
     def setup(self):
         archive = os.path.abspath(os.path.join(self.options.dir, self.enron_archive_filename))
         cache = os.path.abspath(os.path.join(self.options.dir, self.cache_filename))
-    
+
         if not os.path.exists(archive):
             self.download_archive(archive)
         else:
             print("Archive is OK")
-        
+
         if not os.path.exists(cache):
             self.cache_messages(archive, cache)
         else:
             print("Cache is OK")
-    
+
     def documents(self):
         if not os.path.exists(self.cache_filename):
             raise Exception("Message cache does not exist, use --setup")
-        
+
         f = open(self.cache_filename, "rb")
         try:
             while True:
         except EOFError:
             pass
         f.close()
-    
+
     def whoosh_schema(self):
         ana = analysis.StemmingAnalyzer(maxsize=40, cachesize=None)
         storebody = self.options.storebody
         conn.add_field_action('cc', xappy.FieldActions.INDEX_EXACT)
         conn.add_field_action('bcc', xappy.FieldActions.INDEX_EXACT)
         return conn
-    
+
     def zcatalog_setup(self, cat):
         from zcatalog import indexes
         for name in ("date", "frm"):
             cat[name] = indexes.FieldIndex(field_name=name)
         for name in ("to", "subject", "cc", "bcc", "body"):
             cat[name] = indexes.TextIndex(field_name=name)
-    
+
     def process_document_whoosh(self, d):
         d["filepos"] = self.filepos
         if self.options.storebody:
             mf = self.main_field
             d["_stored_%s" % mf] = compress(d[mf], 9)
-    
+
     def process_result_whoosh(self, d):
         mf = self.main_field
         if mf in d:
             dd = load(self.cachefile)
             d.fields()[mf] = dd[mf]
         return d
-            
+
     def process_document_xapian(self, d):
         d[self.main_field] = " ".join([d.get(name, "") for name
                                        in self.field_order])
-    
+
 
 
 if __name__=="__main__":

File benchmark/marc21.py

             limit = None
         search(qstring, options.ixdir, options.basedir, limit=limit,
                optimize=options.optimize, scores=options.scores)
-
-
-

File benchmark/reuters.py

     filename = "reuters21578.txt.gz"
     main_field = "text"
     headline_text = "headline"
-    
+
     def whoosh_schema(self):
         #ana = analysis.StemmingAnalyzer()
         ana = analysis.StandardAnalyzer()
                                headline=fields.STORED,
                                text=fields.TEXT(analyzer=ana, stored=True))
         return schema
-    
+
     def zcatalog_setup(self, cat):
         from zcatalog import indexes  #@UnresolvedImport
         cat["id"] = indexes.FieldIndex(field_name="id")
         cat["headline"] = indexes.TextIndex(field_name="headline")
         cat["body"] = indexes.TextIndex(field_name="text")
-    
+
     def documents(self):
         path = os.path.join(self.options.dir, self.filename)
         f = gzip.GzipFile(path)
-        
+
         for line in f:
             id, text = line.decode("latin1").split("\t")
             yield {"id": id, "text": text, "headline": text[:70]}
 
-        
+
 if __name__ == "__main__":
     Bench().run(Reuters)

File scripts/make_checkpoint.py

         print("Deleted", path, w.delete_by_term("path", path))
 
 print(counter, ix.doc_count())
-
-
-

File scripts/read_checkpoint.py

         assert r[0]["dt"] == dt
 
 print("Done")
-
-
-

File scripts/release.py

 from os import system
 
 # Script to build and upload a release of Whoosh to PyPI and build
-# and upload the 
+# and upload the
 
 def build_docs():
     system("python setup.py build_sphinx")
         opts = {"base": "http://svn.whoosh.ca/projects/whoosh",
                 "tag": tag,
                 "msg": "Tagging trunk as %s" % tag}
-        
+
         system('svn copy %(base)s/trunk %(base)s/tags/%(tag)s -m "%(msg)s"' % opts)
 
 
                       help="Configuration file",
                       metavar="INIFILE",
                       default="whoosh.ini")
-    
+
     parser.add_option("-d", "--no-docs", dest="dodocs",
                       help="Don't build or upload docs",
                       action="store_false",
                       default=True)
-    
+
     parser.add_option("-D", "--no-build-docs", dest="builddocs",
                       help="Skip building docs",
                       action="store_false",
                       default=True)
-    
+
     parser.add_option("-t", "--tag", dest="tag",
                       help="Tag the trunk as this",
                       default=None)
-    
+
     (options, args) = parser.parse_args()
-    
+
     cp = ConfigParser()
     cp.read(options.configfile)
-    
+
     if options.dodocs:
         upload_docs(cp.get("website", "username"),
                     cp.get("website", "server"),
     "Topic :: Text Processing :: Indexing",
     ],
 )
-

File src/whoosh/__init__.py

 
 def versionstring(build=True, extra=True):
     """Returns the version number of Whoosh as a string.
-    
+
     :param build: Whether to include the build number in the string.
     :param extra: Whether to include alpha/beta/rc etc. tags. Only
         checked if build is True.
         s += "".join(str(n) for n in __version__[3:])
 
     return s
-

File src/whoosh/analysis/__init__.py

   a string and yield Token objects (actually, the same token object over and
   over, for performance reasons) corresponding to the tokens (words) in the
   text.
-      
+
   Every tokenizer is a callable that takes a string and returns an iterator of
   tokens.
 
 * Filters take the tokens from the tokenizer and perform various
   transformations on them. For example, the LowercaseFilter converts all tokens
   to lowercase, which is usually necessary when indexing regular English text.
-      
+
   Every filter is a callable that takes a token generator and returns a token
   generator.
 
 * Analyzers are convenience functions/classes that "package up" a tokenizer and
   zero or more filters into a single unit. For example, the StandardAnalyzer
   combines a RegexTokenizer, LowercaseFilter, and StopFilter.
-    
+
   Every analyzer is a callable that takes a string and returns a token
   iterator. (So Tokenizers can be used as Analyzers if you don't need any
   filtering).
 from whoosh.analysis.intraword import *
 from whoosh.analysis.ngrams import *
 from whoosh.analysis.analyzers import *
-
-
-
-
-
-
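
The module docstring above describes tokenizers, filters, and analyzers as composable callables. A minimal sketch of that composition, using the classes the docstring names (the output shown is approximate):

    from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter

    # A tokenizer piped into filters yields an analyzer: each stage is a
    # callable that consumes a token stream and produces a token stream.
    ana = RegexTokenizer() | LowercaseFilter() | StopFilter()
    print([t.text for t in ana(u"The RENDERING of this text")])
    # roughly: ['rendering', 'text']  (case folded, stop words removed)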

File src/whoosh/analysis/acore.py

     """
     Represents a "token" (usually a word) extracted from the source text being
     indexed.
-    
+
     See "Advanced analysis" in the user guide for more information.
-    
+
     Because object instantiation in Python is slow, tokenizers should create
     ONE SINGLE Token object and YIELD IT OVER AND OVER, changing the attributes
     each time.
-    
+
     This trick means that consumers of tokens (i.e. filters) must never try to
     hold onto the token object between loop iterations, or convert the token
     generator into a list. Instead, save the attributes between iterations,
     not the object::
-    
+
         def RemoveDuplicatesFilter(self, stream):
             # Removes duplicate words.
             lasttext = None
 
     def has_morph(self):
         return self.is_morph
-
-
-
-
-
-
-
-
-
-
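
The Token docstring above stresses that tokenizers reuse one Token instance, so filters must keep attribute values, never the token object, between iterations. A minimal sketch in the spirit of the docstring's RemoveDuplicatesFilter example, written as a Filter subclass (illustrative, not part of Whoosh):

    from whoosh.analysis import Filter

    class RemoveDuplicatesFilter(Filter):
        """Drops tokens whose text repeats the immediately preceding token."""

        def filter(self, stream):
            lasttext = None
            for token in stream:
                # Compare and remember token.text, never the (reused) Token object.
                if token.text != lasttext:
                    yield token
                lasttext = token.text

    # Chained after a tokenizer with |, it behaves like any other filter.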

File src/whoosh/analysis/analyzers.py

 
 def KeywordAnalyzer(lowercase=False, commas=False):
     """Parses whitespace- or comma-separated tokens.
-    
+
     >>> ana = KeywordAnalyzer()
     >>> [token.text for token in ana("Hello there, this is a TEST")]
     ["Hello", "there,", "this", "is", "a", "TEST"]
-    
+
     :param lowercase: whether to lowercase the tokens.
     :param commas: if True, items are separated by commas rather than
         whitespace.
 
 def SimpleAnalyzer(expression=default_pattern, gaps=False):
     """Composes a RegexTokenizer with a LowercaseFilter.
-    
+
     >>> ana = SimpleAnalyzer()
     >>> [token.text for token in ana("Hello there, this is a TEST")]
     ["hello", "there", "this", "is", "a", "test"]
-    
+
     :param expression: The regular expression pattern to use to extract tokens.
     :param gaps: If True, the tokenizer *splits* on the expression, rather
         than matching on the expression.
                      minsize=2, maxsize=None, gaps=False):
     """Composes a RegexTokenizer with a LowercaseFilter and optional
     StopFilter.
-    
+
     >>> ana = StandardAnalyzer()
     >>> [token.text for token in ana("Testing is testing and testing")]
     ["testing", "testing", "testing"]
                      ignore=None, cachesize=50000):
     """Composes a RegexTokenizer with a lower case filter, an optional stop
     filter, and a stemming filter.
-    
+
     >>> ana = StemmingAnalyzer()
     >>> [token.text for token in ana("Testing is testing and testing")]
     ["test", "test", "test"]
-    
+
     :param expression: The regular expression pattern to use to extract tokens.
     :param stoplist: A list of stop words. Set this to None to disable
         the stop word filter.
                   mergewords=False, mergenums=False):
     """Composes a RegexTokenizer with an IntraWordFilter, LowercaseFilter, and
     StopFilter.
-    
+
     >>> ana = FancyAnalyzer()
     >>> [token.text for token in ana("Should I call getInt or get_real?")]
     ["should", "call", "getInt", "get", "int", "get_real", "get", "real"]
-    
+
     :param expression: The regular expression pattern to use to extract tokens.
     :param stoplist: A list of stop words. Set this to None to disable
         the stop word filter.
                      cachesize=50000):
     """Configures a simple analyzer for the given language, with a
     LowercaseFilter, StopFilter, and StemFilter.
-    
+
     >>> ana = LanguageAnalyzer("es")
     >>> [token.text for token in ana("Por el mar corren las liebres")]
     ['mar', 'corr', 'liebr']
-    
+
     :param expression: The regular expression pattern to use to extract tokens.
     :param gaps: If True, the tokenizer *splits* on the expression, rather
         than matching on the expression.
         pass
 
     return chain
-
-
-
-
-

File src/whoosh/analysis/filters.py

     """Base class for Filter objects. A Filter subclass must implement a
     filter() method that takes a single argument, which is an iterator of Token
     objects, and yield a series of Token objects in return.
-    
+
     Filters that do morphological transformation of tokens (e.g. stemming)
     should set their ``is_morph`` attribute to True.
     """
     def __init__(self, **kwargs):
         """Use keyword arguments to associate mode attribute values with
         instantiated filters.
-        
+
         >>> iwf_for_index = IntraWordFilter(mergewords=True, mergenums=False)
         >>> iwf_for_query = IntraWordFilter(mergewords=False, mergenums=False)
         >>> mf = MultiFilter(index=iwf_for_index, query=iwf_for_query)
-        
+
         This class expects that the value of the mode attribute is consistent
         among all tokens in a token stream.
         """
 
 class TeeFilter(Filter):
     """Interleaves the results of two or more filters (or filter chains).
-    
+
     NOTE: because it needs to create copies of each token for each sub-filter,
     this filter is quite slow.
-    
+
     >>> target = "ALFA BRAVO CHARLIE"
     >>> # In one branch, we'll lower-case the tokens
     >>> f1 = LowercaseFilter()
     >>> ana = RegexTokenizer(r"\S+") | TeeFilter(f1, f2)
     >>> [token.text for token in ana(target)]
     ["alfa", "AFLA", "bravo", "OVARB", "charlie", "EILRAHC"]
-    
+
     To combine the incoming token stream with the output of a filter chain, use
     ``TeeFilter`` and make one of the filters a :class:`PassFilter`.
-    
+
     >>> f1 = PassFilter()
     >>> f2 = BiWordFilter()
     >>> ana = RegexTokenizer(r"\S+") | TeeFilter(f1, f2) | LowercaseFilter()
 
 class ReverseTextFilter(Filter):
     """Reverses the text of each token.
-    
+
     >>> ana = RegexTokenizer() | ReverseTextFilter()
     >>> [token.text for token in ana("hello there")]
     ["olleh", "ereht"]
 
 class LowercaseFilter(Filter):
     """Uses unicode.lower() to lowercase token text.
-    
+
     >>> rext = RegexTokenizer()
     >>> stream = rext("This is a TEST")
     >>> [token.text for token in LowercaseFilter(stream)]
 class StopFilter(Filter):
     """Marks "stop" words (words too common to index) in the stream (and by
     default removes them).
-    
+
     >>> rext = RegexTokenizer()
     >>> stream = rext("this is a test")
     >>> stopper = StopFilter()
     """Translates the text of tokens by calling unicode.translate() using the
     supplied character mapping object. This is useful for case and accent
     folding.
-    
+
     The ``whoosh.support.charset`` module has a useful map for accent folding.
-    
+
     >>> from whoosh.support.charset import accent_map
     >>> retokenizer = RegexTokenizer()
     >>> chfilter = CharsetFilter(accent_map)
     >>> [t.text for t in chfilter(retokenizer(u'café'))]
     [u'cafe']
-    
+
     Another way to get a character mapping object is to convert a Sphinx
     charset table file using
     :func:`whoosh.support.charset.charset_table_to_dict`.
-    
+
     >>> from whoosh.support.charset import charset_table_to_dict
     >>> from whoosh.support.charset import default_charset
     >>> retokenizer = RegexTokenizer()
     >>> chfilter = CharsetFilter(charmap)
     >>> [t.text for t in chfilter(retokenizer(u'Stra\\xdfe'))]
     [u'strase']
-    
+
     The Sphinx charset table format is described at
     http://www.sphinxsearch.com/docs/current.html#conf-charset-table.
     """
 class DelimitedAttributeFilter(Filter):
     """Looks for delimiter characters in the text of each token and stores the
     data after the delimiter in a named attribute on the token.
-    
+
     The defaults are set up to use the ``^`` character as a delimiter and store
     the value after the ``^`` as the boost for the token.
-    
+
     >>> daf = DelimitedAttributeFilter(delimiter="^", attribute="boost")
     >>> ana = RegexTokenizer("\\\\S+") | DelimitedAttributeFilter()
     >>> for t in ana(u("image render^2 file^0.5"))
     'image' 1.0
     'render' 2.0
     'file' 0.5
-    
+
     Note that you need to make sure your tokenizer includes the delimiter and
     data as part of the token!
     """
 
 class SubstitutionFilter(Filter):
     """Performs a regular expression substitution on the token text.
-    
+
     This is especially useful for removing text from tokens, for example
     hyphens::
-    
+
         ana = RegexTokenizer(r"\\S+") | SubstitutionFilter("-", "")
-        
+
     Because it has the full power of the re.sub() method behind it, this filter
     can perform some fairly complex transformations. For example, to take
     tokens like ``'a=b', 'c=d', 'e=f'`` and change them to ``'b=a', 'd=c',
     'f=e'``::
-    
+
         # Analyzer that swaps the text on either side of an equal sign
         rt = RegexTokenizer(r"\\S+")
         sf = SubstitutionFilter("([^/]*)/(./*)", r"\\2/\\1")
         for t in tokens:
             t.text = pattern.sub(replacement, t.text)
             yield t
-
-
-
-
-
-
-
-
-
-

File src/whoosh/analysis/intraword.py

     """Given a set of words (or any object with a ``__contains__`` method),
     break any tokens in the stream that are composites of words in the word set
     into their individual parts.
-    
+
     Given the correct set of words, this filter can break apart run-together
     words and trademarks (e.g. "turbosquid", "applescript"). It can also be
     useful for agglutinative languages such as German.
-    
+
     The ``keep_compound`` argument lets you decide whether to keep the
     compound word in the token stream along with the word segments.
-    
+
     >>> cwf = CompoundWordFilter(wordset, keep_compound=True)
     >>> analyzer = RegexTokenizer(r"\S+") | cwf
     >>> [t.text for t in analyzer("I do not like greeneggs and ham")
 
 class BiWordFilter(Filter):
     """Merges adjacent tokens into "bi-word" tokens, so that for example::
-    
+
         "the", "sign", "of", "four"
-        
+
     becomes::
-    
+
         "the-sign", "sign-of", "of-four"
-        
+
     This can be used to create fields for pseudo-phrase searching, where if
     all the terms match the document probably contains the phrase, but the
     searching is faster than actually doing a phrase search on individual word
     terms.
-    
+
     The ``BiWordFilter`` is much faster than using the otherwise equivalent
     ``ShingleFilter(2)``.
     """
 class ShingleFilter(Filter):
     """Merges a certain number of adjacent tokens into multi-word tokens, so
     that for example::
-    
+
         "better", "a", "witty", "fool", "than", "a", "foolish", "wit"
-        
+
     with ``ShingleFilter(3, ' ')`` becomes::
-    
+
         'better a witty', 'a witty fool', 'witty fool than', 'fool than a',
         'than a foolish', 'a foolish wit'
-    
+
     This can be used to create fields for pseudo-phrase searching, where if
     all the terms match the document probably contains the phrase, but the
     searching is faster than actually doing a phrase search on individual word
     terms.
-    
+
     If you're using two-word shingles, you should use the functionally
     equivalent ``BiWordFilter`` instead because it's faster than
     ``ShingleFilter``.
     """Splits words into subwords and performs optional transformations on
     subword groups. This filter is funtionally based on yonik's
     WordDelimiterFilter in Solr, but shares no code with it.
-    
+
     * Split on intra-word delimiters, e.g. `Wi-Fi` -> `Wi`, `Fi`.
     * When splitwords=True, split on case transitions,
       e.g. `PowerShot` -> `Power`, `Shot`.
     * Leading and trailing delimiter characters are ignored.
     * Trailing possesive "'s" removed from subwords,
       e.g. `O'Neil's` -> `O`, `Neil`.
-    
+
     The mergewords and mergenums arguments turn on merging of subwords.
-    
+
     When the merge arguments are false, subwords are not merged.
-    
+
     * `PowerShot` -> `0`:`Power`, `1`:`Shot` (where `0` and `1` are token
       positions).
-    
+
     When one or both of the merge arguments are true, consecutive runs of
     alphabetic and/or numeric subwords are merged into an additional token with
     the same position as the last sub-word.
-    
+
     * `PowerShot` -> `0`:`Power`, `1`:`Shot`, `1`:`PowerShot`
     * `A's+B's&C's` -> `0`:`A`, `1`:`B`, `2`:`C`, `2`:`ABC`
     * `Super-Duper-XL500-42-AutoCoder!` -> `0`:`Super`, `1`:`Duper`, `2`:`XL`,
       `2`:`SuperDuperXL`,
       `3`:`500`, `4`:`42`, `4`:`50042`, `5`:`Auto`, `6`:`Coder`,
       `6`:`AutoCoder`
-    
+
     When using this filter you should use a tokenizer that only splits on
     whitespace, so the tokenizer does not remove intra-word delimiters before
     this filter can see them, and put this filter before any use of
     LowercaseFilter.
-    
+
     >>> rt = RegexTokenizer(r"\\S+")
     >>> iwf = IntraWordFilter()
     >>> lcf = LowercaseFilter()
     >>> analyzer = rt | iwf | lcf
-    
+
     One use for this filter is to help match different written representations
     of a concept. For example, if the source text contained `wi-fi`, you
     probably want `wifi`, `WiFi`, `wi-fi`, etc. to match. One way of doing this
     is to specify mergewords=True and/or mergenums=True in the analyzer used
     for indexing, and mergewords=False / mergenums=False in the analyzer used
     for querying.
-    
+
     >>> iwf_i = IntraWordFilter(mergewords=True, mergenums=True)
     >>> iwf_q = IntraWordFilter(mergewords=False, mergenums=False)
     >>> iwf = MultiFilter(index=iwf_i, query=iwf_q)
     >>> analyzer = RegexTokenizer(r"\S+") | iwf | LowercaseFilter()
-    
+
     (See :class:`MultiFilter`.)
     """
 

File src/whoosh/analysis/morph.py

     stemming algorithm. Stemming attempts to reduce multiple forms of the same
     root word (for example, "rendering", "renders", "rendered", etc.) to a
     single word in the index.
-    
+
     >>> stemmer = RegexTokenizer() | StemFilter()
     >>> [token.text for token in stemmer("fundamentally willows")]
     ["fundament", "willow"]
-    
+
     You can pass your own stemming function to the StemFilter. The default
     is the Porter stemming algorithm for English.
-    
+
     >>> stemfilter = StemFilter(stem_function)
-    
+
     By default, this class wraps an LRU cache around the stemming function. The
     ``cachesize`` keyword argument sets the size of the cache. To make the
     cache unbounded (the class caches every input), use ``cachesize=-1``. To
     disable caching, use ``cachesize=None``.
-    
+
     If you compile and install the py-stemmer library, the
     :class:`PyStemmerFilter` provides slightly easier access to the language
     stemmers in that library.
                 t.text = secondary
                 t.boost = b * secondary_boost
                 yield t
-
-
-
-
-
-
-

File src/whoosh/analysis/ngrams.py

 
 class NgramTokenizer(Tokenizer):
     """Splits input text into N-grams instead of words.
-    
+
     >>> ngt = NgramTokenizer(4)
     >>> [token.text for token in ngt("hi there")]
     ["hi t", "i th", " the", "ther", "here"]
     words, so the grams emitted by it will contain whitespace, punctuation,
     etc. You may want to massage the input or add a custom filter to this
     tokenizer's output.
-    
+
     Alternatively, if you only want sub-word grams without whitespace, you
     could combine a RegexTokenizer with NgramFilter instead.
     """
 
 class NgramFilter(Filter):
     """Splits token text into N-grams.
-    
+
     >>> rext = RegexTokenizer()
     >>> stream = rext("hello there")
     >>> ngf = NgramFilter(4)
 
 def NgramAnalyzer(minsize, maxsize=None):
     """Composes an NgramTokenizer and a LowercaseFilter.
-    
+
     >>> ana = NgramAnalyzer(4)
     >>> [token.text for token in ana("hi there")]
     ["hi t", "i th", " the", "ther", "here"]
     if not tokenizer:
         tokenizer = RegexTokenizer()
     return tokenizer | LowercaseFilter() | NgramFilter(minsize, maxsize, at=at)
-

File src/whoosh/analysis/tokenizers.py

 class IDTokenizer(Tokenizer):
     """Yields the entire input string as a single token. For use in indexed but
     untokenized fields, such as a document's path.
-    
+
     >>> idt = IDTokenizer()
     >>> [token.text for token in idt("/a/b 123 alpha")]
     ["/a/b 123 alpha"]
 class RegexTokenizer(Tokenizer):
     """
     Uses a regular expression to extract tokens from text.
-    
+
     >>> rex = RegexTokenizer()
     >>> [token.text for token in rex(u("hi there 3.141 big-time under_score"))]
     ["hi", "there", "3.141", "big", "time", "under_score"]
     Characters that map to None are considered token break characters. For all
     other characters the map is used to translate the character. This is useful
     for case and accent folding.
-    
+
     This tokenizer loops character-by-character and so will likely be much
     slower than :class:`RegexTokenizer`.
-    
+
     One way to get a character mapping object is to convert a Sphinx charset
     table file using :func:`whoosh.support.charset.charset_table_to_dict`.
-    
+
     >>> from whoosh.support.charset import charset_table_to_dict
     >>> from whoosh.support.charset import default_charset
     >>> charmap = charset_table_to_dict(default_charset)
     >>> chtokenizer = CharsetTokenizer(charmap)
     >>> [t.text for t in chtokenizer(u'Stra\\xdfe ABC')]
     [u'strase', u'abc']
-    
+
     The Sphinx charset table format is described at
     http://www.sphinxsearch.com/docs/current.html#conf-charset-table.
     """
 
 def SpaceSeparatedTokenizer():
     """Returns a RegexTokenizer that splits tokens by whitespace.
-    
+
     >>> sst = SpaceSeparatedTokenizer()
     >>> [token.text for token in sst("hi there big-time, what's up")]
     ["hi", "there", "big-time,", "what's", "up"]
 
 def CommaSeparatedTokenizer():
     """Splits tokens by commas.
-    
+
     Note that the tokenizer calls unicode.strip() on each match of the regular
     expression.
-    
+
     >>> cst = CommaSeparatedTokenizer()
     >>> [token.text for token in cst("hi there, what's , up")]
     ["hi there", "what's", "up"]
         for match in self.expr.finditer(value):
             token.text = value[:match.end()]
             yield token
-
-
-

File src/whoosh/classify.py

 
     def add(self, vector):
         """Adds forward-index information about one of the "top N" documents.
-        
+
         :param vector: A series of (text, weight) tuples, such as is
             returned by Reader.vector_as("weight", docnum, fieldname).
         """
 
     def expanded_terms(self, number, normalize=True):
         """Returns the N most important terms in the vectors added so far.
-        
+
         :param number: The number of terms to return.
         :param normalize: Whether to normalize the weights.
         :returns: A list of ("term", weight) tuples.
         clusters.append((left, right, j - i, v))
     clusters.sort(key=lambda x: (0 - x[2], x[3]))
     return clusters
-
-

File src/whoosh/codec/__init__.py

     from whoosh.codec.whoosh3 import W3Codec
 
     return W3Codec(*args, **kwargs)
-

File src/whoosh/codec/base.py

     """Do not instantiate this object directly. It is used by the Index object
     to hold information about a segment. A list of objects of this class are
     pickled as part of the TOC file.
-    
+
     The TOC file stores a minimal amount of information -- mostly a list of
     Segment objects. Segments are the real reverse indexes. Having multiple
     segments allows quick incremental indexing: just create a new segment for
     def _close_graph(self):
         if hasattr(self, "_gwriter") and self._gwriter:
             self._gwriter.close()
-
-
-
-
-
-
-
-
-
-

File src/whoosh/codec/memory.py

 
     def should_assemble(self):
         return False
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

File src/whoosh/codec/plaintext.py

 
     def should_assemble(self):
         return False
-
-
-
-
-
-
-
-
-
-
-
-
-
-

File src/whoosh/codec/whoosh2.py

     #assert len(text) == 17
     #return long(text[1:], 16)
     return from_base85(text[1:])
-

File src/whoosh/codec/whoosh3.py

         if self._deleted is None:
             return False
         return docnum in self._deleted
-
-
-
-

File src/whoosh/collectors.py

         r.termdocs = dict(self.termdocs)
         r.docterms = dict(self.docterms)
         return r
-
-
-

File src/whoosh/columns.py

 #            pos = self._refbase + offset * self._itemsize
 #            reflist = self._dbfile.get_array(pos, self._typecode, length)
 #            return [self._uniques[ref] for ref in reflist]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

File src/whoosh/compat.py

         """
         funcobj.__isabstractmethod__ = True
         return funcobj
-
-
-

File src/whoosh/externalsort.py

 class SortingPool(object):
     """This object implements a general K-way external merge sort for Python
     objects.
-    
+
     >>> pool = MergePool()
     >>> # Add an unlimited number of items in any order
     >>> for item in my_items:
     >>> # Get the items back in sorted order
     >>> for item in pool.items():
     ...     print(item)
-    
+
     This class uses the `marshal` module to write the items to temporary files,
     so you can only sort marshal-able types (generally: numbers, strings,
     tuples, lists, and dicts).
 
     def items(self, maxfiles=128):
         """Returns a sorted list or iterator of the items in the pool.
-        
+
         :param maxfiles: maximum number of files to open at once.
         """
 
 
 def sort(items, maxsize=100000, tempdir=None, maxfiles=128):
     """Sorts the given items using an external merge sort.
-    
+
     :param tempdir: the path of a directory to use for temporary file
         storage. The default is to use the system's temp directory.
     :param maxsize: the maximum number of items to keep in memory at once.
     for item in items:
         p.add(item)
     return p.items(maxfiles=maxfiles)
-
-
-

File src/whoosh/fields.py

 
 class FieldType(object):
     """Represents a field configuration.
-    
+
     The FieldType object supports the following attributes:
-    
+
     * format (formats.Format): the storage format for the field's contents.
-    
+
     * analyzer (analysis.Analyzer): the analyzer to use to turn text into
       terms.
-    
+
     * vector (formats.Format): the storage format for the field's vectors
       (forward index), or None if the field should not store vectors.
-    
+
     * scorable (boolean): whether searches against this field may be scored.
       This controls whether the index stores per-document field lengths for
       this field.
-          
+
     * stored (boolean): whether the content of this field is stored for each
       document. For example, in addition to indexing the title of a document,
       you usually want to store the title so it can be presented as part of
       the search results.
-         
+
     * unique (boolean): whether this field's value is unique to each document.
       For example, 'path' or 'ID'. IndexWriter.update_document() will use
       fields marked as 'unique' to find the previous version of a document
       being updated.
-      
+
     * multitoken_query is a string indicating what kind of query to use when
       a "word" in a user query parses into multiple tokens. The string is
       interpreted by the query parser. The strings understood by the default
       with an AND query), "or" (join the tokens with OR), "phrase" (join
       the tokens with a phrase query), and "default" (use the query parser's
       default join type).
-    
+
     The constructor for the base field type simply lets you supply your own
     configured field format, vector format, and scorable and stored values.
     Subclasses may configure some or all of this for you.
     def index(self, value, **kwargs):
         """Returns an iterator of (btext, frequency, weight, encoded_value)
         tuples for each unique word in the input value.
-        
+
         The default implementation uses the ``analyzer`` attribute to tokenize
         the value into strings, then encodes them into bytes using UTF-8.
         """
 
     def process_text(self, qstring, mode='', **kwargs):
         """Analyzes the given string and returns an iterator of token texts.
-        
+
         >>> field = fields.TEXT()
         >>> list(field.process_text("The ides of March"))
         ["ides", "march"]
         """Returns an iterator of the "sortable" tokens in the given reader and
         field. These values can be used for sorting. The default implementation
         simply returns all tokens in the field.
-        
+
         This can be overridden by field types such as NUMERIC where some values
         in a field are not useful for sorting.
         """
     def separate_spelling(self):
         """Returns True if this field requires special handling of the words
         that go into the field's word graph.
-        
+
         The default behavior is to return True if the field is "spelled" but
         not indexed, or if the field is indexed but the analyzer has
         morphological transformations (e.g. stemming). Exotic field types may
         need to override this behavior.
-        
+
         This method should return False if the field does not support spelling
         (i.e. the ``spelling`` attribute is False).
         """
     def spellable_words(self, value):
         """Returns an iterator of each unique word (in sorted order) in the
         input value, suitable for inclusion in the field's word graph.
-        
+
         The default behavior is to call the field analyzer with the keyword
         argument ``no_morph=True``, which should make the analyzer skip any
         morphological transformation filters (e.g. stemming) to preserve the
     def supports(self, name):
         """Returns True if the underlying format supports the given posting
         value type.
-        
+
         >>> field = TEXT()
         >>> field.supports("positions")
         True
     """Special field type that lets you index integer or floating point
     numbers in relatively short fixed-width terms. The field converts numbers
     to sortable bytes for you before indexing.
-    
+
     You specify the numeric type of the field (``int`` or ``float``) when you
     create the ``NUMERIC`` object. The default is ``int``. For ``int``, you can
     specify a size in bits (``32`` or ``64``). For both ``int`` and ``float``
     you can specify a ``signed`` keyword argument (default is ``True``).
-    
+
     >>> schema = Schema(path=STORED, position=NUMERIC(int, 64, signed=False))
     >>> ix = storage.create_index(schema)
     >>> with ix.writer() as w:
     ...     w.add_document(path="/a", position=5820402204)
     ...
-    
+
     You can also use the NUMERIC field to store Decimal instances by specifying
     a type of ``int`` or ``long`` and the ``decimal_places`` keyword argument.
     This simply multiplies each number by ``(10 ** decimal_places)`` before
     storing it as an integer. Of course this may throw away decimal prcesision
     (by truncating, not rounding) and imposes the same maximum value limits as
     ``int``/``long``, but these may be acceptable for certain applications.
-    
+
     >>> from decimal import Decimal
     >>> schema = Schema(path=STORED, position=NUMERIC(int, decimal_places=4))
     >>> ix = storage.create_index(schema)
     >>> with ix.writer() as w:
     ...     w.add_document(path="/a", position=Decimal("123.45")
     ...
-    
+
     """
 
     def __init__(self, numtype=int, bits=32, stored=False, unique=False,
 class DATETIME(NUMERIC):
     """Special field type that lets you index datetime objects. The field
     converts the datetime objects to sortable text for you before indexing.
-    
+
     Since this field is based on Python's datetime module it shares all the
     limitations of that module, such as the inability to represent dates before
     year 1 in the proleptic Gregorian calendar. However, since this field
     stores datetimes as an integer number of microseconds, it could easily
     represent a much wider range of dates if the Python datetime implementation
     ever supports them.
-    
+
     >>> schema = Schema(path=STORED, date=DATETIME)
     >>> ix = storage.create_index(schema)
     >>> w = ix.writer()
 class BOOLEAN(FieldType):
     """Special field type that lets you index boolean values (True and False).
     The field converts the boolean values to text for you before indexing.
-    
+
     >>> schema = Schema(path=STORED, done=BOOLEAN)
     >>> ix = storage.create_index(schema)
     >>> w = ix.writer()
 class Schema(object):
     """Represents the collection of fields in an index. Maps field names to
     FieldType objects which define the behavior of each field.
-    
+
     Low-level parts of the index use field numbers instead of field names for
     compactness. This class has several methods for converting between the
     field name, field number, and field object itself.
         fieldtype pairs. The fieldtype can be an instantiated FieldType object,
         or a FieldType sub-class (in which case the Schema will instantiate it
         with the default constructor before adding it).
-        
+
         For example::
-        
+
             s = Schema(content = TEXT,
                        title = TEXT(stored = True),
                        tags = KEYWORD(stored = True))
 
     def add(self, name, fieldtype, glob=False):
         """Adds a field to this schema.
-        
+
         :param name: The name of the field.
         :param fieldtype: An instantiated fields.FieldType object, or a
             FieldType subclass. If you pass an instantiated object, the schema
 
     """Allows you to define a schema using declarative syntax, similar to
     Django models::
-    
+
         class MySchema(SchemaClass):
             path = ID
             date = DATETIME
             content = TEXT
-            
+
     You can use inheritance to share common fields between schemas::
-    
+
         class Parent(SchemaClass):
             path = ID(stored=True)
             date = DATETIME
-            
+
         class Child1(Parent):
             content = TEXT(positions=False)
-            
+
         class Child2(Parent):
             tags = KEYWORD
-    
+
     This class overrides ``__new__`` so instantiating your sub-class always
     results in an instance of ``Schema``.
-    
+
     >>> class MySchema(SchemaClass):
     ...     title = TEXT(stored=True)
     ...     content = TEXT
     for i in xrange(1, len(schemas)):
         schema = merge_schema(schema, schemas[i])
     return schema
-
-
-

File src/whoosh/filedb/compound.py

             length = bio.tell()
             if length:
                 self.blocks.append((bio, 0, length))
-
-
-
-
-
-
-
-
-

File src/whoosh/filedb/filetables.py

     are used to index into one of 256 hash tables. This is basically the CDB
     algorithm, but unlike CDB this object writes all data serially (it doesn't
     seek backwards to overwrite information at the end).
-    
+
     Also unlike CDB, this format uses 64-bit file pointers, so the file length
     is essentially unlimited. However, each key and value must be less than
     2 GB in length.
         dbfile = self.dbfile
         for keypos, keylen, datapos, datalen in self.ranges_from(key):
             yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen))
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

File src/whoosh/filedb/gae.py

 To create a new index::
 
     from whoosh.filedb.gae import DataStoreStorage
-    
+
     ix = DataStoreStorage().create_index(schema)
 
 To open an existing index::

File src/whoosh/filedb/structfile.py

 
     def checksum(self):
         return self._check & 0xffffffff
-
-
-
-
-
-
-

File src/whoosh/formats.py

         token. For example, in a Frequency format, the value string would be
         the same as frequency; in a Positions format, the value string would
         encode a list of token positions at which "tokentext" occured.
-        
+
         :param value: The unicode text to index.
         :param analyzer: The analyzer to use to process the text.
         """
     """Only indexes whether a given term occurred in a given document; it does
     not store frequencies or positions. This is useful for fields that should
     be searchable but not scorable, such as file path.
-    
+
     Supports: frequency, weight (always reports frequency = 1).
     """
 
 
 class Frequency(Format):
     """Stores frequency information for each posting.
-    
+
     Supports: frequency, weight.
     """
 
 class Positions(Format):
     """Stores position information in each posting, to allow phrase searching
     and "near" queries.
-    
+
     Supports: frequency, weight, positions, position_boosts (always reports
     position boost = 1.0).
     """
 class Characters(Positions):
     """Stores token position and character start and end information for each
     posting.
-    
+
     Supports: frequency, weight, positions, position_boosts (always reports
     position boost = 1.0), characters.
     """
 class PositionBoosts(Positions):
     """A format that stores positions and per-position boost information
     in each posting.
-    
+
     Supports: frequency, weight, positions, position_boosts.
     """
 
 class CharacterBoosts(Characters):
     """A format that stores positions, character start and end, and
     per-position boost information in each posting.
-    
+
     Supports: frequency, weight, positions, position_boosts, characters,
     character_boosts.
     """
         poses = [(pos, sc, ec, boost) for pos, (sc, ec, boost)
                  in sorted(s.items())]
         return self.encode(poses)[0]  # encode() returns value, summedboost
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

File src/whoosh/fst.py

         means for this class. For example, a string implementation would return
         the common shared prefix, for an int implementation it would return
         the minimum of the two numbers.
-        
+
         If there is no common part, this method should return None.
         """
 
 
 class ComboNode(Node):
     """Base class for nodes that blend the nodes of two different graphs.
-    
+
     Concrete subclasses need to implement the ``edge()`` method and possibly
     override the ``accept`` property.
     """
 class BaseCursor(object):
     """Base class for a cursor-type object for navigating an FST/word graph,
     represented by a :class:`GraphReader` object.
-    
+
     >>> cur = GraphReader(dawgfile).cursor()
     >>> for key in cur.follow():
     ...   print(repr(key))
-    
+
     The cursor "rests" on arcs in the FSA/FST graph, rather than nodes.
     """
 
 class Arc(object):
     """
     Represents a directed arc between two nodes in an FSA/FST graph.
-    
+
     The ``lastarc`` attribute is True if this is the last outgoing arc from the
     previous node.
     """
 
 class GraphWriter(object):
     """Writes an FSA/FST graph to disk.
-    
+
     Call ``insert(key)`` to insert keys into the graph. You must
     insert keys in sorted order. Call ``close()`` to finish the graph and close
     the file.
-    
+
     >>> gw = GraphWriter(my_file)
     >>> gw.insert("alfa")
     >>> gw.insert("bravo")
     >>> gw.insert("charlie")
     >>> gw.close()
-    
+
     The graph writer can write separate graphs for multiple fields. Use
     ``start_field(name)`` and ``finish_field()`` to separate fields.
-    
+
     >>> gw = GraphWriter(my_file)
     >>> gw.start_field("content")
     >>> gw.insert("alfalfa")
 
     def insert(self, key, value=None):
         """Inserts the given key into the graph.
-        
+
         :param key: a sequence of bytes objects, a bytes object, or a string.
         :param value: an optional value to encode in the graph along with the
             key. If the writer was not instantiated with a value type, passing
                   % (arc.label, arc.target, arc.accept, arc.value))
         if arc.target is not None:
             dump_graph(graph, arc.target, tab + 1, out=out)
-
-

File src/whoosh/highlight.py

         fragments = top_fragments(fragments, top, self.scorer, self.order)
         output = self.formatter.format(fragments)
         return output
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

File src/whoosh/idsets.py

 
 class OnDiskBitSet(BaseBitSet):
     """A DocIdSet backed by an array of bits on disk.
-    
+
     >>> st = RamStorage()
     >>> f = st.create_file("test.bin")
     >>> bs = BitSet([1, 10, 15, 7, 2])
 
         pos = bisect_right(data, i)
         return data[pos]
-
-
-
-
-

File src/whoosh/index.py

         stream.write_uint(stream.checksum())
         stream.close()
         storage.rename_file(tempfilename, tocfilename, safe=True)
-
-
-
-

File src/whoosh/lang/__init__.py

         return stoplists[tlc]
 
     raise Exception("No stop-word list available for %r" % lang)
-
-
-

File src/whoosh/lang/isri.py

             if self.stm.startswith(sp1):
                 self.stm = self.stm[1:]
                 return self.stm
-
-

File src/whoosh/lang/morph_en.py

     """Given an English word, returns a collection of morphological variations
     on the word by algorithmically adding and removing suffixes. The variation
     list may contain non-words (e.g. render -> renderment).
-    
+
     >>> variations("pull")
     set(['pull', 'pullings', 'pullnesses', 'pullful', 'pullment', 'puller', ... ])
     """

File src/whoosh/lang/paicehusk.py

 tsis0.    { protect  -sist }
 tsi3>     { -ist > -   }
 tt1.      { -tt > -t   }
-uqi3.     { -iqu > -   } 
+uqi3.     { -iqu > -   }
 ugo1.     { -ogu > -og }
 vis3j>    { -siv > -j  }
 vie0.     { protect  -eiv }

File src/whoosh/lang/phonetic.py

                     r += code
             prevcode = code
     return r
-
-
-
-
-

File src/whoosh/lang/porter.py

 def stem(w):
     """Uses the Porter stemming algorithm to remove suffixes from English
     words.
-    
+
     >>> stem("fundamentally")
     "fundament"
     """

File src/whoosh/lang/snowball/__init__.py

            "es": SpanishStemmer,
            "sv": SwedishStemmer,
            }
-

File src/whoosh/lang/snowball/bases.py

                 rv = word[3:]
 
         return rv
-
-
-

File src/whoosh/lang/stopwords.py

     niye o sanki şey siz şu tüm ve veya ya yani
     """.split()),
 }
-
-

File src/whoosh/lang/wordnet.py

 class Thesaurus(object):
     """Represents the WordNet synonym database, either loaded into memory
     from the wn_s.pl Prolog file, or stored on disk in a Whoosh index.
-    
+
     This class allows you to parse the prolog file "wn_s.pl" from the WordNet prolog
     download into an object suitable for looking up synonyms and performing query
     expansion.
 
     http://wordnetcode.princeton.edu/3.0/WNprolog-3.0.tar.gz
-    
+
     To load a Thesaurus object from the wn_s.pl file...
-    
+
     >>> t = Thesaurus.from_filename("wn_s.pl")
-    
+
     To save the in-memory Thesaurus to a Whoosh index...
-    
+
     >>> from whoosh.filedb.filestore import FileStorage
     >>> fs = FileStorage("index")
     >>> t.to_storage(fs)
-    
+
     To load a Thesaurus object from a Whoosh index...
-    
+
     >>> t = Thesaurus.from_storage(fs)
-    
+
     The Thesaurus object is thus usable in two ways:
-    
+
     * Parse the wn_s.pl file into memory (Thesaurus.from_*) and then look up
       synonyms in memory. This has a startup cost for parsing the file, and uses
       quite a bit of memory to store two large dictionaries, however synonym
       look-ups are very fast.
-      
+
     * Parse the wn_s.pl file into memory (Thesaurus.from_filename) then save it to
       an index (to_storage). From then on, open the thesaurus from the saved
       index (Thesaurus.from_storage). This has a large cost for storing the index,
       but after that it is faster to open the Thesaurus (than re-parsing the file)
       but slightly slower to look up synonyms.
-    
+
     Here are timings for various tasks on my (fast) Windows machine, which might
     give an idea of relative costs for in-memory vs. on-disk.
-    
+
     ================================================ ================
     Task                                             Approx. time (s)
     ================================================ ================
     Look up synonyms for "light" (in memory)         0.0011
     Look up synonyms for "light" (loaded from disk)  0.0028
     ================================================ ================
-    
+
     Basically, if you can afford spending the memory necessary to parse the
     Thesaurus and then cache it, it's faster. Otherwise, use an on-disk index.
     """
     def from_file(cls, fileobj):
         """Creates a Thesaurus object from the given file-like object, which should
         contain the WordNet wn_s.pl file.
-        
+
         >>> f = open("wn_s.pl")
         >>> t = Thesaurus.from_file(f)
         >>> t.synonyms("hail")
     def from_filename(cls, filename):
         """Creates a Thesaurus object from the given filename, which should
         contain the WordNet wn_s.pl file.
-        
+
         >>> t = Thesaurus.from_filename("wn_s.pl")
         >>> t.synonyms("hail")
         ['acclaim', 'come', 'herald']
     def from_storage(cls, storage, indexname="THES"):
         """Creates a Thesaurus object from the given storage object,
         which should contain an index created by Thesaurus.to_storage().
-        
+
         >>> from whoosh.filedb.filestore import FileStorage
         >>> fs = FileStorage("index")
         >>> t = Thesaurus.from_storage(fs)
         >>> t.synonyms("hail")
         ['acclaim', 'come', 'herald']
-        
+
         :param storage: A :class:`whoosh.store.Storage` object from
             which to load the index.
         :param indexname: A name for the index. This allows you to
     def to_storage(self, storage, indexname="THES"):
         """Creates am index in the given storage object from the
         synonyms loaded from a WordNet file.