Whoosh raises MemoryError if sklearn modules are imported before opening a searcher

Issue #485 new
mx2048
created an issue

If I run `import sklearn` or `from sklearn import manifold` before opening the searcher with `searcher = ix.searcher()`, I get a MemoryError:

  File ".../sklearn_whoosh_memory_error.py", line 58, in open_index
    wh_searcher = ix.searcher()
  File "...\lib\site-packages\whoosh\index.py", line 318, in searcher
    return Searcher(self.reader(), fromindex=self, **kwargs)
  File "...\lib\site-packages\whoosh\index.py", line 548, in reader
    info.generation, reuse=reuse)
  File "...\lib\site-packages\whoosh\index.py", line 535, in _reader
    readers = [segreader(segment) for segment in segments]
  File "...\lib\site-packages\whoosh\index.py", line 535, in <listcomp>
    readers = [segreader(segment) for segment in segments]
  File "...\lib\site-packages\whoosh\index.py", line 524, in segreader
    generation=generation)
  File "...\lib\site-packages\whoosh\reading.py", line 620, in __init__
    self._terms = self._codec.terms_reader(self._storage, segment)
  File "...\lib\site-packages\whoosh\codec\whoosh3.py", line 122, in terms_reader
    postfile = segment.open_file(storage, self.POSTS_EXT)
  File "...\lib\site-packages\whoosh\codec\base.py", line 556, in open_file
    return storage.open_file(fname, **kwargs)
  File "...\lib\site-packages\whoosh\filedb\filestore.py", line 333, in open_file
    return self.a.open_file(name, *args, **kwargs)
  File "...\lib\site-packages\whoosh\filedb\compound.py", line 121, in open_file
    f = BufferFile(buf, name=name)
  File "...\lib\site-packages\whoosh\filedb\structfile.py", line 357, in __init__
    self.file = BytesIO(buf)
MemoryError

The workaround is to import sklearn only after the whoosh searcher has been closed with `searcher.close()`.

Note that I have more than 3 GB of available memory, while the peak working set of the whole script is about 1.5 GB. My indexdir contains segments with a total size of 33 GB; the largest segment is 7.5 GB.

Here is an example of my code:

def show_memory(memory_tuple):
    """Print the current and peak Windows working set sizes in whole megabytes.

    ``memory_tuple`` must expose ``wset`` and ``peak_wset`` attributes holding
    byte counts; each value is floor-divided down to megabytes before printing.
    """
    bytes_per_mb = 1024 * 1024

    # Read both attributes first, then convert, so a missing attribute fails
    # before anything is printed.
    working_set, peak_working_set = memory_tuple.wset, memory_tuple.peak_wset
    wset_mb = working_set // bytes_per_mb
    peak_wset_mb = peak_working_set // bytes_per_mb

    print('wset = {} MB ♦ peak_wset = {} MB'.format(wset_mb, peak_wset_mb))


def show_event(event):
    """Print ``event`` padded with dashes to 45 columns, then a colon and a space.

    No newline is written, so the next ``print`` continues on the same line.
    """
    label = event.ljust(45, '-')
    print(label, ':', sep='', end=' ')


import os
import psutil

# Handle to the current process; its memory is sampled after every step below.
process = psutil.Process(os.getpid())
show_event('Start')
show_memory(process.memory_info())

# Each import is followed by a memory sample, so the order of these
# statements is deliberate and must not be regrouped.
from whoosh.index import open_dir
show_event('whoosh.index')
show_memory(process.memory_info())

from whoosh.qparser import MultifieldParser, PhrasePlugin, SequencePlugin
show_event('whoosh.qparser')
show_memory(process.memory_info())

# Importing sklearn here, before the searcher is opened, is the condition
# under which the reported MemoryError occurs.
from sklearn import manifold
show_event('from sklearn import manifold')
show_memory(process.memory_info())

from sklearn.decomposition import PCA
show_event('from sklearn.decomposition import PCA')
show_memory(process.memory_info())


def open_index(folder=r"\\a\b\c"):
    """Open an existing whoosh search index and set up the shared parser/searcher.

    The parser and searcher are stored in the module-level globals
    ``wh_parser`` and ``wh_searcher`` for use by the other helpers.

    Args:
        folder: Path of the index directory. Defaults to the location that
            was previously hard-coded, so existing no-argument calls behave
            as before.
    """

    global wh_searcher
    global wh_parser

    ix = open_dir(folder)

    wh_parser = MultifieldParser(["q", "w", "e"], schema=ix.schema)
    # This is the call that raises MemoryError when sklearn was imported first.
    wh_searcher = ix.searcher()
    # Swap the phrase plugin for the sequence plugin on the shared parser.
    wh_parser.remove_plugin_class(PhrasePlugin)
    wh_parser.add_plugin(SequencePlugin)


def close_searcher():
    """Close the module-level whoosh searcher stored in ``wh_searcher``."""
    searcher = wh_searcher
    searcher.close()


def search_whoosh_index(search_query):
    """Parse ``search_query`` with the shared parser and run it on the shared searcher.

    The search is issued with ``limit=1``, ``scored=False`` and
    ``sortedby=None``; whatever the searcher returns is passed back unchanged.
    """
    search_options = {'limit': 1, 'scored': False, 'sortedby': None}
    parsed_query = wh_parser.parse(search_query)
    return wh_searcher.search(parsed_query, **search_options)


def main():
    """Open the index, run one query and close the searcher.

    The process memory is printed after each of the three steps so the
    effect of every step on the working set is visible.
    """
    open_index()

    show_event('open_index()')
    show_memory(process.memory_info())

    search_query = 'some query terms'
    # The results are not used further; only the memory effect is of interest.
    results = search_whoosh_index(search_query)

    show_event('search_whoosh_index')
    show_memory(process.memory_info())

    close_searcher()

    show_event('close_searcher')
    show_memory(process.memory_info())

    # Workaround: import sklearn here, after the searcher is closed,
    # instead of at module level.
    # from sklearn import manifold
    # from sklearn.decomposition import PCA


if __name__ == '__main__':

    # Only MemoryError is the failure this script is meant to reproduce.
    # Any other exception propagates with its full traceback instead of
    # being reported under the misleading "Memory error" label.
    try:
        main()
    except MemoryError:
        show_event('Memory error')
        show_memory(process.memory_info())
    else:
        show_event('Success')
        show_memory(process.memory_info())

Here is the output on success:

Start----------------------------------------: wset = 12 MB  peak_wset = 12 MB
whoosh.index---------------------------------: wset = 16 MB  peak_wset = 16 MB
whoosh.qparser-------------------------------: wset = 18 MB  peak_wset = 18 MB
from sklearn import manifold-----------------: wset = 63 MB  peak_wset = 63 MB
from sklearn.decomposition import PCA--------: wset = 63 MB  peak_wset = 63 MB
open_index()---------------------------------: wset = 1543 MB  peak_wset = 1543 MB
Hit:  498
search_whoosh_index--------------------------: wset = 1545 MB  peak_wset = 1545 MB
close_searcher-------------------------------: wset = 806 MB  peak_wset = 1545 MB
Success--------------------------------------: wset = 806 MB  peak_wset = 1545 MB

I don't know if the whoosh project is abandoned or not. Anyway, I hope the solution to this issue will help somebody.

Comments (0)

  1. Log in to comment