OverflowError when writer adds document

Issue #460 new
Qiyun Zhu created an issue

I was trying to index the titles and abstract texts of the entire PubMed (~2,500,500 articles in total). It worked fine until it was about one third of the way through. Here is the error message:

Traceback (most recent call last):
  File "/home/me/Programs/Miniconda2/envs/py3/lib/python3.5/site-packages/whoosh/util/numlists.py", line 57, in append
    self.array.append(n)
OverflowError: unsigned int is greater than maximum

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "index_pubmed.py", line 26, in <module>
    writer.add_document(title=title, content=abstract)
  File "/home/me/Programs/Miniconda2/envs/py3/lib/python3.5/site-packages/whoosh/writing.py", line 786, in add_document
    perdocwriter.finish_doc()
  File "/home/me/Programs/Miniconda2/envs/py3/lib/python3.5/site-packages/whoosh/codec/whoosh3.py", line 250, in finish_doc
    self.add_column_value("_stored", STORED_COLUMN, sf)
  File "/home/me/Programs/Miniconda2/envs/py3/lib/python3.5/site-packages/whoosh/codec/base.py", line 821, in add_column_value
    self._get_column(fieldname).add(self._docnum, value)
  File "/home/me/Programs/Miniconda2/envs/py3/lib/python3.5/site-packages/whoosh/columns.py", line 1265, in add
    self._child.add(docnum, v)
  File "/home/me/Programs/Miniconda2/envs/py3/lib/python3.5/site-packages/whoosh/columns.py", line 855, in add
    VarBytesColumn.Writer.add(self, docnum, v)
  File "/home/me/Programs/Miniconda2/envs/py3/lib/python3.5/site-packages/whoosh/columns.py", line 276, in add
    self._offsets.append(self._offset_base)
  File "/home/me/Programs/Miniconda2/envs/py3/lib/python3.5/site-packages/whoosh/util/numlists.py", line 59, in append
    self._retype(n)
  File "/home/me/Programs/Miniconda2/envs/py3/lib/python3.5/site-packages/whoosh/util/numlists.py", line 48, in _retype
    raise OverflowError("%r is too big to fit in an array" % maxnum)
OverflowError: 4294967382 is too big to fit in an array

Comments (2)

  1. Vimos

    Similar error here

    Traceback (most recent call last):
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/util/numlists.py", line 57, in append
        self.array.append(n)
    OverflowError: unsigned int is greater than maximum
    
    During handling of the above exception, another exception occurred:
    
    Traceback (most recent call last):
      File "/home/vimos/Data/Dataset/freebase/index_en.py", line 42, in <module>
        writer.commit()
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/multiproc.py", line 253, in commit
        self._commit(mergetype, optimize, merge)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/multiproc.py", line 268, in _commit
        finalsegments = self._merge_segments(mergetype, optimize, merge)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/writing.py", line 827, in _merge_segments
        return mergetype(self, self.segments)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/writing.py", line 101, in MERGE_SMALL
        writer.add_reader(reader)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/writing.py", line 709, in add_reader
        docmap = self.write_per_doc(fieldnames, reader)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/writing.py", line 697, in write_per_doc
        pdw.finish_doc()
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/codec/whoosh3.py", line 250, in finish_doc
        self.add_column_value("_stored", STORED_COLUMN, sf)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/codec/base.py", line 821, in add_column_value
        self._get_column(fieldname).add(self._docnum, value)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/columns.py", line 1265, in add
        self._child.add(docnum, v)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/columns.py", line 855, in add
        VarBytesColumn.Writer.add(self, docnum, v)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/columns.py", line 276, in add
        self._offsets.append(self._offset_base)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/util/numlists.py", line 59, in append
        self._retype(n)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/util/numlists.py", line 48, in _retype
        raise OverflowError("%r is too big to fit in an array" % maxnum)
    OverflowError: 4294967443 is too big to fit in an array
    

    Here is the version info:

      BIMPM-pytorch git:(master)  ipython                                           [18/05/23| 8:25PM]
    Python 3.6.4 |Anaconda custom (64-bit)| (default, Jan 16 2018, 18:10:19) 
    Type 'copyright', 'credits' or 'license' for more information
    IPython 6.2.1 -- An enhanced Interactive Python. Type '?' for help.
    
    In [1]: import whoosh
    
    In [2]: whoosh.__version__
    Out[2]: (2, 7, 4)
    

    But my problem was resolved after I changed my code from

    for j, fn in enumerate(os.scandir(en)):
        print('{}: {} {} -------------------------------------------------'.format(j, fn.name, count))
        writer = ix.writer(limitmb=4096, procs=4, multisegment=True)
        with gzip.open(fn, 'rb') as f:
            for i, l in enumerate(f):
                s, r, o, _ = l.decode().strip().split('\t')
                if not o.endswith('@en'):
                    continue
                count += 1
                writer.add_document(entity=s,
                                    relation=r,
                                    name=name_re.search(o).group('name'))
                if count % 100000 == 0:
                    print(i, l.decode(), o)
        writer.commit()
    

    to

    limitmb = 4096
    for j, fn in enumerate(os.scandir(en)):
        print('{}: {} {} -------------------------------------------------'.format(j, fn.name, count))
        writer = ix.writer(limitmb=limitmb, procs=6, multisegment=True)
        with gzip.open(fn, 'rb') as f:
            for i, l in enumerate(f):
                s, r, o, _ = l.decode().strip().split('\t')
                if not o.endswith('@en') or r in ['<http://www.w3.org/2000/01/rdf-schema#label>']:
                    continue
                count += 1
                writer.add_document(entity=s,
                                    relation=r,
                                    name=name_re.search(o).group('name'))
                if count % 500000 == 0:
                    print(count, i, l.decode())
                    writer.commit(merge=False)
                    writer = ix.writer(limitmb=limitmb, procs=6, multisegment=True)
            try:
                print(count, i, l.decode())
                writer.commit(merge=False)
            except Exception as e:
                print(e)
    

    I guess that when the file is too large, committing only once leads to this error.

    Update: The new code also fails:

    Traceback (most recent call last):
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/util/numlists.py", line 57, in append
        self.array.append(n)
    OverflowError: unsigned int is greater than maximum
    
    During handling of the above exception, another exception occurred:
    
    Traceback (most recent call last):
      File "index_en.py", line 62, in <module>
        writer.commit()
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/multiproc.py", line 253, in commit
        self._commit(mergetype, optimize, merge)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/multiproc.py", line 268, in _commit
        finalsegments = self._merge_segments(mergetype, optimize, merge)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/writing.py", line 827, in _merge_segments
        return mergetype(self, self.segments)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/writing.py", line 101, in MERGE_SMALL
        writer.add_reader(reader)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/writing.py", line 709, in add_reader
        docmap = self.write_per_doc(fieldnames, reader)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/writing.py", line 697, in write_per_doc
        pdw.finish_doc()
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/codec/whoosh3.py", line 250, in finish_doc
        self.add_column_value("_stored", STORED_COLUMN, sf)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/codec/base.py", line 821, in add_column_value
        self._get_column(fieldname).add(self._docnum, value)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/columns.py", line 1265, in add
        self._child.add(docnum, v)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/columns.py", line 855, in add
        VarBytesColumn.Writer.add(self, docnum, v)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/columns.py", line 276, in add
        self._offsets.append(self._offset_base)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/util/numlists.py", line 59, in append
        self._retype(n)
      File "/home/vimos/anaconda3/lib/python3.6/site-packages/whoosh/util/numlists.py", line 48, in _retype
        raise OverflowError("%r is too big to fit in an array" % maxnum)
    OverflowError: 4294967364 is too big to fit in an array
    
  2. Log in to comment