Writer part in python3 not working with **metadata

Issue #32 new
gritschel created an issue

Hi AJ (me again ...),

In the current version the writer part always raises an error when I run (as always) my "read data -- read metadata -- write everything" code snippet in Python 3 (under Windows).

Here is the code:

from __future__ import division, print_function
#from __future__ import absolute_import, unicode_literals

import numpy as np
import os
import sys
import savReaderWriter as sav

directory = "C:\\Users\\RitschelG\\Projekte\\spss_to_pandas"
spss_filename = r"test.sav"
spss_file = os.path.join(directory, spss_filename)

# read SPSS file data
ioLocale = "german" if sys.platform.startswith("win") else "de_DE.cp1252"
data = sav.SavReader(spss_file, returnHeader=True, ioUtf8=True, ioLocale=ioLocale, rawMode=True)
with data:
    allData = data.all()
variables = allData[0]
records = allData[1:]

# read SPSS file metadata
with sav.SavHeaderReader(spss_file, ioUtf8=True, ioLocale=ioLocale) as header:
    metadata = header.dataDictionary(asNamedtuple=False)  # Why does this take so long?

# write (unmodified) data to SPSS file
spss_file_out = os.path.join(directory, 'out.sav')
with sav.SavWriter(spss_file_out, overwrite=True, ioUtf8=True, ioLocale=ioLocale,
                   mode=b'wb', refSavFileName=None, **metadata) as writer:
    for i, record in enumerate(records):
        writer.writerow(record)

And here the error message:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-1-0d7726b30128> in <module>()
     26 spss_file_out = os.path.join(directory, 'out.sav')
     27 with sav.SavWriter(spss_file_out, overwrite=True, ioUtf8=True, ioLocale=ioLocale,
---> 28                    mode=b'wb', refSavFileName=None, **metadata) as writer:
     29     for i, record in enumerate(records):
     30         writer.writerow(record)

C:\Users\RitschelG\AppData\Local\Continuum\Anaconda3\lib\site-packages\savreaderwriter-3.3.0-py3.4.egg\savReaderWriter\savWriter.py in __init__(self, savFileName, varNames, varTypes, valueLabels, varLabels, formats, missingValues, measureLevels, columnWidths, alignments, varSets, varRoles, varAttributes, fileAttributes, fileLabel, multRespDefs, caseWeightVar, overwrite, ioUtf8, ioLocale, mode, refSavFileName)
    183             self.varLabels = varLabels
    184             self.formats = formats
--> 185             self.missingValues = missingValues
    186             self.measureLevels = measureLevels
    187             self.columnWidths = columnWidths

C:\Users\RitschelG\AppData\Local\Continuum\Anaconda3\lib\site-packages\savreaderwriter-3.3.0-py3.4.egg\savReaderWriter\header.py in missingValues(self, missingValues)
    623         if missingValues:
    624             for varName, kwargs in missingValues.items():
--> 625                 self._setMissingValue(varName, **kwargs)
    626 
    627     # measurelevel, colwidth and alignment must all be set or not at all.

C:\Users\RitschelG\AppData\Local\Continuum\Anaconda3\lib\site-packages\savreaderwriter-3.3.0-py3.4.egg\savReaderWriter\header.py in _setMissingValue(self, varName, **kwargs)
    551                 raise ValueError("Missing value label > 9 bytes")
    552 
--> 553             nvalues = len(values) if values is not None else values
    554             if values is None or values == {}:
    555                 missingFmt = "SPSS_NO_MISSVAL"

TypeError: object of type 'map' has no len()

Note: It doesn't matter whether rawMode is set True or False.

When trying the same with ioUtf8=False everywhere, I get the following (different) error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-1-f9299041e964> in <module>()
     26 spss_file_out = os.path.join(directory, 'out.sav')
     27 with sav.SavWriter(spss_file_out, overwrite=True, ioUtf8=False, ioLocale=ioLocale,
---> 28                    mode=b'wb', refSavFileName=None, **metadata) as writer:
     29     for i, record in enumerate(records):
     30         writer.writerow(record)

C:\Users\RitschelG\AppData\Local\Continuum\Anaconda3\lib\site-packages\savreaderwriter-3.3.0-py3.4.egg\savReaderWriter\savWriter.py in __init__(self, savFileName, varNames, varTypes, valueLabels, varLabels, formats, missingValues, measureLevels, columnWidths, alignments, varSets, varRoles, varAttributes, fileAttributes, fileLabel, multRespDefs, caseWeightVar, overwrite, ioUtf8, ioLocale, mode, refSavFileName)
    183             self.varLabels = varLabels
    184             self.formats = formats
--> 185             self.missingValues = missingValues
    186             self.measureLevels = measureLevels
    187             self.columnWidths = columnWidths

C:\Users\RitschelG\AppData\Local\Continuum\Anaconda3\lib\site-packages\savreaderwriter-3.3.0-py3.4.egg\savReaderWriter\header.py in missingValues(self, missingValues)
    623         if missingValues:
    624             for varName, kwargs in missingValues.items():
--> 625                 self._setMissingValue(varName, **kwargs)
    626 
    627     # measurelevel, colwidth and alignment must all be set or not at all.

TypeError: _setMissingValue() keywords must be strings

Note: It doesn't matter whether rawMode is set True or False.

I guess this one has to do with the str vs. unicode type issue in python3.

Comments (5)

  1. Albert-Jan Roskam repo owner

    hi, thanks once again!

    • Argh, still more Python 3 problems. I do not have access to Python 3 now, but nvalues = len(values) should probably be nvalues = len(list(values)) for it to also work with Python 3. Something like map(float, range(10)) returns a list in Python 2, but an 'object of type 'map'' in Python 3.

    • The second point appears to be caused by the fact that you specified lower/upper/value/values as a bytestring and not as a unicode string. See the comment in the documentation: "note also that 'lower', 'upper', 'value(s)' are without b' prefix".

  2. gritschel reporter

    Argh, indeed. :-) I never tried to write py2/3 compatible code myself, but I see that one has to take into account many pitfalls.

    Concerning the second point: Well, I did not specify anything by hand (see code above). I am just piping everything from sav.SavHeaderReader back to sav.SavWriter using dictionary unpacking, i.e. **metadata. So from my user's point of view I can only say that the writer routine does not seem to be happy with what the reader routine provided. So in a sense it is "not my fault". ;-)

  3. Albert-Jan Roskam repo owner

    Hi Gerhard, I just pushed some changes (163a1c518b). It is now possible to use the **metadata trick even in unicode mode. Based on your code above I wrote two test cases: https://bitbucket.org/fomcl/savreaderwriter/src/163a1c518b52472eb0e23637d849b4aa005a35b4/savReaderWriter/unit_tests/test_SavHeaderReader_metadata_roundtrip_ioUtf8.py?at=master

    Basically, you need to use ioUtf8=UNICODE_BMODE so data are returned as bytes

    • codepage mode: ioUtf8 = CODEPAGE_MODE, or ioUtf8 = 0, or ioUtf8 = False
    • standard unicode mode: ioUtf8 = UNICODE_UMODE, or ioUtf8 = 1, or ioUtf8 = True
    • bytes unicode mode: ioUtf8 = UNICODE_BMODE, or ioUtf8 = 2

    There is still some work here, but I like this approach.

    Oh, and the lower-upper-values thing from missingValues is also fixed.

    Would you like to try the test script in the link?

    Regards, Albert-Jan

  4. gritschel reporter

    Hi AJ, thanks for the update! I am going to try out your test script. It's fantastic news that you have been working on the **metadata trick -- I also like this approach. ;-) I will try it out and let you know my findings ... Regards, Gerhard

  5. gritschel reporter

    I tried your unit test script and possibly found some unexpected behaviour. You wrote in the comments that the unit test fails if ioLocale is specified. I found that this is not the issue. Even without ioLocale specified the unit test fails on the first run, but it succeeds on the second and subsequent runs. I couldn't find the reason for this. But it happens with and without specifying ioLocale.

    Here's what I did (version savReaderWriter-163a1c518b52472eb0e23637d849b4aa005a35b4 on python2 32bit and win7)

    Python 2.7.8 |Anaconda 2.1.0 (32-bit)| (default, Jul  2 2014, 15:13:35) [MSC v.1500 32 bit (Intel)]
    Type "copyright", "credits" or "license" for more information.
    
    IPython 2.2.0 -- An enhanced Interactive Python.
    Anaconda is brought to you by Continuum Analytics.
    Please check out: http://continuum.io/thanks and https://binstar.org
    ?         -> Introduction and overview of IPython's features.
    %quickref -> Quick reference.
    help      -> Python's own help system.
    object?   -> Details about 'object', use 'object??' for extra details.
    %guiref   -> A brief reference about the graphical user interface.
    
    In [1]: cd "C:/Users/RitschelG/AppData/Local/Continuum/32bit/Anaconda/Lib/site-packages\savReaderWriter-163a1c518b52472eb0e23637d849b4aa005a35b4-py2.7.egg\savReaderWriter"
    C:\Users\RitschelG\AppData\Local\Continuum\32bit\Anaconda\Lib\site-packages\savReaderWriter-163a1c518b52472eb0e23637d849b4aa005a35b4-py2.7.egg\savReaderWriter
    
    In [2]: run "unit_tests/test_SavHeaderReader_metadata_roundtrip_ioUtf8.py"
    .F
    ======================================================================
    FAIL: test_metadata_same (__main__.Test_MetadataRoundTrip)
    ----------------------------------------------------------------------
    Traceback (most recent call last):
      File "C:\Users\RitschelG\AppData\Local\Continuum\32bit\Anaconda\Lib\site-packages\savReaderWriter-163a1c518b52472eb0e23637d849b4aa005a35b4-py2.7.egg\savReaderWriter\unit_tests\test_SavHeaderReader_metadata_roundtrip_ioUtf8.py", line 64, in test_metadata_same
        self.assertEqual(in_metadata, out_metadata)
    AssertionError: {u'valueLabels': {u'job': {0.0: u'unemployed', 1.0: u'worker', 2.0: u'employee'} [truncated]... != {u'valueLabels': {u'job': {0.0: u'unemployed', 1.0: u'worker', 2.0: u'employee'} [truncated]...
      {u'alignments': {u'count': u'right',
                       u'date': u'center',
                       u'float': u'right',
                       u'job': u'right',
                       u'name': u'left',
                       u'sex': u'right',
                       u'\xfcmlaut': u'left'},
       u'caseWeightVar': u'',
       u'columnWidths': {u'count': 8,
                         u'date': 8,
                         u'float': 8,
                         u'job': 8,
                         u'name': 8,
                         u'sex': 8,
                         u'\xfcmlaut': 8},
       u'fileAttributes': {},
       u'fileLabel': u'',
       u'formats': {u'count': u'F8',
                    u'date': u'SDATE10',
                    u'float': u'F8.2',
                    u'job': u'F8',
    -               u'name': u'A20',
    ?                           ^
    
    +               u'name': u'A60',
    ?                           ^
    
                    u'sex': u'F2',
    -               u'\xfcmlaut': u'A8'},
    ?                                ^
    
    +               u'\xfcmlaut': u'A24'},
    ?                                ^^
    
       u'measureLevels': {u'count': u'ordinal',
                          u'date': u'ratio',
                          u'float': u'ratio',
                          u'job': u'nominal',
                          u'name': u'nominal',
                          u'sex': u'nominal',
                          u'\xfcmlaut': u'nominal'},
       u'missingValues': {u'count': {},
                          u'date': {},
                          u'float': {},
                          u'job': {u'values': [9.0]},
                          u'name': {},
                          u'sex': {u'values': [9.0]},
                          u'\xfcmlaut': {}},
       u'multRespDefs': {},
       u'valueLabels': {u'job': {0.0: u'unemployed',
                                 1.0: u'worker',
                                 2.0: u'employee'},
                        u'sex': {0.0: u'female', 1.0: u'male', 2.0: u'unclear'}},
       u'varAttributes': {},
       u'varLabels': {u'count': u'running integer number',
                      u'date': u'',
                      u'float': u'',
                      u'job': u'job description',
                      u'name': u'',
                      u'sex': u'',
                      u'\xfcmlaut': u'contains \xfcmlauts'},
       u'varNames': [u'count',
                     u'name',
                     u'date',
                     u'float',
                     u'\xfcmlaut',
                     u'sex',
                     u'job'],
       u'varRoles': {u'count': u'input',
                     u'date': u'input',
                     u'float': u'input',
                     u'job': u'input',
                     u'name': u'input',
                     u'sex': u'input',
                     u'\xfcmlaut': u'input'},
       u'varSets': {},
       u'varTypes': {u'count': 0,
                     u'date': 0,
                     u'float': 0,
                     u'job': 0,
    -                u'name': 20,
    ?                         ^
    
    +                u'name': 60,
    ?                         ^
    
                     u'sex': 0,
    -                u'\xfcmlaut': 8}}
    ?                              ^
    
    +                u'\xfcmlaut': 24}}
    ?                              ^^
    
    
    ----------------------------------------------------------------------
    Ran 2 tests in 0.184s
    
    FAILED (failures=1)
    
    In [3]: run "unit_tests/test_SavHeaderReader_metadata_roundtrip_ioUtf8.py"
    ..
    ----------------------------------------------------------------------
    Ran 2 tests in 0.180s
    
    OK
    

    PS: I ran your script without any modification.

  6. Log in to comment