Writer part in python3 not working with **metadata
Hi AJ (me again ...),
In the current version, the writer part always raises an error when I run my usual "read data -- read metadata -- write everything" code snippet in Python 3 (under Windows).
Here is the code:
from __future__ import division, print_function
#from __future__ import absolute_import, unicode_literals
import numpy as np
import os
import sys
import savReaderWriter as sav
directory = "C:\\Users\\RitschelG\\Projekte\\spss_to_pandas"
spss_filename = r"test.sav"
spss_file = os.path.join(directory, spss_filename)
# read SPSS file data
ioLocale = "german" if sys.platform.startswith("win") else "de_DE.cp1252"
data = sav.SavReader(spss_file, returnHeader=True, ioUtf8=True, ioLocale=ioLocale, rawMode=True)
with data:
allData = data.all()
variables = allData[0]
records = allData[1:]
# read SPSS file metadata
with sav.SavHeaderReader(spss_file, ioUtf8=True, ioLocale=ioLocale) as header:
metadata = header.dataDictionary(asNamedtuple=False) # Why does this take so long?
# write (unmodified) data to SPSS file
spss_file_out = os.path.join(directory, 'out.sav')
with sav.SavWriter(spss_file_out, overwrite=True, ioUtf8=True, ioLocale=ioLocale,
mode=b'wb', refSavFileName=None, **metadata) as writer:
for i, record in enumerate(records):
writer.writerow(record)
And here the error message:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-1-0d7726b30128> in <module>()
26 spss_file_out = os.path.join(directory, 'out.sav')
27 with sav.SavWriter(spss_file_out, overwrite=True, ioUtf8=True, ioLocale=ioLocale,
---> 28 mode=b'wb', refSavFileName=None, **metadata) as writer:
29 for i, record in enumerate(records):
30 writer.writerow(record)
C:\Users\RitschelG\AppData\Local\Continuum\Anaconda3\lib\site-packages\savreaderwriter-3.3.0-py3.4.egg\savReaderWriter\savWriter.py in __init__(self, savFileName, varNames, varTypes, valueLabels, varLabels, formats, missingValues, measureLevels, columnWidths, alignments, varSets, varRoles, varAttributes, fileAttributes, fileLabel, multRespDefs, caseWeightVar, overwrite, ioUtf8, ioLocale, mode, refSavFileName)
183 self.varLabels = varLabels
184 self.formats = formats
--> 185 self.missingValues = missingValues
186 self.measureLevels = measureLevels
187 self.columnWidths = columnWidths
C:\Users\RitschelG\AppData\Local\Continuum\Anaconda3\lib\site-packages\savreaderwriter-3.3.0-py3.4.egg\savReaderWriter\header.py in missingValues(self, missingValues)
623 if missingValues:
624 for varName, kwargs in missingValues.items():
--> 625 self._setMissingValue(varName, **kwargs)
626
627 # measurelevel, colwidth and alignment must all be set or not at all.
C:\Users\RitschelG\AppData\Local\Continuum\Anaconda3\lib\site-packages\savreaderwriter-3.3.0-py3.4.egg\savReaderWriter\header.py in _setMissingValue(self, varName, **kwargs)
551 raise ValueError("Missing value label > 9 bytes")
552
--> 553 nvalues = len(values) if values is not None else values
554 if values is None or values == {}:
555 missingFmt = "SPSS_NO_MISSVAL"
TypeError: object of type 'map' has no len()
Note: It doesn't matter whether `rawMode` is set to True or False.
When trying the same with `ioUtf8=False` everywhere, I get the following (different) error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-1-f9299041e964> in <module>()
26 spss_file_out = os.path.join(directory, 'out.sav')
27 with sav.SavWriter(spss_file_out, overwrite=True, ioUtf8=False, ioLocale=ioLocale,
---> 28 mode=b'wb', refSavFileName=None, **metadata) as writer:
29 for i, record in enumerate(records):
30 writer.writerow(record)
C:\Users\RitschelG\AppData\Local\Continuum\Anaconda3\lib\site-packages\savreaderwriter-3.3.0-py3.4.egg\savReaderWriter\savWriter.py in __init__(self, savFileName, varNames, varTypes, valueLabels, varLabels, formats, missingValues, measureLevels, columnWidths, alignments, varSets, varRoles, varAttributes, fileAttributes, fileLabel, multRespDefs, caseWeightVar, overwrite, ioUtf8, ioLocale, mode, refSavFileName)
183 self.varLabels = varLabels
184 self.formats = formats
--> 185 self.missingValues = missingValues
186 self.measureLevels = measureLevels
187 self.columnWidths = columnWidths
C:\Users\RitschelG\AppData\Local\Continuum\Anaconda3\lib\site-packages\savreaderwriter-3.3.0-py3.4.egg\savReaderWriter\header.py in missingValues(self, missingValues)
623 if missingValues:
624 for varName, kwargs in missingValues.items():
--> 625 self._setMissingValue(varName, **kwargs)
626
627 # measurelevel, colwidth and alignment must all be set or not at all.
TypeError: _setMissingValue() keywords must be strings
Note: It doesn't matter whether `rawMode` is set to True or False.
I guess this one has to do with the str vs. unicode type issue in python3.
Comments (5)
-
repo owner -
reporter Argh, indeed. :-) I never tried to write py2/3 compatible code myself, but I see that one has to take into account many pitfalls.
Concerning the second point: Well, I did not specify anything by hand (see code above). I am just piping everything from sav.SavHeaderReader back to sav.SavWriter using dictionary unpacking, i.e. `**metadata`. So from my point of view as a user, I can only say that the writer routine does not seem to be happy with what the reader routine provided. So in a sense it is "not my fault". ;-) -
repo owner Hi Gerhard, I just pushed some changes (163a1c518b). It is now possible to use the `**metadata` trick even in unicode mode. Based on your code above I wrote two test cases: https://bitbucket.org/fomcl/savreaderwriter/src/163a1c518b52472eb0e23637d849b4aa005a35b4/savReaderWriter/unit_tests/test_SavHeaderReader_metadata_roundtrip_ioUtf8.py?at=master
Basically, you need to use `ioUtf8=UNICODE_BMODE` so that data are returned as bytes. The available modes are:
- codepage mode: ioUtf8 = CODEPAGE_MODE, or ioUtf8 = 0, or ioUtf8 = False
- standard unicode mode: ioUtf8 = UNICODE_UMODE, or ioUtf8 = 1, or ioUtf8 = True
- bytes unicode mode: ioUtf8 = UNICODE_BMODE, or ioUtf8 = 2
There is still some work here, but I like this approach.
Oh, and the lower-upper-values thing from missingValues is also fixed.
Would you like to try the test script in the link?
Regards, Albert-Jan
-
reporter Hi AJ, thanks for the update! I am going to try out your test script. It's fantastic news that you have been working on the `**metadata` trick -- I also like this approach. ;-) I will try it out and let you know my findings ... Regards, Gerhard -
reporter I tried your unit test script and possibly found some unexpected behaviour. You wrote in the comments that the unit test fails if ioLocale is specified. I found that this is not the issue: even without ioLocale specified, the unit test fails on the first run but succeeds on the second and subsequent runs. I couldn't find the reason for this, but it happens both with and without specifying ioLocale.
Here's what I did (version savReaderWriter-163a1c518b52472eb0e23637d849b4aa005a35b4 on python2 32bit and win7)
Python 2.7.8 |Anaconda 2.1.0 (32-bit)| (default, Jul 2 2014, 15:13:35) [MSC v.1500 32 bit (Intel)] Type "copyright", "credits" or "license" for more information. IPython 2.2.0 -- An enhanced Interactive Python. Anaconda is brought to you by Continuum Analytics. Please check out: http://continuum.io/thanks and https://binstar.org ? -> Introduction and overview of IPython's features. %quickref -> Quick reference. help -> Python's own help system. object? -> Details about 'object', use 'object??' for extra details. %guiref -> A brief reference about the graphical user interface. In [1]: cd "C:/Users/RitschelG/AppData/Local/Continuum/32bit/Anaconda/Lib/site-packages\savReaderWriter-163a1c518b52472eb0e23637d849b4aa005a35b4-py2.7.egg\savReaderWriter" C:\Users\RitschelG\AppData\Local\Continuum\32bit\Anaconda\Lib\site-packages\savReaderWriter-163a1c518b52472eb0e23637d849b4aa005a35b4-py2.7.egg\savReaderWriter In [2]: run "unit_tests/test_SavHeaderReader_metadata_roundtrip_ioUtf8.py" .F ====================================================================== FAIL: test_metadata_same (__main__.Test_MetadataRoundTrip) ---------------------------------------------------------------------- Traceback (most recent call last): File "C:\Users\RitschelG\AppData\Local\Continuum\32bit\Anaconda\Lib\site-packages\savReaderWriter-163a1c518b52472eb0e23637d849b4aa005a35b4-py2.7.egg\savReaderWriter\unit_tests\test_SavHeaderReader_metadata_roundtrip_ioUtf8.py", line 64, in test_metadata_same self.assertEqual(in_metadata, out_metadata) AssertionError: {u'valueLabels': {u'job': {0.0: u'unemployed', 1.0: u'worker', 2.0: u'employee'} [truncated]... != {u'valueLabels': {u'job': {0.0: u'unemployed', 1.0: u'worker', 2.0: u'employee'} [truncated]... 
{u'alignments': {u'count': u'right', u'date': u'center', u'float': u'right', u'job': u'right', u'name': u'left', u'sex': u'right', u'\xfcmlaut': u'left'}, u'caseWeightVar': u'', u'columnWidths': {u'count': 8, u'date': 8, u'float': 8, u'job': 8, u'name': 8, u'sex': 8, u'\xfcmlaut': 8}, u'fileAttributes': {}, u'fileLabel': u'', u'formats': {u'count': u'F8', u'date': u'SDATE10', u'float': u'F8.2', u'job': u'F8', - u'name': u'A20', ? ^ + u'name': u'A60', ? ^ u'sex': u'F2', - u'\xfcmlaut': u'A8'}, ? ^ + u'\xfcmlaut': u'A24'}, ? ^^ u'measureLevels': {u'count': u'ordinal', u'date': u'ratio', u'float': u'ratio', u'job': u'nominal', u'name': u'nominal', u'sex': u'nominal', u'\xfcmlaut': u'nominal'}, u'missingValues': {u'count': {}, u'date': {}, u'float': {}, u'job': {u'values': [9.0]}, u'name': {}, u'sex': {u'values': [9.0]}, u'\xfcmlaut': {}}, u'multRespDefs': {}, u'valueLabels': {u'job': {0.0: u'unemployed', 1.0: u'worker', 2.0: u'employee'}, u'sex': {0.0: u'female', 1.0: u'male', 2.0: u'unclear'}}, u'varAttributes': {}, u'varLabels': {u'count': u'running integer number', u'date': u'', u'float': u'', u'job': u'job description', u'name': u'', u'sex': u'', u'\xfcmlaut': u'contains \xfcmlauts'}, u'varNames': [u'count', u'name', u'date', u'float', u'\xfcmlaut', u'sex', u'job'], u'varRoles': {u'count': u'input', u'date': u'input', u'float': u'input', u'job': u'input', u'name': u'input', u'sex': u'input', u'\xfcmlaut': u'input'}, u'varSets': {}, u'varTypes': {u'count': 0, u'date': 0, u'float': 0, u'job': 0, - u'name': 20, ? ^ + u'name': 60, ? ^ u'sex': 0, - u'\xfcmlaut': 8}} ? ^ + u'\xfcmlaut': 24}} ? ^^ ---------------------------------------------------------------------- Ran 2 tests in 0.184s FAILED (failures=1) In [3]: run "unit_tests/test_SavHeaderReader_metadata_roundtrip_ioUtf8.py" .. ---------------------------------------------------------------------- Ran 2 tests in 0.180s OK
PS: I ran your script without any modification.
- Log in to comment
hi, thanks once again!
Argh, still more Python 3 problems. I do not have access to Python 3 now, but
nvalues = len(values)
should probably be `nvalues = len(list(values))`
for it to also work with Python 3. Something like `map(float, range(10))`
returns a list in Python 2, but an 'object of type 'map'' in Python 3. The second point appears to be caused by the fact that you specified lower/upper/value/values as a bytestring and not as a unicode string. See the documentation: "# note also that 'lower', 'upper', 'value(s)' are without b' prefix"