Commits

AJ R committed ca59733 Merge

Merge branch 'py3k' of https://bitbucket.org/fomcl/savreaderwriter into py3k

  • Participants
  • Parent commits d797125, a577abf

Comments (0)

Files changed (4)

File savReaderWriter/__init__.py

     "SPSS_MAX_ENCODING": (64, "Maximum encoding text")}
 
 supportedDates = {  # uses ISO dates wherever applicable.
-    "DATE": "%Y-%m-%d",
-    "JDATE": "%Y-%m-%d",
-    "EDATE": "%Y-%m-%d",
-    "SDATE": "%Y-%m-%d",
-    "DATETIME": "%Y-%m-%d %H:%M:%S",
-    "ADATE": "%Y-%m-%d",
-    "WKDAY": "%A",
-    "MONTH": "%B",
-    "MOYR": "%B %Y",
-    "WKYR": "%W WK %Y",
-    "QYR": "%m Q %Y",  # %m (month) is converted to quarter, see next dict.
-    "TIME": "%H:%M:%S.%f",
-    "DTIME": "%d %H:%M:%S"}
+    b"DATE": "%Y-%m-%d",
+    b"JDATE": "%Y-%m-%d",
+    b"EDATE": "%Y-%m-%d",
+    b"SDATE": "%Y-%m-%d",
+    b"DATETIME": "%Y-%m-%d %H:%M:%S",
+    b"ADATE": "%Y-%m-%d",
+    b"WKDAY": "%A",
+    b"MONTH": "%B",
+    b"MOYR": "%B %Y",
+    b"WKYR": "%W WK %Y",
+    b"QYR": "%m Q %Y",  # %m (month) is converted to quarter, see next dict.
+    b"TIME": "%H:%M:%S.%f",
+    b"DTIME": "%d %H:%M:%S"}
 
 QUARTERS = {'01': '1', '02': '1', '03': '1', '04': '2', '05': '2', '06': '2',
             '07': '3', '08': '3', '09': '3', '10': '4', '11': '4', '12': '4'}

File savReaderWriter/header.py

         ...             u'setType': u'D',
         ...             u'varNames': [u'v1', u'v2']}}  # doctest: +SKIP
         True
-        """ 
-        uS = lambda x: x.decode("utf-8") if isinstance(x, str) else x
+        """
+        bytes_ = __builtins__["bytes"] if sys.version_info[0] > 2 else str
+        uS = lambda x: x.decode("utf-8") if isinstance(x, bytes_) else x
         uL = lambda x: map(uS, x) if isinstance(x, list) else x
         @functools.wraps(func)
         def wrapper(arg):
             result = func(arg)
             if not arg.ioUtf8:
                 return result  # unchanged
-            if isinstance(result, str):
+            if isinstance(result, bytes_):
                 return uS(result)
             uresult = {}
             for k, v in result.items():
         gc.collect()
         if segfaults:
             return
-        #print ".. freeing" , funcName[8:]
+        #print("... freeing", funcName[8:])
         func = getattr(self.spssio, funcName)
         retcode = func(*args)
         if retcode:
     def _splitformats(self):
         """This function returns the 'bare' formats + variable widths,
         e.g. format F5.3 is returned as 'F' and '5'"""
-        regex = re.compile(b"\w+(?P<varWid>\d+)[.]?\d?", re.I)
+        pattern = b"(?P<bareFmt>[a-z]+)(?P<varWid>\d+)[.]?\d*"
+        if self.ioUtf8_:
+            pattern = pattern.decode("utf-8")
+        regex = re.compile(pattern, re.I)
         bareformats, varWids = {}, {}
         for varName, format_ in self.formats.items():
-            bareformats[varName] = re.sub(b"\d+\.", b"", format_)
-            varWids[varName] = int(regex.search(format_).group("varWid"))
+            bareformat, varWid = regex.findall(format_)[0]
+            bareformats[varName] = bareformat
+            varWids[varName] = int(varWid)
         return bareformats, varWids
 
     @formats.setter

File savReaderWriter/savReader.py

     def __str__(self):
         """This function returns a conscise file report of the spss data file
         For example str(SavReader(savFileName))"""
-        return unicode(self).encode(self.fileEncoding)
+        return self.__unicode__.encode(self.fileEncoding)
 
     def __unicode__(self):
         """This function returns a conscise file report of the spss data file,
         For example unicode(SavReader(savFileName))"""
-        self.fileReport = self.getFileReport(self.savFileName, self.varNames,
-                                             self.varTypes, self.formats,
-                                             self.nCases)
+        self.fileReport = self.getFileReport()
         return self.fileReport
 
     @property
         """Helper function for formatValues function. Determines whether
         iterating over each individual value is really needed"""
         hasDates = bool(set(self.bareformats.values()) & set(supportedDates))
-        hasNfmt = "N" in self.bareformats
+        hasNfmt = b"N" in self.bareformats
         hasRecodeSysmis = self.recodeSysmisTo is not None
         items = [hasDates, hasNfmt, hasRecodeSysmis, self.ioUtf8_]
         return False if any(items) else True
         to <recodeSysmisTo>. If rawMode==True, this function does nothing"""
         if self.rawMode or self.autoRawMode:
             return record  # 6-7 times faster!
-
         for i, value in enumerate(record):
             varName = self.header[i]
             varType = self.varTypes[varName]
                 self.gregorianEpoch = datetime.datetime(1582, 10, 14, 0, 0, 0)
             theDate = (self.gregorianEpoch +
                        datetime.timedelta(seconds=spssDateValue))
-            return datetime.datetime.strftime(theDate, fmt)
+            return bytes(datetime.datetime.strftime(theDate, fmt))
         except (OverflowError, TypeError, ValueError):
             return recodeSysmisTo
 
-    def getFileReport(self, savFileName, varNames, varTypes,
-                      formats, nCases):
+    def getFileReport(self):
         """ This function prints a report about basic file characteristics """
-        bytes = os.path.getsize(savFileName)
+        bytes = os.path.getsize(self.savFileName)
         kb = float(bytes) / 2**10
         mb = float(bytes) / 2**20
         (fileSize, label) = (mb, "MB") if mb > 1 else (kb, "kB")
         lang, cp = locale.getlocale()
         intEnc = "Utf-8/Unicode" if self.ioUtf8 else "Codepage (%s)" % cp
         varlist = []
-        line = "  %%0%sd. %%s (%%s - %%s)" % len(str(len(varNames) + 1))
-        for cnt, varName in enumerate(varNames):
-            lbl = "string" if varTypes[varName] > 0 else "numerical"
-            format_ = formats[varName]
+        line = "  %%0%sd. %%s (%%s - %%s)" % len(str(len(self.varNames) + 1))
+        for cnt, varName in enumerate(self.varNames):
+            lbl = "string" if self.varTypes[varName] > 0 else "numerical"
+            format_ = self.formats[varName]
             varlist.append(line % (cnt + 1, varName, format_, lbl))
-        info = {"savFileName": savFileName,
+        info = {"savFileName": self.savFileName,
                 "fileSize": fileSize,
                 "label": label,
-                "nCases": nCases,
-                "nCols": len(varNames),
-                "nValues": nCases * len(varNames),
+                "nCases": self.nCases,
+                "nCols": len(self.varNames),
+                "nValues": self.nCases * len(self.varNames),
                 "spssVersion": "%s (%s)" % (systemString, spssVersion),
                 "ioLocale": self.ioLocale,
                 "ioUtf8": intEnc,

File savReaderWriter/unit_tests/test_SavReader_ioUtf8_mode.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+##############################################################################
+## Read a file in utf8 mode aka unicode mode (ioUtf8=True)
+##############################################################################
+
+## ... check if non-ascii encodings work well
+## ... notice the use of 'ioUtf8'
+## ... source: http://www.omniglot.com/language/phrases/hello.htm
+
+import unittest
+from savReaderWriter import *
+
+greetings_expected = [
+    [u'Arabic', u'\u0627\u0644\u0633\u0644\u0627\u0645 \u0639\u0644\u064a\u0643\u0645'],
+    [u'Assamese', u'\u09a8\u09ae\u09b8\u09cd\u0995\u09be\u09f0'],
+    [u'Bengali', u'\u0986\u09b8\u09b8\u09be\u09b2\u09be\u09ae\u09c1\u0986\u09b2\u09be\u0987\u0995\u09c1\u09ae'],
+    [u'English', u'Greetings and salutations'],
+    [u'Georgian', u'\u10d2\u10d0\u10db\u10d0\u10e0\u10ef\u10dd\u10d1\u10d0'],
+    [u'Kazakh', u'\u0421\u04d9\u043b\u0435\u043c\u0435\u0442\u0441\u0456\u0437 \u0431\u0435'],
+    [u'Russian', u'\u0417\u0434\u0440\u0430\u0432\u0441\u0442\u0432\u0443\u0439\u0442\u0435'],
+    [u'Spanish', u'\xa1Hola!'],
+    [u'Swiss German', u'Gr\xfcezi'],
+    [u'Thai', u'\u0e2a\u0e27\u0e31\u0e2a\u0e14\u0e35'],
+    [u'Walloon', u'Bondjo\xfb'],
+    [u'Telugu', u'\u0c0f\u0c2e\u0c02\u0c21\u0c40']]
+
+class test_SavReader_typical_use(unittest.TestCase):
+    """ Read a file, ioUtf=True use"""
+
+    def test_SavReader_unicode_mode(self):
+
+        savFileName = "../savReaderWriter/test_data/greetings.sav"
+        greetings_got = []
+        with SavReader(savFileName, ioUtf8=True) as reader:
+            for record in reader:
+                language, greeting = record[1:]
+                greetings_got.append([language.rstrip(), greeting.rstrip()])
+        self.assertEqual(greetings_expected, greetings_got)
+
+if __name__ == "__main__":
+    unittest.main()