savReaderWriter / savReaderWriter / savReader.py

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from ctypes import *
import os
import operator
import locale
import datetime

from savReaderWriter import *
from header import *

class SavReader(Header):
    """ Read Spss system files (.sav, .zsav)

    Parameters:
    -savFileName: the file name of the spss data file
    -returnHeader: Boolean that indicates whether the first record should
        be a list of variable names (default = False)
    -recodeSysmisTo: indicates to which value missing values should
        be recoded (default = None),
    -selectVars: indicates which variables in the file should be selected.
        The variables should be specified as a list or a tuple of
        valid variable names. If None is specified, all variables
        in the file are used (default = None)
    -idVar: indicates which variable in the file should be used for use as id
        variable for the 'get' method (default = None)
    -verbose: Boolean that indicates whether information about the spss data
        file (e.g., number of cases, variable names, file size) should be
        printed on the screen (default = False).
    -rawMode: Boolean that indicates whether values should get SPSS-style
        formatting, and whether date variables (if present) should be converted
        to ISO-dates. If True, the program does not format any values, which
        increases processing speed. (default = False)
    -ioUtf8: indicates the mode in which text communicated to or from the
        I/O Module will be. Valid values are True (UTF-8 mode aka Unicode mode)
        and False (Codepage mode). Cf. SET UNICODE=ON/OFF (default = False)
    -ioLocale: indicates the locale of the I/O module. Cf. SET LOCALE (default
        = None, which corresponds to ".".join(locale.getlocale())

    Typical use:
    savFileName = "someFile.sav"
    with SavReader(savFileName, returnHeader=True) as reader:
        header = next(reader)
        for line in reader:
            process(line)
    """

    def __init__(self, savFileName, returnHeader=False, recodeSysmisTo=None,
                 verbose=False, selectVars=None, idVar=None, rawMode=False,
                 ioUtf8=False, ioLocale=None):
        """ Constructor. Initializes all vars that can be recycled """
        super(SavReader, self).__init__(savFileName, "rb", None,
                                        ioUtf8, ioLocale)
        self.savFileName = savFileName
        self.returnHeader = returnHeader
        self.recodeSysmisTo = recodeSysmisTo
        self.verbose = verbose
        self.selectVars = selectVars
        self.idVar = idVar
        self.rawMode = rawMode

        self.header = self.getHeader(self.selectVars)
        self.bareformats, self.varWids = self._splitformats()
        self.autoRawMode = self._isAutoRawMode()

        self.ioUtf8_ = ioUtf8
        self.sysmis_ = self.sysmis
        self.numVars = self.numberofVariables
        self.nCases = self.numberofCases

        self.myStruct = self.getStruct(self.varTypes, self.varNames)
        self.unpack_from = self.myStruct.unpack_from
        self.seekNextCase = self.spssio.spssSeekNextCase
        self.caseBuffer = self.getCaseBuffer()

        if psycoOk:
            self._items = psyco.proxy(self._items)  # 3 x faster!

    def __enter__(self):
        """ This function opens the spss data file (context manager)."""
        if self.verbose and self.ioUtf8_:
            print unicode(self).replace(os.linesep, "\n")
        elif self.verbose:
            print str(self).replace(os.linesep, "\n")
        return iter(self)

    def __exit__(self, type, value, tb):
        """ This function closes the spss data file and does some cleaning."""
        if type is not None:
            pass  # Exception occurred
        self.close()

    def close(self):
        """This function closes the spss data file and does some cleaning."""
        if not segfaults:
            self.closeSavFile(self.fh, mode="rb")
        del self.spssio

    def __len__(self):
        """ This function reports the number of cases (rows) in the spss data
        file. For example: len(SavReader(savFileName))"""
        return self.nCases

    def __cmp__(self, other):
        """ This function implements behavior for all of the comparison
        operators so comparisons can be made between SavReader instances,
        or comparisons between SavReader instances and integers."""
        if not isinstance(other, (SavReader, int)):
            raise TypeError
        other = other if isinstance(other, int) else len(other)
        if len(self) < other:
            return -1
        elif len(self) == other:
            return 0
        else:
            return 1

    def __hash__(self):
        """This function returns a hash value for the object to ensure it
        is hashable."""
        return id(self)

    def __str__(self):
        """This function returns a conscise file report of the spss data file
        For example str(SavReader(savFileName))"""
        return unicode(self).encode(self.fileEncoding)

    def __unicode__(self):
        """This function returns a conscise file report of the spss data file,
        For example unicode(SavReader(savFileName))"""
        self.fileReport = self.getFileReport(self.savFileName, self.varNames,
                                             self.varTypes, self.formats,
                                             self.nCases)
        return self.fileReport

    def _isAutoRawMode(self):
        """Helper function for formatValues function. Determines whether
        iterating over each individual value is really needed"""
        hasDates = bool(set(self.bareformats.values()) & set(supportedDates))
        hasNfmt = "N" in self.bareformats
        hasRecodeSysmis = self.recodeSysmisTo is not None
        items = [hasDates, hasNfmt, hasRecodeSysmis, self.ioUtf8_]
        return False if any(items) else True

    def formatValues(self, record):
        """This function formats date fields to ISO dates (yyyy-mm-dd), plus
        some other date/time formats. The SPSS N format is formatted to a
        character value with leading zeroes. System missing values are recoded
        to <recodeSysmisTo>. If rawMode==True, this function does nothing"""
        if self.rawMode or self.autoRawMode:
            return record  # 6-7 times faster!

        for i, value in enumerate(record):
            varName = self.header[i]
            varType = self.varTypes[varName]
            bareformat_ = self.bareformats[varName]
            varWid = self.varWids[varName]
            if varType == 0:
                # recode system missing values, if present and desired
                if value > self.sysmis_:
                    pass
                else:
                    record[i] = self.recodeSysmisTo
                # format N-type values (=numerical with leading zeroes)
                if bareformat_ == "N":
                    #record[i] = str(value).zfill(varWid)
                    record[i] = "%%0%dd" % varWid % value  #15 x faster (zfill)
                # convert SPSS dates to ISO dates
                elif bareformat_ in supportedDates:
                    fmt = supportedDates[bareformat_]
                    args = (value, fmt, self.recodeSysmisTo)
                    record[i] = self.spss2strDate(*args)
                    if bareformat_ == "QYR" and record[i]:
                        # convert month to quarter, e.g. 12 Q 1990 --> 4 Q 1990
                        # There is no such thing as a %q strftime directive
                        try:
                            record[i] = QUARTERS[record[i][:2]] + record[i][2:]
                        except KeyError:
                            record[i] = self.recodeSysmisTo
            elif varType > 0:
                value = value[:varType]
                if self.ioUtf8_:
                    record[i] = value.decode("utf-8")
                else:
                    record[i] = value
        return record

    def _items(self, start=0, stop=None, step=1, returnHeader=False):
        """ This is a helper function to implement the __getitem__ and
        the __iter__ special methods. """

        if returnHeader:
            yield self.header

        if all([start == 0, stop is None, step == 1]):  # used as iterator
            retcode = 0
        else:
            retcode = self.seekNextCase(c_int(self.fh), c_long(0))  # reset

        if retcode:
            checkErrsWarns("Problem seeking first case", retcode)

        if stop is None:
            stop = self.nCases

        selection = self.selectVars is not None
        for case in xrange(start, stop, step):
            if start:
                # only call this when iterating over part of the records
                retcode = self.seekNextCase(c_int(self.fh), c_long(case))
                if retcode:
                    checkErrsWarns("Problem seeking case %d" % case, retcode)
            elif stop == self.nCases:
                self.printPctProgress(case, self.nCases)

            record = self.record

            if selection:
                record = self.selector(record)
                record = [record] if len(record) == 1 else list(record)
            record = self.formatValues(record)
            yield record

    def __iter__(self):
        """This function allows the object to be used as an iterator"""
        return self._items(0, None, 1, self.returnHeader)

    def __getitem__(self, key):
        """ This function reports the record of case number <key>.
        For example: firstRecord = SavReader(savFileName)[0].
        The <key> argument may also be a slice, for example:
        firstfiveRecords = SavReader(savFileName)[:5].
        You can also do stuff like (numpy required!):
        savReader(savFileName)[1:5, 1]"""

        is_slice = isinstance(key, slice)
        is_array_slice = key is Ellipsis or isinstance(key, tuple)

        if is_slice:
            start, stop, step = key.indices(self.nCases)
        elif is_array_slice:
            return self._get_array_slice(key, self.nCases, len(self.header))
        else:
            key = operator.index(key)
            start = key + self.nCases if key < 0 else key
            if not 0 <= start < self.nCases:
                raise IndexError("Index out of bounds")
            stop = start + 1
            step = 1

        records = self._items(start, stop, step)
        if is_slice:
            return list(records)
        return next(records)

    def _get_array_slice(self, key, nRows, nCols):
        """This is a helper function to implement array slicing with numpy"""
        if not numpyOk:
            raise ImportError("Array slicing requires the numpy library")

        is_index = False
        rstart = cstart = 0
        try:
            row, col = key
            if row < 0:
                row = nRows + row
            if col < 0:
                col = nCols + col

            ## ... slices
            if isinstance(row, slice) and col is Ellipsis:
                # reader[1:2, ...]
                rstart, rstop, rstep = row.indices(nRows)
                cstart, cstop, cstep = 0, nRows, 1
            elif row is Ellipsis and isinstance(col, slice):
                # reader[..., 1:2]
                rstart, rstop, rstep = 0, nRows, 1
                cstart, cstop, cstep = col.indices(nCols)
            elif isinstance(row, slice) and isinstance(col, slice):
                # reader[1:2, 1:2]
                rstart, rstop, rstep = row.indices(nRows)
                cstart, cstop, cstep = col.indices(nCols)
            elif row is Ellipsis and col is Ellipsis:
                # reader[..., ...]
                rstart, rstop, rstep = 0, nRows, 1
                cstart, cstop, cstep = 0, nCols, 1

            ## ... indexes
            elif isinstance(row, int) and col is Ellipsis:
                # reader[1, ...]
                rstart, rstop, rstep = row, row + 1, 1
                cstart, cstop, cstep = 0, nCols, 1
                is_index = True
            elif row is Ellipsis and isinstance(col, int):
                # reader[..., 1]
                rstart, rstop, rstep = 0, nRows, 1
                cstart, cstop, cstep = col, col + 1, 1
                is_index = True
            elif  isinstance(row, int) and isinstance(col, int):
                # reader[1, 1]
                rstart, rstop, rstep = row, row + 1, 1
                cstart, cstop, cstep = col, col + 1, 1
                is_index = True

            # ... slice + index
            elif isinstance(row, slice) and isinstance(col, int):
                # reader[1:2, 1]
                rstart, rstop, rstep = row.indices(nRows)
                cstart, cstop, cstep = col, col + 1, 1
            elif isinstance(row, int) and isinstance(col, slice):
                # reader[1, 1:2]
                rstart, rstop, rstep = row, row + 1, 1
                cstart, cstop, cstep = col.indices(nCols)

            try:
                if not 0 <= rstart < nRows:
                    raise IndexError("Index out of bounds")
                if not 0 <= cstart < nCols:
                    raise IndexError("Index out of bounds")
                key = (Ellipsis, slice(cstart, cstop, cstep))

            except UnboundLocalError:
                msg = "The array index is either invalid, or not implemented"
                raise TypeError(msg)

        except TypeError:
            # reader[...]
            rstart, rstop, rstep = 0, nRows, 1
            key = (Ellipsis, Ellipsis)
        records = self._items(rstart, rstop, rstep)
        result = numpy.array(list(records))[key].tolist()
        if abs(key[1].start - key[1].stop) == 1:
            return reduce(list.__add__, result) # flatten list if it's one col
        if is_index:
            return result[0]
        return result

    def head(self, n=5):
        """ This convenience function returns the first <n> records. """
        return self[:abs(n)]

    def tail(self, n=5):
        """ This convenience function returns the last <n> records. """
        return self[-abs(n):]

    def all(self):
        """ This convenience function returns all the records. """        
        return [record for record in iter(self)]

    def __contains__(self, item):
        """ This function implements membership testing and returns True if
        <idVar> contains <item>. Thus, it requires the 'idVar' parameter to
        be set. For example: reader = SavReader(savFileName, idVar="ssn")
        "987654321" in reader """
        return bool(self.get(item))

    def get(self, key, default=None, full=False):
        """ This function returns the records for which <idVar> == <key>
        if <key> in <savFileName>, else <default>. Thus, the function mimics
        dict.get, but note that dict[key] is NOT implemented. NB: Even though
        this uses a binary search, this is not very fast on large data (esp.
        the first call, and with full=True)

        Parameters:
        -key: key for which the corresponding record should be returned
        -default: value that should be returned if <key> is not found
          (default: None)
        -full: value that indicates whether *all* records for which
          <idVar> == <key> should be returned (default: False)
        For example: reader = SavReader(savFileName, idVar="ssn")
        reader.get("987654321", "social security number not found!")"""

        if not self.idVar in self.varNames:
            msg = ("SavReader object must be instantiated with an existing " +
                   "variable as an idVar argument")
            raise NameError(msg)

        #two slightly modified functions from the bisect module
        def bisect_right(a, x, lo=0, hi=None):
            if hi is None:
                hi = len(a)
            while lo < hi:
                mid = (lo + hi) // 2
                if x < a[mid][0]:
                    hi = mid  # a[mid][0], not a[mid]
                else:
                    lo = mid + 1
            return lo

        def bisect_left(a, x, lo=0, hi=None):
            if hi is None:
                hi = len(a)
            while lo < hi:
                mid = (lo + hi) // 2
                if a[mid][0] < x:
                    lo = mid + 1  # a[mid][0], not a[mid]
                else:
                    hi = mid
            return lo

        idPos = self.varNames.index(self.idVar)
        if not hasattr(self, "isSorted"):
            self.isSorted = True
            if self.varTypes[self.idVar] == 0:
                if not isinstance(key, (int, float)):
                    return default
                self.recordz = ((record[idPos], i) for i,
                                record in enumerate(iter(self)))
            else:
                if not isinstance(key, basestring):
                    return default
                self.recordz = ((record[idPos].rstrip(), i) for i,
                                record in enumerate(iter(self)))
            self.recordz = sorted(self.recordz)
        insertLPos = bisect_left(self.recordz, key)
        insertRPos = bisect_right(self.recordz, key)
        if full:
            result = [self[record[1]] for record in
                      self.recordz[insertLPos: insertRPos]]
        else:
            if insertLPos == insertRPos:
                return default
            result = self[self.recordz[insertLPos][1]]
        if result:
            return result
        return default

    def getSavFileInfo(self):
        """ This function reads and returns some basic information of the open
        spss data file."""
        return (self.numVars, self.nCases, self.varNames, self.varTypes,
                self.formats, self.varLabels, self.valueLabels)

    def memoize(f):
        """Memoization decorator
        http://code.activestate.com/recipes/577219-minimalistic-memoization/"""
        cache = {}
        MAXCACHE = 10**4

        def memf(*x):
            if x not in cache:
                cache[x] = f(*x)
            if len(cache) > MAXCACHE:
                cache.popitem()
            return cache[x]
        return memf

    @memoize
    def spss2strDate(self, spssDateValue, fmt, recodeSysmisTo):
        """This function converts internal SPSS dates (number of seconds
        since midnight, Oct 14, 1582 (the beginning of the Gregorian calendar))
        to a human-readable format"""
        try:
            if not hasattr(self, "gregorianEpoch"):
                self.gregorianEpoch = datetime.datetime(1582, 10, 14, 0, 0, 0)
            theDate = (self.gregorianEpoch +
                       datetime.timedelta(seconds=spssDateValue))
            return datetime.datetime.strftime(theDate, fmt)
        except OverflowError:
            return recodeSysmisTo
        except TypeError:
            return recodeSysmisTo
        except ValueError:
            return recodeSysmisTo

    def getFileReport(self, savFileName, varNames, varTypes,
                      formats, nCases):
        """ This function prints a report about basic file characteristics """
        bytes = os.path.getsize(savFileName)
        kb = float(bytes) / 2**10
        mb = float(bytes) / 2**20
        (fileSize, label) = (mb, "MB") if mb > 1 else (kb, "kB")
        systemString = self.systemString
        spssVersion = ".".join(map(str, self.spssVersion))
        lang, cp = locale.getlocale()
        intEnc = "Utf-8/Unicode" if self.ioUtf8 else "Codepage (%s)" % cp
        varlist = []
        line = "  %%0%sd. %%s (%%s - %%s)" % len(str(len(varNames) + 1))
        for cnt, varName in enumerate(varNames):
            lbl = "string" if varTypes[varName] > 0 else "numerical"
            format_ = formats[varName]
            varlist.append(line % (cnt + 1, varName, format_, lbl))
        info = {"savFileName": savFileName,
                "fileSize": fileSize,
                "label": label,
                "nCases": nCases,
                "nCols": len(varNames),
                "nValues": nCases * len(varNames),
                "spssVersion": "%s (%s)" % (systemString, spssVersion),
                "ioLocale": self.ioLocale,
                "ioUtf8": intEnc,
                "fileEncoding": self.fileEncoding,
                "fileCodePage": self.fileCodePage,
                "isCompatible": "Yes" if self.isCompatibleEncoding() else "No",
                "local_language": lang,
                "local_encoding": cp,
                "varlist": os.linesep.join(varlist),
                "sep": os.linesep,
                "asterisks": 70 * "*"}
        report = ("%(asterisks)s%(sep)s" +
                  "*File %(savFileName)r (%(fileSize)3.2f %(label)s) has " +
                  "%(nCols)s columns (variables) and %(nCases)s rows " +
                  "(%(nValues)s values)%(sep)s" +
                  "*The file was created with SPSS version: %(spssVersion)s%" +
                  "(sep)s" +
                  "*The interface locale is: %(ioLocale)r%(sep)s" +
                  "*The interface mode is: %(ioUtf8)s%(sep)s" +
                  "*The file encoding is: %(fileEncoding)r (Code page: " +
                  "%(fileCodePage)s)%(sep)s" +
                  "*File encoding and the interface encoding are compatible:" +
                  " %(isCompatible)s%(sep)s" +
                  "*Your computer's locale is: %(local_language)r (Code " +
                  "page: %(local_encoding)s)%(sep)s" +
                  "*The file contains the following variables:%(sep)s" +
                  "%(varlist)s%(sep)s%(asterisks)s%(sep)s")
        return report % info

    def getHeader(self, selectVars):
        """This function returns the variable names, or a selection thereof
        (as specified as a list using the selectVars parameter), as a list."""
        if selectVars is None:
            header = self.varNames
        elif isinstance(selectVars, (list, tuple)):
            diff = set(selectVars).difference(set(self.varNames))
            if diff:
                msg = "Variable names misspecified (%r)" % ", ".join(diff)
                raise NameError(msg)
            varPos = [varNames.index(v) for v in self.varNames
                      if v in selectVars]
            self.selector = operator.itemgetter(*varPos)
            header = self.selector(self.varNames)
            header = [header] if not isinstance(header, tuple) else header
        else:
            msg = ("Variable names list misspecified. Must be 'None' or a " +
                   "list or tuple of existing variables")
            raise TypeError(msg)
        return header
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.