1. AJ R
  2. savReaderWriter

Source

savReaderWriter / savReaderWriter / doc_tests / test_SavWriter_utf8_mode.txt

##############################################################################
## Write a file in utf8 mode aka unicode mode (ioUtf8=True)
##############################################################################

>>> import os
>>> import tempfile
>>> from savReaderWriter import *

## Sample data: one [language, greeting] pair per record. Most greetings are
## non-ASCII (Arabic, Bengali, Georgian, Thai, ...) so that the test checks
## whether such text is handled correctly when the file is written with
## 'ioUtf8=True'.
## Source of the phrases: http://www.omniglot.com/language/phrases/hello.htm
>>> greetings = [
...     ['Arabic', u'\u0627\u0644\u0633\u0644\u0627\u0645\u0020\u0639\u0644\u064a\u0643\u0645'],
...     ['Assamese', u'\u09a8\u09ae\u09b8\u09cd\u0995\u09be\u09f0'],
...     ['Bengali', u'\u0986\u09b8\u09b8\u09be\u09b2\u09be\u09ae\u09c1\u0986\u09b2\u09be\u0987\u0995\u09c1\u09ae'],
...     ['English', u'Greetings and salutations'],
...     ['Georgian', u'\u10d2\u10d0\u10db\u10d0\u10e0\u10ef\u10dd\u10d1\u10d0'],
...     ['Kazakh', u'\u0421\u04d9\u043b\u0435\u043c\u0435\u0442\u0441\u0456\u0437 \u0431\u0435'],
...     ['Russian', u'\u0417\u0434\u0440\u0430\u0432\u0441\u0442\u0432\u0443\u0439\u0442\u0435'],
...     ['Spanish', u'\xa1Hola!'],
...     ['Swiss German', u'Gr\xfcezi'],
...     ['Thai', u'\u0e2a\u0e27\u0e31\u0e2a\u0e14\u0e35'],
...     ['Walloon', u'Bondjo\xfb'],
...     ]
>>> savFileName = os.path.join(tempfile.gettempdir(), "greetings.sav")
>>> varNames = [u'Bondjo\xfb', 'greeting']
>>> varTypes = {u'Bondjo\xfb': 20, 'greeting': 50}
>>> valueLabels = {u'Bondjo\xfb': {'Thai': u'\u0e2a\u0e27\u0e31\u0e2a\u0e14\u0e35'}}
>>> missingValues = {u'Bondjo\xfb': {'values': u'\xa1Hola!'}}
>>> varLabels = {'greeting': u'\u0627\u0644\u0633\u0644\u0627\u0645\u0020\u0639\u0644\u064a\u0643\u0645'}
>>> with SavWriter(savFileName, varNames, varTypes, valueLabels, varLabels,
...                missingValues=missingValues, ioUtf8=True) as sav:
...     sav.writerows(greetings)


# Read the file back in. Known nuisance: the values of the second column come
# back with trailing null bytes, which this test strips with rstrip("\x00").
# BUG: the null bytes are supposed to be stripped or replaced with spaces!
>>> records_expected = \
... [[u'Arabic',
...   u'\u0627\u0644\u0633\u0644\u0627\u0645 \u0639\u0644\u064a\u0643\u0645'],
...  [u'Assamese', u'\u09a8\u09ae\u09b8\u09cd\u0995\u09be\u09f0'],
...  [u'Bengali',
...   u'\u0986\u09b8\u09b8\u09be\u09b2\u09be\u09ae\u09c1\u0986\u09b2\u09be\u0987\u0995\u09c1\u09ae'],
...  [u'English', u'Greetings and salutations'],
...  [u'Georgian', u'\u10d2\u10d0\u10db\u10d0\u10e0\u10ef\u10dd\u10d1\u10d0'],
...  [u'Kazakh',
...   u'\u0421\u04d9\u043b\u0435\u043c\u0435\u0442\u0441\u0456\u0437 \u0431\u0435'],
...  [u'Russian',
...   u'\u0417\u0434\u0440\u0430\u0432\u0441\u0442\u0432\u0443\u0439\u0442\u0435'],
...  [u'Spanish', u'\xa1Hola!'],
...  [u'Swiss German', u'Gr\xfcezi'],
...  [u'Thai', u'\u0e2a\u0e27\u0e31\u0e2a\u0e14\u0e35'],
...  [u'Walloon', u'Bondjo\xfb']]
>>> with SavReader(savFileName, ioUtf8=True)as sav:   # doctest: +ELLIPSIS
...     records_got = [[line[0].rstrip(), line[1].rstrip("\x00")] for line in sav]
...     records_expected == records_got
True

# clean up
>>> try:
...     os.remove(savFileName)
... except:
...     pass