# Source
#
# django-spotnet / spotnet / tests / parsing.py

try:
    from django.utils import unittest
except ImportError:
    try:
        import unittest2 as unittest
    except ImportError:
        import unittest

from spotnet import settings
from spotnet.post import RawPost, InvalidPost
from spotnet.models import Post


class ParsingTest(unittest.TestCase):
    """Base class for the parsing tests.

    It defines no tests itself; it only offers helpers that turn
    headers and content into a RawPost object.
    """

    def construct(self, headers, content, xml=None):
        """Construct a RawPost object from headers and content.

        This automatically adds all required headers:
        Lines, From and Date.
        To override them, pass a value for these in the headers argument.
        To omit these, pass them to the headers argument
        with a value of None.

        Arguments:
        headers -- mapping of header name to value; it is copied, so the
                   caller's mapping is left untouched
        content -- the body, either a single string or a sequence of lines
        xml     -- optional mapping of element name to value, rendered
                   into an X-XML header (each value wrapped in CDATA)

        The body (and the empty line separating it from the headers) is
        only emitted when the first content line is not empty.

        Returns whatever parse_to_post returns for the generated lines.
        Raises ValueError if xml is given while headers already
        contains an X-XML entry.
        """
        # work on a copy, so the defaults added below
        # never leak into the dict of the caller
        headers = dict(headers)

        # accept both byte and unicode strings as a single line body
        # (on python 3 these are the same type)
        if isinstance(content, (str, type(u''))):
            content = [content]
        else:
            content = list(content)

        # setdefault leaves an explicit None in place,
        # which is what marks a header as 'omit this one'
        headers.setdefault('Lines', len(content))
        headers.setdefault('From', 'Testuser')
        headers.setdefault('Date', 'Thu, 25 Sep 2003 10:49:41 -0300')

        if xml:
            if 'X-XML' in headers:
                raise ValueError(
                    "pass either an 'X-XML' header or the xml argument, not both"
                )
            posting = ''.join(
                '<%(key)s><![CDATA[%(val)s]]></%(key)s>' % dict(key=k, val=v)
                for k, v in xml.items()
            )
            headers['X-XML'] = '<Spotnet><Posting>%s</Posting></Spotnet>' % posting

        # headers with a value of None are left out on purpose
        raw_lines = [
            '%s: %s' % (k, v)
            for k, v in headers.items()
            if v is not None
        ]
        # an empty line separates the headers from the body
        if content and len(content[0]) > 0:
            raw_lines.append('')
            raw_lines.extend(content)
        return self.parse_to_post(raw_lines)

    def parse_to_post(self, content):
        """Wrap a list of raw message lines in a RawPost object.

        The postnumber and messageid passed along are generated here.
        """
        # we create a random postnumber and a messageid
        # derived from it, so that they are unlikely to violate
        # database unique constraints
        # NOTE(review): randrange does not guarantee uniqueness,
        # a collision between two posts is improbable but possible
        import random
        postnumber = random.randrange(0, 10000000)
        messageid = '<testmessage-%s-%s@test.com>' % (postnumber, Post.objects.count())
        return RawPost([None, postnumber, messageid, content])


class BasicParsingTest(ParsingTest):
    """Tests for RawPost parsing of headers, the X-XML header and the body."""

    def _rawpost_from_lines(self, message_lines):
        # build a RawPost straight from raw message lines,
        # using a fixed postnumber and messageid
        return RawPost((None, 123, '<blaat@free.pt>', message_lines))

    def test_parse_standard(self):
        # plain headers, a two line body and two xml fields
        plain_headers = dict(
            Hello='value1',
            World='another value',
        )
        body_lines = [
            "The message content",
            "in two short lines",
        ]
        xml_fields = dict(
            More='info',
            And='rubbish',
        )
        parsed = self.construct(plain_headers, body_lines, xml_fields)

        self.assertEqual(parsed.headers['Hello'], u'value1')
        self.assertEqual(parsed.headers['World'], u'another value')
        self.assertEqual(parsed.get_content(), u'The message content\nin two short lines')
        self.assertEqual(parsed.extra['More'], u'info')
        self.assertEqual(parsed.extra['And'], u'rubbish')

        # the content is fetched a second time, to catch
        # file like content getters that are exhausted after one read
        self.assertEqual(parsed.get_content(), u'The message content\nin two short lines')

    def test_parse_header_continuation(self):
        # the second line is indented and has no header name of its own
        rawpost = self._rawpost_from_lines([
            'TestHeader: Some valueeeeee that is',
            '      way tooooooooooooo looooooooooooong',
        ])
        found = rawpost.headers.get('TestHeader')
        self.assertEqual(found, 'Some valueeeeee that is\n way tooooooooooooo looooooooooooong')

    def test_parse_xml_header_multiline(self):
        # the xml value spans an indented line and a second X-XML header
        rawpost = self._rawpost_from_lines([
            'X-XML: <Spotnet><Posting><Key><![CDATA[the long valuueeee',
            '      continuing here ',
            'X-XML: and all the way into this other header]]></Key></Posting></Spotnet>',
        ])
        found = rawpost.extra['Key']
        self.assertEqual(found, u'the long valuueeee continuing here and all the way into this other header')

    def test_parse_datetime(self):
        # each pair is a Date header and the iso format expected for it:
        # a named zone, a numeric offset and no zone at all
        cases = (
            ('11 Jun 2011 08:50:22 GMT', '2011-06-11T08:50:22+00:00'),
            ('Thu, 25 Sep 2003 10:49:41 -0300', '2003-09-25T10:49:41-03:00'),
            ('Thu, 25 Sep 2003 10:49:41', '2003-09-25T10:49:41'),
        )
        for date_header, expected_iso in cases:
            parsed = self.construct(dict(Date=date_header), '')
            self.assertEqual(parsed.posted.isoformat(), expected_iso)

    def test_parse_header_with_intermediate_lines(self):
        # quick test
        message_lines = [
            'Something: blaat',
            'Test: A',
            'B',  # the intermediate line
            'Blaat: lkxjljsdf',
            'Test: C',
            'Another: doebi',
            '',  # start of body, the rest should be ignored by the tested method
            'Test: im the body but i look line a header',
        ]
        rawpost = self._rawpost_from_lines(message_lines)
        joined = rawpost.headers.join_with_intermediate('Test')
        self.assertEqual(joined, 'A\nB\nC')

        # reparse and verify
        # NOTE(review): the method name reads 'resparse', confirm that
        # this is how the headers object spells it
        rawpost.headers.resparse_with_intermediate()

        # headers that follow the intermediate line must end up as
        # headers; the initial parsing method would treat the
        # intermediate line as the start of the body instead
        self.assertEqual(rawpost.headers['Another'], 'doebi')

        # whatever follows the empty line must end up as the body
        self.assertEqual(rawpost.get_content(), 'Test: im the body but i look line a header')

    def test_parse_xml_header_with_intermediate_lines(self):
        message_lines = [
            'Something: blaat',
            'X-XML: <Spotnet><Posting><Key><![CDATA[A',
            'B',  # the intermediate line
            'Blaat: lkxjljsdf',
            'X-XML: C]]></Key></Posting></Spotnet>',
            'Another: doebi',
            '',  # start of body, the rest should be ignored by the tested method
            'X-XML: im the body but i look line a header',
        ]
        rawpost = self._rawpost_from_lines(message_lines)
        self.assertEqual(rawpost.extra['Key'], 'A\nB\nC')

        # headers that follow the intermediate line must end up as
        # headers; the initial parsing method would treat the
        # intermediate line as the start of the body instead
        self.assertEqual(rawpost.headers['Another'], 'doebi')

        # whatever follows the empty line must end up as the body
        self.assertEqual(rawpost.get_content(), 'X-XML: im the body but i look line a header')

    def test_parse_xml_category_with_subcategories(self):
        # a Category element that holds text and two Sub elements
        rawpost = self._rawpost_from_lines([
            'Something: blaat',
            'X-XML: <Spotnet><Posting><Category>01<Sub>01a03</Sub><Sub>01b03</Sub></Category></Posting></Spotnet>',
            '',
            'content',
        ])

        # the values as found in the xml
        self.assertEqual(rawpost.extra['Category'], '01')
        self.assertEqual(rawpost.extra['Subcategories'], ['01a03', '01b03'])

        # the values as exposed by the post attributes
        self.assertEqual(rawpost.category, 1)
        self.assertEqual(rawpost.subcategories, [u'01a03', u'01b03'])


class EncodingParsingTest(ParsingTest):
    """Tests for the decoding of html entities and encoded header values."""

    def _assert_entities_decoded(self, raw):
        # verify the decoded subject and description on the raw post,
        # on the Post created from it and on that Post after a
        # round trip through the database
        expected_title = u'dokter van een pati\xebnt'
        expected_description = u'doe \xe9\xe9n ding'

        # check if the html entities are properly decoded
        self.assertEqual(raw.subject, expected_title)
        self.assertEqual(raw.description, expected_description)

        # check if the resulting post instance has the correct strings
        created = Post.from_raw(raw)
        self.assertEqual(created.title, expected_title)
        self.assertEqual(created.description, expected_description)

        # check if the correct strings are saved to the database
        created.save()
        stored = Post.objects.get(id=created.id)
        self.assertEqual(stored.title, expected_title)
        self.assertEqual(stored.description, expected_description)

    def test_decoding_html_entities(self):
        parsed = self.construct({}, 'whatever')

        # named, decimal and hex notation of the same character,
        # each both as string and as unicode
        entity_forms = (
            '&amp;', u'&amp;',
            '&#38;', u'&#38;',
            '&#x26;', u'&#x26;',
        )
        for entity in entity_forms:
            self.assertEqual(parsed.decode_entities(entity), u'&')

    def test_parsing_html_entity_from_header(self):
        # the entities are in the Subject header and in the body
        subject_header = dict(
            Subject='dokter van een pati&#235;nt',
        )
        rawpost = self.construct(subject_header, 'doe &#233;&#233;n ding')
        self._assert_entities_decoded(rawpost)

    def test_parsing_html_entity_from_xml(self):
        # the entities are in the xml fields, the body is a placeholder
        xml_fields = dict(
            Title='dokter van een pati&#235;nt',
            Description='doe &#233;&#233;n ding',
        )
        rawpost = self.construct({}, 'Not used', xml_fields)
        self._assert_entities_decoded(rawpost)

    def test_parsing_different_encoding_header(self):
        # an encoded Subject header must come out decoded
        subject_header = dict(
            Subject='=?ISO-8859-7?B?1PHp4e303Pb16+vv8iDM4ezc6u/y?=',
        )
        rawpost = self.construct(subject_header, 'blaat')
        decoded = u'\u03a4\u03c1\u03b9\u03b1\u03bd\u03c4\u03ac\u03c6\u03c5\u03bb\u03bb\u03bf\u03c2 \u039c\u03b1\u03bc\u03ac\u03ba\u03bf\u03c2'
        self.assertEqual(rawpost.subject, decoded)

    def test_parsing_different_encoding_header_from_xml(self):
        xml_fields = dict(
            Title='=?ISO-8859-7?B?1PHp4e303Pb16+vv8iDM4ezc6u/y?=',
            Description='blaat',
        )
        rawpost = self.construct({}, 'Not used', xml_fields)

        # the same value inside the xml must come out untouched;
        # see test_decoding_different_encodings (not defined in
        # this class) for why it is not decoded like a header
        self.assertEqual(rawpost.subject, u'=?ISO-8859-7?B?1PHp4e303Pb16+vv8iDM4ezc6u/y?=')