Source

django-spotnet / spotnet / connection.py

Full commit
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
import nntplib
import socket
import errno
import os
import re
import sys
import settings
import zlib
from cStringIO import StringIO
from django.db import transaction, IntegrityError
from django.db.utils import DatabaseError
from models import Post, PostMarker
from post import RawPost, InvalidPost
from nzb import decode_nzb, DecodeNzbError

# TODO:
# maybe decode headers using a function (new in python 3?)
# http://docs.python.org/py3k/library/nntplib.html?highlight=nntp#nntplib.decode_header

def noop(x):
    """Default logger: accept one message argument and discard it."""
    return None


# credentials used by Connection.connect() when the corresponding
# settings.SERVER_USERNAME / settings.SERVER_PASSWORD value is None
NEWSSERVER_UNAUTH_USERNAME = None
NEWSSERVER_UNAUTH_PASSWORD = None


# matches: [title]"filename"[[ -] 12,34[ ]MB[ -]] yEnc[ (1/234)]
# with optional parts in []
# (a raw string, so the regex escapes are not read as string escapes)
TITLE_SELECTION_REGEXP = re.compile(
    r'^(.+)?".+"(( -)? \d+([,.]\d+)? ?[kKMGTPEZY][bB]( -)?)? yEnc( \(\d+/\d+\))?$'
)


class ConnectionError(Exception):
    """Error while talking to the nntp server or handling its response.

    Raised by Connection methods to wrap nntplib and socket errors, and
    by Connection.get_nzb() when the downloaded nzb cannot be decoded.
    """
    # NOTE: on Python 3 this name shadows the builtin ConnectionError
    # inside this module.


class ConnectError(ConnectionError):
    """Raised by Connection.connect() when the connection cannot be set up."""


class NotFoundError(Exception):
    """Raised by Connection.get_nzb() when the server has no such article."""


#
# We are working with two types of ids here
#
# - messageid : universally unique id, eg: '<3ae6b2dcccdf42988a85dc314370ba0123@free.pt>'
# - postnumber : server dependent id, eg: '65903' (always use strings in nntplib!)
#


class Connection(object):
    """Class to connect to the nntp server with."""

    def __init__(self, connect=True):
        self._nntp = None
        if connect:
            self.connect()

    # public methods

    def is_connected(self):
        return self._nntp is not None

    def connect(self):
        """Open a connection to the nntp server configured in settings.

        On Python versions before 3.2 the credentials are handed straight
        to the nntplib.NNTP constructor. From 3.2 on, the connection is
        opened without credentials, switched to TLS with starttls() and
        only then the login is performed.

        The connection object is stored on this instance on success.
        Raises ConnectError when nntplib or the socket layer reports an
        error at any of these steps.
        """
        # fall back to the module level defaults when no credentials are set
        if settings.SERVER_USERNAME is not None:
            username = settings.SERVER_USERNAME
        else:
            username = NEWSSERVER_UNAUTH_USERNAME
        if settings.SERVER_PASSWORD is not None:
            password = settings.SERVER_PASSWORD
        else:
            password = NEWSSERVER_UNAUTH_PASSWORD

        # connect to server
        try:
            # sys.version is a string, sys.version_info is the tuple
            # that can be compared with (3, 2)
            if sys.version_info[:2] < (3, 2):
                nntp = nntplib.NNTP(
                    host=settings.SERVER_HOST,
                    port=settings.SERVER_PORT,
                    user=username,
                    password=password,
                    readermode=settings.SERVER_READERMODE,
                    usenetrc=False,
                )
            else:
                # try to encrypt connection using ssl
                nntp = nntplib.NNTP(
                    host=settings.SERVER_HOST,
                    port=settings.SERVER_PORT,
                    readermode=settings.SERVER_READERMODE,
                    usenetrc=False,
                )
                # this 'starttls' method is introduced in python 3.2;
                # called without arguments to use the default context
                # without depending on the name of its parameter
                nntp.starttls()
                # login, now that we might be encrypted
                nntp.login(user=username, password=password)
        except (nntplib.NNTPError, socket.error) as e:
            raise ConnectError(e)
        self._nntp = nntp

    def disconnect(self):
        if self.is_connected():
            try:
                quitmsg = self._nntp.quit()
            except (EOFError, nntplib.NNTPProtocolError):
                # seems to happen for me, but we're still disconnected
                # TODO: find a way to check if we're really disconnected
                # and rethrow the exception if not
                pass
            except socket.error as e:
                if e.errno != errno.EPIPE:
                    raise
            self._nntp = None

    def update(self, logger=noop):
        """Retrieve all new posts of every group in settings.UPDATE_GROUPS.

        logger -- callable taking one message string
        """
        logger("Updating spotnet")

        def group_logger(message):
            # messages of the per-group update are indented one level
            return logger('  %s' % message)

        for groupname in settings.UPDATE_GROUPS:
            self.update_group(groupname, logger=group_logger)

    # private update methods

    def update_group(self, groupname, logger=noop, postnumber_start=None, postnumber_end=None):
        """Retrieve the new posts of one newsgroup, block by block.

        groupname -- name of the newsgroup to select on the server
        logger -- callable taking one message string
        postnumber_start -- first postnumber to request; when None it is
            the largest of: the first postnumber on the server, the
            postnumber of the newest post in the database (when it can
            be resolved) and settings.UPDATE_MINPOST (when set)
        postnumber_end -- last postnumber to request; when None it is
            the last postnumber on the server plus settings.UPDATE_EXTRA

        Raises ConnectionError when selecting the group or fetching one
        of the blocks fails.
        """
        logger("Updating group '%s'" % groupname)
        try:
            gresp = self._nntp.group(groupname)
        except (nntplib.NNTPError, socket.error) as e:
            logger("Updating group '%s' failed: %s" % (groupname, e))
            raise ConnectionError(e)

        first_on_server, last_on_server = gresp[2], gresp[3]  # first < last

        if postnumber_start is None:
            # get the postnumber to start with
            last_in_db = self.get_last_postnumber_in_db()

            start_options = [int(first_on_server)]
            if last_in_db is not None:
                start_options.append(int(last_in_db))
            if settings.UPDATE_MINPOST is not None:
                start_options.append(settings.UPDATE_MINPOST)
            postnumber_start = max(start_options)

        if postnumber_end is None:
            # get the postnumber to end with
            # we try to add some more posts than the server
            # says they have, this seems to work sometimes
            postnumber_end = int(last_on_server) + settings.UPDATE_EXTRA

        def block_logger(message):
            # messages of the per-block update are indented one level
            return logger('  %s' % message)

        bulk_count = settings.UPDATE_BULK_COUNT
        curstart = postnumber_start
        last_added = None
        while curstart < postnumber_end:
            # never request postnumbers past the requested end
            block_end = min(curstart + bulk_count, postnumber_end)
            added = self.update_group_postnumbers(
                groupname,
                curstart,
                block_end,
                logger=block_logger,
            )
            if added:
                last_added = added
            # consecutive blocks share their boundary postnumber; the
            # entry fetched twice is skipped by the existence checks
            # done when adding
            curstart += bulk_count

        # store last added post's postnumber
        # since there's not always a way to obtain it
        # from the last messageid alone
        if settings.UPDATE_LAST_STORAGE:
            self.set_last_postnumber_in_db(groupname, last_added)

    def update_group_postnumbers(self, groupname, start, end, logger=noop):
        """Fetch the overview of postnumbers [start, end] and add new entries.

        Every overview entry that is not discarded by discard_post() and
        is accepted by is_spotnet_post() is handed to add_dispose_message()
        when its subject is a dispose message, and to add_post() otherwise.

        groupname -- only used in the log messages
        start, end -- postnumbers delimiting the overview request
        logger -- callable taking one message string

        Returns the postnumber of the last entry that was actually added
        (a post or a dispose message), or None when nothing was added.
        Raises ConnectionError when the overview request fails.
        """
        logger("Updating group '%s', block [%d, %d]" % (groupname, start, end))
        try:
            xover = self._nntp.xover(str(start), str(end))
        except (nntplib.NNTPError, socket.error) as e:
            logger(
                "Error updating group '%s', block [%d, %d]: %s"
                % (groupname, start, end, e)
            )
            raise ConnectionError(e)

        def entry_logger(message):
            # messages about a single entry are indented one level
            return logger('  %s' % message)

        last_added = None
        for post in xover[1]:
            # unpack the post tuple
            postnumber, subject, poster, date, messageid, references, size, lines = post

            if not self.discard_post(messageid, subject) and \
                    self.is_spotnet_post(messageid, subject):
                # TODO: a socket.error with errno ECONNRESET raised while
                # adding could be handled by reconnecting and retrying
                # the same entry
                if self.is_dispose_message(subject):
                    added = self.add_dispose_message(
                        postnumber,
                        messageid,
                        poster.decode('utf8', 'ignore'),
                        subject.decode('utf8', 'ignore'),
                        logger=entry_logger,
                    )
                else:
                    added = self.add_post(
                        postnumber,
                        messageid,
                        logger=entry_logger,
                    )
                if added:
                    # remember it, this is what gets returned to the caller
                    last_added = postnumber

            if False:  # verbosity > 2
                if self.discard_post(messageid, subject):
                    logger(
                        "Skipped discarded post %s: %s"
                        % (postnumber, messageid)
                    )
                if not self.is_spotnet_post(messageid, subject):
                    logger(
                        "Skipped non-spotnet post %s: %s %s"
                        % (postnumber, messageid, subject)
                    )

        return last_added

    def discard_post(self, messageid, subject):
        """Tell whether a post is to be ignored because of its messageid.

        `subject` is not used by this default implementation.
        """
        # the default implementation is to discard all posts
        # where the messageid (without the '<>') part after the '@'
        # is in the settings.UPDATE_DISCARD_ENDINGS iterable.
        bare_id = messageid[1:-1]
        _, at_sign, ending = bare_id.partition('@')
        if not at_sign:
            # no '@' present: the whole id is looked up instead
            ending = bare_id
        return ending in settings.UPDATE_DISCARD_ENDINGS

    def is_spotnet_post(self, messageid, subject):
        """Tell whether an overview entry may be a real spotnet post."""
        # we limit ourselves here since not all posts provide real spotnet posts
        # (they are the files that make up the downloads themselves)
        if TITLE_SELECTION_REGEXP.match(subject) is not None:
            return False
        # this seems to conclusively indicate a download file
        if messageid.startswith('<part'):
            return False
        # this one too
        return '$' not in messageid

    def is_dispose_message(self, subject):
        """Tell whether the subject starts with the 'DISPOSE ' marker."""
        marker = 'DISPOSE '
        return subject.startswith(marker)  # and '@' in subject

    def post_exists(self, messageid):
        """Tell whether a Post with this messageid is already stored."""
        # we check existence by messageid since it is unique and
        # consistent across nntp servers
        # only the first 80 characters are used for the lookup
        # (presumably the length of the model field -- TODO confirm)
        stored = Post.objects.filter(messageid=messageid[:80])
        return stored.exists()

    def dispose_message_exists(self, messageid, personid, logger):
        """Tell whether a PostMarker for this messageid and person is stored.

        A DatabaseError whose message starts with 'invalid byte sequence
        for encoding' is logged and answered with False; any other
        DatabaseError propagates.
        """
        # we check existence by messageid since it is unique and
        # consistent across nntp servers
        try:
            markers = PostMarker.objects.filter(
                messageid=messageid[:80],
                person_id=personid[:180],
            )
            return markers.exists()
        except DatabaseError as e:
            if str(e).startswith('invalid byte sequence for encoding'):
                # same situation as in the add_post method
                logger("Skipped invalid dispose message %s: %s" % (messageid, e))
                return False
            raise

    @transaction.commit_on_success
    def add_post(self, postnumber, messageid, logger=noop):
        """Fetch the post with this messageid and store it in the database.

        postnumber -- only used in the log messages
        messageid -- id used to look up and fetch the post
        logger -- callable taking one message string

        Returns True when the post was saved. Returns False when it was
        skipped: it already exists, it could not be fetched, it is
        invalid, or the database rejected its content as an invalid byte
        sequence. Other database errors propagate.
        """
        if self.post_exists(messageid):
            if False:  # verbosity > 1
                logger(
                    "Skipped existing post %s: %s"
                    % (postnumber, messageid)
                )
            return False

        # get the full message from the nntp server
        try:
            snp = Post.from_raw(self.get_raw_post(messageid))
        except ConnectionError:
            # TODO: don't give up so easily
            return False
        except InvalidPost as e:
            logger("Skipped invalid post %s %s: %s" % (postnumber, messageid, e))
            return False

        try:
            snp.save()
        except IntegrityError as e:
            if str(e) == 'column messageid is not unique':
                # this post must already exist
                logger(
                    "Skipped existing post %s: %s"
                    % (postnumber, messageid)
                )
                return False
            raise
        except DatabaseError as e:
            if str(e).startswith('invalid byte sequence for encoding'):
                # TODO: this is due to a bug in python
                # source: http://stackoverflow.com/questions/3487377/how-can-i-check-a-python-unicode-string-to-see-that-it-actually-is-proper-unic
                # and the fact that some posts aren't textual posts
                # but encoded binary files that passed the filtering in
                # the is_spotnet_post method. We just ignore them.
                logger("Skipped invalid post %s %s: %s" % (postnumber, messageid, e))
                return False
            raise

        # only reached when saving raised nothing
        logger("Added post %s: %s" % (postnumber, messageid))
        return True

    @transaction.commit_on_success
    def add_dispose_message(self, postnumber, messageid, poster, subject, logger=noop):
        """Store a dispose message as a PostMarker with is_good=False.

        postnumber, messageid -- only used in the log messages
        poster -- stored (truncated to 180 characters) as the person id
        subject -- 'DISPOSE ' followed by the id of the disposed post,
            which is wrapped in '<>' to form the stored messageid
        logger -- callable taking one message string

        Returns True when the marker was created, False when it was
        skipped: it already exists, or the database rejected its content
        as an invalid byte sequence. Other database errors propagate.
        """
        prefix_length = len('DISPOSE ')
        dispose_messageid = '<%s>' % subject[prefix_length:]

        if self.dispose_message_exists(dispose_messageid, poster, logger):
            if False:  # verbosity > 1
                logger(
                    "Skipped existing dispose message %s: %s"
                    % (postnumber, messageid)
                )
            return False

        try:
            PostMarker.objects.create(
                messageid=dispose_messageid[:80],
                person_id=poster[:180],
                is_good=False,
            )
        except IntegrityError:
            # this marker must already exist
            logger(
                "Skipped existing dispose message %s: %s"
                % (postnumber, messageid)
            )
            return False
        except DatabaseError as e:
            if str(e).startswith('invalid byte sequence for encoding'):
                # same situation as in the add_post method
                logger("Skipped invalid dispose message %s %s: %s" % (postnumber, messageid, e))
                return False
            raise

        # only reached when creating the marker raised nothing
        logger("Added dispose message %s: %s" % (postnumber, messageid))
        return True

    def get_raw_post(self, messageid):
        """Fetch the article with this messageid, wrapped in a RawPost.

        Raises ConnectionError when nntplib or the socket layer reports
        an error while fetching.
        """
        try:
            article = self._nntp.article(messageid)
        except (nntplib.NNTPError, socket.error) as e:
            raise ConnectionError(e)
        # wrapping happens outside the try, so errors raised by RawPost
        # itself are not converted
        return RawPost(article)

    def verify_post(self, post):
        """Verify a post against settings.VERIFICATION_KEYS.

        Returns True when no keys are configured (None or empty).
        Verification with actual keys is not implemented yet and raises
        NotImplementedError.
        """
        keys = settings.VERIFICATION_KEYS
        if keys is not None and len(keys) != 0:
            raise NotImplementedError  # TODO
        return True

    # public functionality methods

    def get_nzb(self, post):
        """Retrieve the nzb for a post, returned is the nzb content.

        The bodies of all articles listed in post.nzb are fetched from
        the server into one buffer, in order, and the collected content
        is passed to decode_nzb().

        Raises NotFoundError when the server answers with status 430 for
        one of the articles; other temporary nntp errors propagate.
        Raises ConnectionError when the content cannot be decoded.
        """
        assert self.is_connected()
        zipped = StringIO()  # TODO: maybe replace this with os.tmpfile
        try:
            for messageid in post.nzb:
                self._nntp.body('<%s>' % messageid, zipped)
        except nntplib.NNTPTemporaryError as e:
            # 430 is the nntp status for a missing article; only the code
            # is compared, since the text after it is not the same on
            # every server
            if e.response.startswith('430'):
                raise NotFoundError
            raise
        content = zipped.getvalue()
        # the buffer itself is not needed anymore
        del zipped

        try:
            return decode_nzb(content)
        except DecodeNzbError as e:
            raise ConnectionError(e.msg)

    def get_comments(self, post):
        "Retrieves the comments for a post"
        assert self.is_connected()
        raise NotImplementedError  # TODO

    # internal methods

    def get_last_messageid_in_db(self):
        """Return the messageid of the most recently posted stored Post.

        Returns None when there are no posts in the database.
        """
        # note that the last posted messageid
        # generally does not have the largest postnumber
        try:
            newest = Post.objects.order_by('-posted').only('messageid')[0]
        except IndexError:
            # nothing stored yet
            return None
        return newest.messageid

    def get_last_postnumber_in_db(self):
        # to be server independent,
        # we get the last messageid
        # and then get the corresponding
        # postnumber
        messageid = self.get_last_messageid_in_db()
        if messageid is None:
            return None
        try:
            stat = self._nntp.stat(messageid)
        except (nntplib.NNTPError, socket.error) as e:
            return None
        else:
            return stat[1]

    def set_last_postnumber_in_db(self, groupname, last_added):
        """Store the last added postnumber of a group (not implemented).

        Does nothing when settings.UPDATE_LAST_STORAGE is false, and
        raises NotImplementedError when it is true.
        """
        storage_enabled = settings.UPDATE_LAST_STORAGE
        if not storage_enabled:
            return
        raise NotImplementedError