Source

smtpErrorAnalysis / smtpErrorAnalysis / findBadAddresses.py

Full commit
Richard Shea 9c5509f 





Richard Shea a030f1e 
















Richard Shea 9c5509f 






Richard Shea a030f1e 
Richard Shea 9c5509f 

Richard Shea 188aa61 
Richard Shea 9c5509f 









Richard Shea d1e4174 














Richard Shea 9c5509f 















Richard Shea d1e4174 
Richard Shea 9c5509f 















































Richard Shea 188aa61 
Richard Shea a030f1e 
Richard Shea 9c5509f 



Richard Shea d1e4174 







Richard Shea 9c5509f 




















Richard Shea d1e4174 

















































Richard Shea 9c5509f 







Richard Shea a030f1e 

Richard Shea 188aa61 



Richard Shea d1e4174 
Richard Shea 9c5509f 

Richard Shea 188aa61 












Richard Shea d1e4174 
Richard Shea 188aa61 
Richard Shea d1e4174 
Richard Shea 188aa61 














Richard Shea d1e4174 




















Richard Shea 9c5509f 
Richard Shea 188aa61 




















Richard Shea d1e4174 












































Richard Shea 9c5509f 
Richard Shea d1e4174 


Richard Shea 188aa61 
Richard Shea d1e4174 



Richard Shea 188aa61 
Richard Shea d1e4174 









Richard Shea 188aa61 
Richard Shea d1e4174 






Richard Shea 188aa61 
Richard Shea d1e4174 







Richard Shea 188aa61 
Richard Shea d1e4174 




Richard Shea 9c5509f 
Richard Shea d1e4174 









Richard Shea 9c5509f 
Richard Shea d1e4174 












Richard Shea 188aa61 








Richard Shea d1e4174 





Richard Shea 188aa61 
Richard Shea d1e4174 
Richard Shea 188aa61 
Richard Shea d1e4174 
Richard Shea 188aa61 











Richard Shea 9c5509f 










Richard Shea a030f1e 







































Richard Shea 9c5509f 




Richard Shea a030f1e 

Richard Shea 9c5509f 


Richard Shea a030f1e 
Richard Shea 9c5509f 






Richard Shea a030f1e 



Richard Shea 9c5509f 




Richard Shea a030f1e 
Richard Shea 188aa61 
Richard Shea 9c5509f 
Richard Shea a030f1e 

Richard Shea 9c5509f 




  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
'''
Allows a directory of email messages to be parsed for 'bounce messages'
and for those 'bounce messages' to be parsed for details which will 
allow the problems to be analysed.

Particular focus on emails bounced due to sender having used an invalid
address::

    Usage: findBadAddresses.py [options]

    findBadAddresses.py is used to parse a set of files  which represent the
    'inbox' of an email account  and consider those email messages which are
    'bounceback' emails sent by SMTP servers who have found it impossible to
    deliver emails sent by the owner of the 'inbox'.   Command line options
    specify the location of the 'inbox'and where output should be written to.

    Options:
      -h, --help            show this help message and exit
      -i INBOX, --inbox=INBOX
                            Location of INBOX
      -o PATH, --outpath=PATH
                            PATH to output csv file
      -v, --verbose         Show each file processed

'''
import os
import email
import csv
import re
import pprint
from optparse import OptionParser
# Messages carried by FindBadAddExcptn when it is raised
ERR1 = "Found zero email addresses so don't know what to do"
ERR2 = "Found more than one email address so don't know what to do [%s]"
ERR3 = "Found duplicate headers so don't know what to do [%s]"

# Column names of the CSV output, in the order in which they are written
HDR_OUTPUT_COLS = [
    'HUM-READ-EMAIL-ADDR',
    'ACTION',
    'STATUS',
    'DIAGNOSTIC-CODE',
    'FINAL-RECIPIENT',
    'ORIGINAL-RECIPIENT',
    'SOURCE-FILENAME',
    'REMOTE-MTA',
    'LAST-ATTEMPT-DATE',
    'WILL-RETRY-UNTIL',
]

# For 'STYLE2' bounce messages : header name -> name of the output column
# which the value of that header is written to
STYLE2_MAPPINGS = {
    'TEMPKEY': 'DIAGNOSTIC-CODE',
    'X-Failed-Recipients': 'FINAL-RECIPIENT',
}
class EmailType:
    '''
    Integer constants identifying the style of a 'bounce message'
    '''
    STYLE1 = 0
    STYLE2 = 1
    UNKNOWN = 2
class HeaderValue(object):
    '''
    Represents a 'header/value' pair in an
    email etc

    `lst` is a two element sequence : the header name followed
    by the header value.
    '''
    def __init__(self, lst):
        self.headerName = lst[0]
        self.value = lst[1]

    def __repr__(self):
        #Instances are pretty-printed into error messages so they
        #need a representation which shows their content
        return "HeaderValue([%r, %r])" % (self.headerName, self.value)



class FindBadAddExcptn(Exception):
    '''
    Exception raised by this script for the error conditions
    it detects itself. The message is available as ``value``.
    '''
    def __init__(self, value):
        Exception.__init__(self, value)
        self.value = value

    def __str__(self):
        #The string form is the repr() of the message
        return "%r" % (self.value,)


def strip_line_feeds(string):
    '''
    Return the input string with all CR and LF
    characters removed.

    No other whitespace is touched; in particular leading and
    trailing spaces are preserved.
    '''
    return string.replace("\r", "").replace("\n", "")

def find_email(instr):
    '''
    Given a string searches for all email addresses contained
    within the string. We assume:

    * At least email address will be found
    * All addresses found will be identical

    If this is so the email address found will be returned.
    If this is not so errors are raised

    '''
    #Following regex found at : http://stackp.online.fr/?p=19
    email_pattern = re.compile('([\w\-\.]+@(\w[\w\-]+\.)+[\w\-]+)')
    results = []
    #Put all email addresses found into a list. The resulting list
    #should contain one or more identical email addresses
    for match in email_pattern.findall(instr):
        results.append(match[0])

    if len(results) == 0: 
        raise FindBadAddExcptn(ERR1)
    elif len(results) > 1:
        for email_address in results[1:]:
            if email_address != results[0]:  
                raise FindBadAddExcptn( ERR2 % pprint.pformat(results))

    return results[0]


def remove_rfc_notation(email_to_be_cleaned):
    '''
    Given a string which contains an email address in one of the two
    following formats

        * ``a@foo.bar``
        * ``rfc822;a@foo.bar`` (optionally with whitespace after the ``;``)

    this function will return ``a@foo.bar``
    '''
    l_em_to_be_clnd = email_to_be_cleaned.split(';')
    if len(l_em_to_be_clnd) == 1:
        #No ';' present so there is no address-type prefix to drop
        address = l_em_to_be_clnd[0]
    else:
        address = l_em_to_be_clnd[1]
    #The address is often preceded by a space, e.g. 'rfc822; a@foo.bar'
    return address.strip()

def parse_email_for_dlv_stat_info(file_name, path_em_file,
                                    csv_dict_wrtr, options):
    '''
    Given the location of a file holding a SMTP 'bounce message' writes
    a CSV row to match the headers in the global variable HDR_OUTPUT_COLS.

    Arguments :

    * `file_name` - name of the file, used in messages and written to
      the 'SOURCE-FILENAME' column
    * `path_em_file` - path from which the message is read
    * `csv_dict_wrtr` - object whose ``writerow`` method receives the row
    * `options` - parsed command line options, only ``verbose`` is read

    Files which are not in a supported format are reported on stdout
    and no row is written for them.

    There are two styles of 'bounce message' it does this for.

    STYLE 1
    =======

    'STYLE1' is a multi-part mime message.

    Parsing is done by finding the 'message/delivery-status' part of
    the entire email and parsing the headers.

    An 'message/delivery-status' part of a 'bounce email' looks a
    little like this ::


        Content-Description: Delivery report
        Content-Type: message/delivery-status

        Reporting-MTA: dns; a.b.web
        X-Postfix-Queue-ID: 808F17F8080
        X-Postfix-Sender: rfc822; someone@c.d.web
        Arrival-Date: Tue,  8 May 2012 16:30:12 -0700 (PDT)

        Final-Recipient: rfc822; john.smith@e.web
        Original-Recipient: rfc822;john.smith@e.web
        Action: failed
        Status: 5.0.0
        Remote-MTA: dns; smtp.e.web
        Diagnostic-Code: smtp; 550 <john.smith@e.web>, Recipient unknown

    STYLE 2
    =======

    'STYLE2' is not multi-part mime message. It consists of :

       * Email headers
       * Text detailing nature of the problem encountered
       * A copy of the original email message as sent


    It's not possible to obtain such rich information from 'STYLE 2' emails
    as it is from 'STYLE 1' as it's a significantly less informative email
    nevertheless it is possible to extract a useful subset of the information
    derived from 'STYLE 1' emails.

    The earlier part of such a message looks a little like this ::

        Return-Path: <>
        X-Original-To: someone@c.d.web
        Delivered-To: someone@c.d.web
        Received: from alpha.g.h.web (beta.g.h.web [101.101.101.101])
                by c.d.web (Postfix) with ESMTP id 010101010
                for <someone@c.d.web>; Tue,  8 May 2012 21:00:38 -0700 (PDT)
        Received: from unknown (HELO alpha.i.j.web) ([102.102.102.102])
          by gamma.i.j.web with ESMTP; 09 May 2012 16:00:24 +1200
        Received: from mailuser by alpha.i.j.web with LOCAL (Exim 4.69)
                id 9ABc2f-111111-AA
                for someone@c.d.web; Wed, 09 May 2012 16:00:24 +1200
        X-Failed-Recipients: John.Smith@e.web
        Auto-Submitted: auto-replied
        From: Mail Delivery System <Mailer-Daemon@i.j.web>
        To: someone@c.d.web
        Subject: Mail delivery failed: returning message to sender
        Message-Id: <E1SRy4e-11111b-DW@alpha.i.j.web>
        Date: Wed, 09 May 2012 16:00:24 +1200

        This message was created automatically by mail delivery software.

        A message that you sent could not be delivered to one or more of its
        recipients. This is a permanent error. The following address(es) failed:

          save to inbox
            generated by John.Smith@e.web
            mailbox is full: retry timeout exceeded

        ------ This is a copy of the message, including all the headers. ------




    NB: All sorts of assumptions are made about the structure of the
    bounce message which seem to hold true for a large sample I have
    used in testing but it seems likely that somewhere there are 'bounce
    messages' which follow different conventions. In particular I suspect
    that were the original email message to be something other than a two
    part multipart email message there might be problems

    '''
    if options.verbose:
        print("About to process : %s" % file_name)

    #Read the whole message; the 'with' ensures the file is closed again
    with open(path_em_file) as em_file:
        em_msg = email.message_from_string(em_file.read())

    #Decide which style of bounce message, if any, we are looking at
    em_style = EmailType.UNKNOWN
    email_part_dlv_report = None
    if em_msg.is_multipart():
        em_style = EmailType.STYLE1
        email_part_dlv_report = _find_dlv_report_part(em_msg.get_payload())
    elif 'X-Failed-Recipients' in em_msg:
        em_style = EmailType.STYLE2
    else:
        print("File %s is not a supported format [a]" % file_name)

    if em_style == EmailType.STYLE1:
        #Currently we only parse the 'Delivery report' part so that is
        #the only part whose presence is checked.
        if email_part_dlv_report is None:
            print("File %s is not a supported format [c]" % file_name)
            return

        #Convert the generator of email.message.Message objects returned
        #by .walk() to a list of email.message.Message
        try:
            lst_email_part_dlv_report = list(email_part_dlv_report.walk())
        except AttributeError:
            print("File %s is not a supported format [b]" % file_name)
            return

        #The effect of 'walking' the email.message.Message that is
        #email_part_dlv_report is three other email.message.Message
        #objects which correspond to the three blocks of headers seen
        #in the STYLE 1 example above. That is :
        #
        #   n=0 the 'Content-Description'/'Content-Type' headers
        #   n=1 the 'Reporting-MTA' ... 'Arrival-Date' headers
        #   n=2 the 'Final-Recipient' ... 'Diagnostic-Code' headers
        #
        #For our current purposes we are only interested in the content
        #of the n=2 email.message.Message and so the others are ignored.
        if len(lst_email_part_dlv_report) < 3:
            #No per-recipient block of headers is present
            print("File %s is not a supported format [b]" % file_name)
            return

        lst_hdrs_kv_pairs = lst_email_part_dlv_report[2].items()
        common_final_process_and_write(em_style, lst_hdrs_kv_pairs,
                                        file_name, csv_dict_wrtr)
    elif em_style == EmailType.STYLE2:
        #For this style the headers of the message itself are used
        common_final_process_and_write(em_style, em_msg.items(),
                                        file_name, csv_dict_wrtr)
    else:
        print("File %s is not a supported format [d]" % file_name)


def _find_dlv_report_part(email_parts):
    '''
    Return the part of a multi-part message which holds the delivery
    report or None if there is no such part.

    A part qualifies if its 'Content-Description' is 'Delivery report'.
    While no part has been chosen, a part also qualifies if its
    'Content-Type' is 'message/delivery-status'. A later part with the
    'Delivery report' description replaces one chosen earlier.
    '''
    dlv_report = None
    for email_part in email_parts:
        if email_part.get('Content-Description') == 'Delivery report':
            dlv_report = email_part
        elif dlv_report is None:
            #Ignore any parameters which follow the content type itself
            content_type = email_part.get('Content-Type', '').split(";")[0]
            if content_type == 'message/delivery-status':
                dlv_report = email_part
    return dlv_report

def common_final_process_and_write(em_style, lst_hdrs_kv_pairs, file_name, csv_dict_wrtr):
    '''
    Turns the header name/value pairs in `lst_hdrs_kv_pairs` into a
    dictionary, adds the values which are not taken directly from a
    header and passes the dictionary to ``csv_dict_wrtr.writerow``.

    The values added are :

    * 'SOURCE-FILENAME' - the value of `file_name`
    * 'HUM-READ-EMAIL-ADDR' - the 'FINAL-RECIPIENT' value after it
      has been passed through remove_rfc_notation
    '''
    #Header names become the keys and header values the values
    row = populate_header_val_dict(
            em_style, pre_process_key_values(lst_hdrs_kv_pairs))

    row['SOURCE-FILENAME'] = file_name
    final_recipient = row['FINAL-RECIPIENT']
    row['HUM-READ-EMAIL-ADDR'] = remove_rfc_notation(final_recipient)

    #Write the dictionary as a CSV row
    csv_dict_wrtr.writerow(row)

def pre_process_key_values(lst_kv):
    '''
    Takes a list of (header name, header value) tuples, `lst_kv`,
    and returns a list of HeaderValue objects, one per tuple, in
    the same order. For each tuple :

    * The first element is upper-cased to give the 'headerName'
    * The second element has LF/CR removed to give the 'value'
    '''
    lst_kv_out = []
    for hdr_kv in lst_kv:
        hdr_name = hdr_kv[0].upper()
        hdr_val = hdr_kv[1].replace("\r", "").replace("\n", "")
        lst_kv_out.append(HeaderValue([hdr_name, hdr_val]))
    return lst_kv_out

def pre_process_headers_for_style2(lst_hv):
    '''
    Takes a list of HeaderValue objects, `lst_hv`
    and creates another list of HeaderValue objects
    with modified membership set and content.

    In summary we're dropping some elements of the
    input list and for those we don't drop we're
    modifying the 'headerName' property of the resulting
    HeaderValue instance.

    Each element of the input, `lst_hv` is tested
    to see if its 'headerName' value exists as a key of
    the constant dictionary `STYLE2_MAPPINGS`. If it does then
    a modified form of that HeaderValue object is added to the
    output list of HeaderValue objects.

    The comparison of names ignores case as the 'headerName' values
    received here have been upper-cased whereas the keys of
    `STYLE2_MAPPINGS` follow the casing used in the email itself.

    The output HeaderValue object has a 'headerName' property
    corresponding to the value of the found element in `STYLE2_MAPPINGS`
    and a 'value' property corresponding to the 'value' element
    of the input HeaderValue object.

    '''
    #Upper-case the mapping keys once so lookups are case-insensitive
    dic_mappings = dict((hdr_name.upper(), col_name)
                        for hdr_name, col_name in STYLE2_MAPPINGS.items())

    lst_hv_out = []
    for hv in lst_hv:
        hdr_name = hv.headerName.upper()
        if hdr_name in dic_mappings:
            lst_hv_out.append(HeaderValue([dic_mappings[hdr_name], hv.value]))
    return lst_hv_out

def populate_header_val_dict(em_style, lst_hv):
    '''
    Populate a dictionary from a list of HeaderValue objects, `lst_hv`,
    using the 'headerName' of each object as key and its 'value'
    as the value.

    When `em_style` is EmailType.STYLE2 the list is first passed
    through pre_process_headers_for_style2.

    Throw a FindBadAddExcptn if the list would generate the same key
    twice
    '''
    if em_style == EmailType.STYLE2:
        lst_hv = pre_process_headers_for_style2(lst_hv)

    dic_header_vals = {}
    for hv in lst_hv:
        if hv.headerName in dic_header_vals:
            #Can't tell which of the duplicated values is the wanted one
            raise FindBadAddExcptn(
                    ERR3 % pprint.pformat(lst_hv))
        dic_header_vals[hv.headerName] = hv.value
    return dic_header_vals

def convert_gen_to_list(gen):
    '''
    Returns a new list holding every element yielded by
    the generator (or other iterable) `gen`, in order
    '''
    return list(gen)

def build_ignore_list():
    '''
    Supplies the names of the files which subsequent processing
    should ignore. The list is hard-coded and is currently empty.

    Not of any practical use at present but kept so that the
    'ignore me' structure which is in place continues to work
    '''
    return []

def parse_args():
    '''
    Parses command line arguments using OptionParser.
    Applies validation rules to arguments and then, if OK
    returns them in a 'dictionary like' object ``options``

    Validation rules :

    * Both the inbox and the output path must be supplied, if either
      is missing the help text is shown and the script exits with
      status -1
    * The inbox location must exist
    * The directory which will hold the output file must exist. An
      output path without a directory part refers to the current
      directory

    '''
    #Imported here as this is the only place which needs it
    import sys

    desc = ("%prog is used to parse a set of files \n"
            "which represent the 'inbox' of an email account \n"
            "and consider those email messages which are 'bounceback'\n"
            "emails sent by SMTP servers who have found it impossible\n"
            "to deliver emails sent by the owner of the 'inbox'.\n"
            "\n\n"
            "Command line options specify the location of the 'inbox'"
            "and where output should be written to.")

    usage = "Usage: %prog [options]"

    parser = OptionParser(description=desc, usage=usage)
    parser.add_option(  "-i", "--inbox", action="store",  dest="inbox",
                        metavar="INBOX", help="Location of INBOX")
    parser.add_option(  "-o", "--outpath", action="store", dest="outpath",
                        metavar="PATH", help="PATH to output csv file")
    parser.add_option(  "-v", "--verbose", action="store_true",
                        dest="verbose", help="Show each file processed")

    (options, _args) = parser.parse_args()

    #Both options are required, the checks below can't cope with None
    if options.inbox is None or options.outpath is None:
        parser.print_help()
        sys.exit(-1)

    if not os.path.exists(options.inbox):
        parser.error('inbox location does not exist')

    #dirname() is '' for a bare file name, which means 'current directory'
    out_dir = os.path.dirname(options.outpath) or os.curdir
    if not os.path.exists(out_dir):
        parser.error('path to ouput location does not exist')

    return options

def main():
    '''
    The main() function

    Parses the command line, writes the CSV header row to the output
    file and then processes each file found in the inbox in turn.

    '''

    options = parse_args()

    #'wb' is the mode the Python 2 csv module requires for output files.
    #The 'with' ensures the output is flushed and closed on the way out
    with open(options.outpath, 'wb') as out_file:
        #Create a csv.DictWriter to write output to
        csv_dict_wrtr = csv.DictWriter(out_file,
                                        HDR_OUTPUT_COLS,
                                        restval='N/A',
                                        dialect='excel')

        #Write the initial headers
        csv_dict_wrtr.writerow(dict(zip(HDR_OUTPUT_COLS, HDR_OUTPUT_COLS)))

        lst_files_to_ignore = build_ignore_list()

        #Process each file in turn
        for in_file_name in os.listdir(options.inbox):
            if in_file_name in lst_files_to_ignore:
                continue

            in_file_path = os.path.join(options.inbox, in_file_name)
            if not os.path.isfile(in_file_path):
                #Sub-directories etc can't be read as an email message
                continue

            parse_email_for_dlv_stat_info(  in_file_name,
                                            in_file_path,
                                            csv_dict_wrtr,
                                            options)

#Run main() only when this file is executed as a script, not when
#it is imported as a module
if __name__ == "__main__":
    main()