Source

smtpErrorAnalysis / smtpErrorAnalysis / findBadAddresses.py

Richard Shea 9c5509f 





Richard Shea a030f1e 
















Richard Shea 9c5509f 






Richard Shea a030f1e 
Richard Shea 9c5509f 

Richard Shea 188aa61 
Richard Shea 9c5509f 










































































Richard Shea 188aa61 
Richard Shea a030f1e 
Richard Shea 9c5509f 

































Richard Shea a030f1e 

Richard Shea 188aa61 



Richard Shea 9c5509f 

Richard Shea 188aa61 














































Richard Shea 9c5509f 
Richard Shea 188aa61 





















Richard Shea 9c5509f 


Richard Shea 188aa61 
Richard Shea 9c5509f 
Richard Shea 188aa61 
Richard Shea 9c5509f 
Richard Shea 188aa61 





























Richard Shea 9c5509f 
Richard Shea 188aa61 
Richard Shea 9c5509f 
Richard Shea 188aa61 


Richard Shea 9c5509f 
Richard Shea 188aa61 


Richard Shea 9c5509f 

Richard Shea 188aa61 
































Richard Shea 9c5509f 










Richard Shea a030f1e 







































Richard Shea 9c5509f 




Richard Shea a030f1e 

Richard Shea 9c5509f 


Richard Shea a030f1e 
Richard Shea 9c5509f 






Richard Shea a030f1e 



Richard Shea 9c5509f 




Richard Shea a030f1e 
Richard Shea 188aa61 
Richard Shea 9c5509f 
Richard Shea a030f1e 

Richard Shea 9c5509f 




'''
Allows a directory of email messages to be parsed for 'bounce messages'
and for those 'bounce messages' to be parsed for details which will 
allow the problems to be analysed.

Particular focus on emails bounced due to sender having used an invalid
address::

    Usage: findBadAddresses.py [options]

    findBadAddresses.py is used to parse a set of files  which represent the
    'inbox' of an email account  and consider those email messages which are
    'bounceback' emails sent by SMTP servers who have found it impossible to
    deliver emails sent by the owner of the 'inbox'.   Command line options
    specify the location of the 'inbox'and where output should be written to.

    Options:
      -h, --help            show this help message and exit
      -i INBOX, --inbox=INBOX
                            Location of INBOX
      -o PATH, --outpath=PATH
                            PATH to output csv file
      -v, --verbose         Show each file processed

'''
import os
import email
import csv
import re
import pprint
from optparse import OptionParser
# Messages carried by FindBadAddExcptn. ERR2 and ERR3 take one '%s'
# argument: a pretty-printed dump of the offending values.
ERR1 = "Found zero email addresses so don't know what to do"
ERR2 = "Found more than one email address so don't know what to do [%s]"
ERR3 = "Found duplicate headers so don't know what to do [%s]"

# Column names, in output order, of the CSV file produced by this script.
# Apart from HUM-READ-EMAIL-ADDR and SOURCE-FILENAME (which are added by
# the script itself) each name is the upper-cased name of a header read
# from the delivery report of a bounce message.
HDR_OUTPUT_COLS = [
    'HUM-READ-EMAIL-ADDR',
    'ACTION',
    'STATUS',
    'DIAGNOSTIC-CODE',
    'FINAL-RECIPIENT',
    'ORIGINAL-RECIPIENT',
    'SOURCE-FILENAME',
    'REMOTE-MTA',
    'LAST-ATTEMPT-DATE',
    'WILL-RETRY-UNTIL',
]
class FindBadAddExcptn(Exception):
    '''
    Error raised by this script when its input does not have the
    shape it expects.

    The object passed to the constructor is kept in ``value`` and the
    string form of the exception is the ``repr()`` of that object.
    '''

    def __init__(self, value):
        Exception.__init__(self, value)
        self.value = value

    def __str__(self):
        return "%r" % (self.value,)


def strip_line_feeds(string):
    '''
    Return a copy of ``string`` in which every carriage return and
    line feed character has been deleted and from which any
    leading/trailing whitespace has then been trimmed
    '''
    return string.replace("\r", "").replace("\n", "").strip()

def find_email(instr):
    '''
    Given a string searches for all email addresses contained
    within the string. We assume:

    * At least one email address will be found
    * All addresses found will be identical

    If this is so the email address found will be returned.
    If this is not so a FindBadAddExcptn is raised :

    * with message ERR1 when no address at all is found
    * with message ERR2 when two different addresses are found

    NB: the pattern requires every domain label before the final one
    to be at least two characters long, so ``a@b.web`` is not
    recognised as an address

    '''
    #Following regex found at : http://stackp.online.fr/?p=19
    #Written as a raw string so that the backslashes reach the regex
    #engine without being treated as (invalid) string escapes
    email_pattern = re.compile(r'([\w\-\.]+@(\w[\w\-]+\.)+[\w\-]+)')

    #The pattern has two groups so findall() returns tuples; element
    #n=0 of each tuple is the complete address. The resulting list
    #should contain one or more identical email addresses
    results = [match[0] for match in email_pattern.findall(instr)]

    if not results:
        raise FindBadAddExcptn(ERR1)

    first_address = results[0]
    if any(address != first_address for address in results[1:]):
        raise FindBadAddExcptn(ERR2 % pprint.pformat(results))

    return first_address


def remove_rfc_notation(email_to_be_cleaned):
    '''
    Given a string which contains an email address in one of the two
    following formats

        * ``a@foo.bar``
        * ``rfc822;a@foo.bar`` (an address type, a semi-colon and then
          the address, optionally with whitespace after the semi-colon)

    this function will return ``a@foo.bar``

    When there is no semi-colon the whole string is taken to be the
    address; otherwise the text between the first and second
    semi-colon is. Surrounding whitespace is removed in both cases.
    '''
    l_em_to_be_clnd = email_to_be_cleaned.split(';')
    #split() always returns at least one element. A length of one
    #means there was no address-type prefix to remove
    if len(l_em_to_be_clnd) == 1:
        address = l_em_to_be_clnd[0]
    else:
        address = l_em_to_be_clnd[1]
    #'rfc822; a@foo.bar' leaves a leading space after the split
    return address.strip()

def _find_delivery_report_part(em_msg):
    '''
    Return the part of the multipart message ``em_msg`` which holds
    the delivery report or None if there is no such part.

    A part whose Content-Description is exactly 'Delivery report' is
    preferred (if there are several the last one is used). When no
    part is described that way the first part whose Content-Type
    (ignoring any parameters) is 'message/delivery-status' is used.
    '''
    email_part_dlv_report = None
    for email_part in em_msg.get_payload():
        #Indexing a message with a header it does not have gives None
        if email_part['Content-Description'] == 'Delivery report':
            email_part_dlv_report = email_part
        elif email_part_dlv_report is None and 'Content-Type' in email_part:
            content_type = email_part['Content-Type'].split(";")[0]
            if content_type == 'message/delivery-status':
                email_part_dlv_report = email_part
    return email_part_dlv_report


def parse_email_for_dlv_stat_info(file_name, path_em_file,
                                    csv_dict_wrtr, options):
    '''
    Given a file holding a SMTP 'bounce message' writes a CSV row
    to match the headers in the global variable HDR_OUTPUT_COLS.

    It does this by finding the 'message/delivery-status' part of
    the entire email and parsing the headers.

    An 'message/delivery-status' part of a 'bounce email' looks a
    little like this ::


        Content-Description: Delivery report
        Content-Type: message/delivery-status

        Reporting-MTA: dns; a.b.web
        X-Postfix-Queue-ID: 808F17F8080
        X-Postfix-Sender: rfc822; someone@c.d.web
        Arrival-Date: Tue,  8 May 2012 16:30:12 -0700 (PDT)

        Final-Recipient: rfc822; john.smith@e.web
        Original-Recipient: rfc822;john.smith@e.web
        Action: failed
        Status: 5.0.0
        Remote-MTA: dns; smtp.e.web
        Diagnostic-Code: smtp; 550 <john.smith@e.web>, Recipient unknown

    Arguments :

    * ``file_name`` - name of the file, used in messages and written
      to the SOURCE-FILENAME column
    * ``path_em_file`` - path from which the message is read
    * ``csv_dict_wrtr`` - object whose ``writerow()`` is given the
      dictionary of output values (a csv.DictWriter)
    * ``options`` - object whose ``verbose`` attribute says whether
      each file processed should be announced

    Files which can not be handled are reported on stdout as
    'not a supported format' followed by a code in square brackets
    and no row is written for them. A FindBadAddExcptn raised by
    populate_header_val_dict() and a KeyError for a report which has
    no Final-Recipient header are not caught here.

    NB: All sorts of assumptions are made about the structure of the
    bounce message which seem to hold true for a large sample I have
    used in testing but it seems likely that somewhere there are 'bounce
    messages' which follow different conventions. In particular I suspect
    that were the original email message to be something other than a two
    part multipart email message there might be problems

    The print calls below are written with a single parenthesised
    argument so that they give the same output whether print is a
    statement or a function.
    '''
    if options.verbose:
        print("About to process : %s" % file_name)

    #The 'with' block makes sure the file is closed again even if
    #reading it fails
    with open(path_em_file) as em_file:
        em_msg = email.message_from_string(em_file.read())

    #Currently we only parse the 'Delivery report' part of the message.
    #At some point we might also pull some stuff out of the part which
    #has a Content-Description of 'Notification'
    email_part_dlv_report = None
    if em_msg.is_multipart():
        email_part_dlv_report = _find_delivery_report_part(em_msg)
    elif 'X-Failed-Recipients' in em_msg:
        #TODO:There is another class of SMTP bounceback message which is
        #not multipart and which could be processed here but I'm not
        #going to try to do that at the moment. For now just dump the
        #headers so that the file can be inspected
        print("1" * 50)
        print("File %s is potentially a unsupported format [d]" % file_name)
        for hdr_name, hdr_val in em_msg.items():
            hdr_val = hdr_val.replace("\r", "").replace("\n", "")
            print("%s : %s" % (hdr_name.upper(), hdr_val))
        print("2" * 50)
    else:
        print("File %s is not a supported format [a]" % file_name)

    if email_part_dlv_report is None:
        print("File %s is not a supported format [c]" % file_name)
        return

    #Convert the generator of email.message.Message objects returned
    #by .walk() to a list of email.message.Message
    try:
        lst_email_part_dlv_report = list(email_part_dlv_report.walk())
    except AttributeError:
        print("File %s is not a supported format [b]" % file_name)
        return

    #The effect of 'walking' the email.message.Message that is
    #email_part_dlv_report is three other email.message.Message
    #objects which correspond to the three blocks of headers seen
    #in the docstring above. That is :
    #
    #n=0 the part itself (Content-Description, Content-Type)
    #n=1 the block which starts with Reporting-MTA
    #n=2 the block which starts with Final-Recipient
    #
    #For our current purposes we are only interested in the content
    #of the n=2 email.message.Message and so the others are ignored.
    #A report without that block can not be used
    if len(lst_email_part_dlv_report) < 3:
        print("File %s is not a supported format [e]" % file_name)
        return
    lst_hdrs_kv_pairs = lst_email_part_dlv_report[2].items()

    #Populate the dic_header_vals dictionary using the header
    #names as keys and the header values of element values
    dic_header_vals = populate_header_val_dict(lst_hdrs_kv_pairs)
    #Add a couple of non-header derived values
    dic_header_vals['SOURCE-FILENAME'] = file_name
    dic_header_vals['HUM-READ-EMAIL-ADDR'] = \
            remove_rfc_notation(dic_header_vals['FINAL-RECIPIENT'])
    #Write the dictionary as a CSV row
    csv_dict_wrtr.writerow(dic_header_vals)


def populate_header_val_dict(lst_kv):
    '''
    Populate dictionary with lst of (name, value) tuples. Using the
    upper-cased name as key and the value (after removing carriage
    return and line feed characters) as value.

    Returns the populated dictionary.

    Throw FindBadAddExcptn (message ERR3) if the input list would
    generate the same key twice. As the names are upper-cased first
    two names which differ only in case count as the same key.
    '''
    dic_header_vals = {}
    for hdr_name, hdr_val in lst_kv:
        hdr_name = hdr_name.upper()
        if hdr_name in dic_header_vals:
            raise FindBadAddExcptn(
                    ERR3 % pprint.pformat(lst_kv))
        #Folded headers have line breaks in their value; remove them
        #so that each value stays on one line of the output
        dic_header_vals[hdr_name] = \
                hdr_val.replace("\r", "").replace("\n", "")
    return dic_header_vals

def convert_gen_to_list(gen):
    '''
    Returns a new list holding, in order, every element
    produced by iterating over ``gen``
    '''
    return list(gen)

def build_ignore_list():
    '''
    Returns the list of file names which are to be left out of
    subsequent processing.

    The list is hard-coded and is currently empty. The function is
    kept so that the 'ignore me' structure stays in place should
    names ever need to be added.
    '''
    return []

def parse_args():
    '''
    Parses command line arguments using OptionParser.
    Applies validation rules to arguments and then, if OK
    returns them in a 'dictionary like' object ``options``

    The validation rules are :

    * both --inbox and --outpath must be supplied. If either is
      missing the help text is shown and the script exits with
      status -1
    * the inbox location must exist
    * the directory the output file is to be written to must exist.
      An output path with no directory part means the current
      directory

    If either of the last two rules is broken ``parser.error()``
    reports the problem and exits the script.
    '''
    desc = "%prog is used to parse a set of files \n" + \
    "which represent the 'inbox' of an email account \n" + \
    "and consider those email messages which are 'bounceback'\n" + \
    "emails sent by SMTP servers who have found it impossible\n" + \
    "to deliver emails sent by the owner of the 'inbox'.\n" + \
    "\n\n" + \
    "Command line options specify the location of the 'inbox'" + \
    "and where output should be written to."

    usage = "Usage: %prog [options]"

    parser = OptionParser(description=desc, usage=usage)
    parser.add_option(  "-i", "--inbox", action="store",  dest="inbox",
                        metavar="INBOX", help="Location of INBOX")
    parser.add_option(  "-o", "--outpath", action="store", dest="outpath",
                        metavar="PATH", help="PATH to output csv file")
    parser.add_option(  "-v", "--verbose", action="store_true",
                        dest="verbose", help="Show each file processed")

    (options, args) = parser.parse_args()

    #Both locations are needed. Checking with 'or' stops a single
    #missing option reaching the os.path calls below as None
    if (options.inbox is None) or (options.outpath is None):
        parser.print_help()
        #Raised directly rather than via exit() which is only
        #available when the site module has been loaded
        raise SystemExit(-1)

    if not os.path.exists(options.inbox):
        parser.error('inbox location does not exist')

    #dirname() of a bare file name is '' which never 'exists' so
    #treat it as the current directory
    out_dir = os.path.dirname(options.outpath) or os.curdir
    if not os.path.exists(out_dir):
        parser.error('path to ouput location does not exist')

    return options

def main():
    '''
    The main() function

    Reads the command line options, creates the output CSV file,
    writes the row of column headings to it and then hands every
    entry of the inbox directory which is not on the ignore list
    to parse_email_for_dlv_stat_info()
    '''

    options = parse_args()

    #The 'with' block makes sure that the output file is flushed and
    #closed when processing ends, including when it ends with an error.
    #The file is opened in binary mode as the Python 2 csv module expects
    with open(options.outpath, 'wb') as out_file:
        #Create a csv.DictWriter to write output to. Columns for which
        #a message supplies no value are written as 'N/A'
        csv_dict_wrtr = csv.DictWriter(
                out_file,
                HDR_OUTPUT_COLS,
                restval='N/A',
                dialect='excel')

        #Write the initial headers
        csv_dict_wrtr.writerow(dict(zip(HDR_OUTPUT_COLS, HDR_OUTPUT_COLS)))

        lst_files_to_ignore = build_ignore_list()

        #Process each file in turn
        for in_file_name in os.listdir(options.inbox):
            if in_file_name in lst_files_to_ignore:
                continue
            in_file_path = os.path.join(options.inbox, in_file_name)
            parse_email_for_dlv_stat_info(  in_file_name,
                                            in_file_path,
                                            csv_dict_wrtr,
                                            options)

#Only run main() when this file is executed as a script so that
#importing it as a module has no side effects
if __name__ == "__main__":
    main()