1. Richard Shea
  2. smtpErrorAnalysis

Commits

Richard Shea  committed 09b751b

A lot of changes to improve PEP8 compliance. This is a working version and another snapshot before I get rid of some old stuff which is no longer needed but which might be reviewed in future.

  • Participants
  • Parent commits 9053f35
  • Branches default

Comments (0)

Files changed (1)

File findBadAddresses.py

View file
  • Ignore whitespace
 import email
 import csv
 import sys
-'''
-f=file(fp)
-fp="./1306429999.V811I65cc573M345725.diezel"
-msg = email_message_from_string(f)
-print msg.as_string()
-print msg.get_payload()[0]
-'''
 import re
 import pprint
-class MyError(Exception):
+ERR1 = "Found zero email addresses so don't know what to do" 
+ERR2 = "Found more than one email address so don't know what to do [%s]"
+HDR_OUTPUT_COLS = [ 'HUM-READ-EMAIL-ADDR',
+                    'ACTION', 
+                    'STATUS', 
+                    'DIAGNOSTIC-CODE', 
+                    'FINAL-RECIPIENT',
+                    'ORIGINAL-RECIPIENT', 
+                    'SOURCE-FILENAME', 
+                    'REMOTE-MTA', 
+                    'LAST-ATTEMPT-DATE',
+                    'WILL-RETRY-UNTIL'] 
+class FindBadAddExcptn(Exception):
+    '''Base class for errors in this script.'''
     def __init__(self, value):
         self.value = value
     def __str__(self):
 
 
 def strip_line_feeds(string):
+    '''
+    Return the input string with CRLF
+    characters removed
+    '''
     string = string.replace("\r","")
     string = string.replace("\n","")
     string = string.strip()
     return string
 
 def strip_from_verbose_message(sin, comp_obj_verbose_msg):
+    '''
+    This is an unused function which should be refactored
+    into a more general purpose function or removed
+    '''
     match_obj = comp_obj_verbose_msg.search(sin)
     try:
-        MAILTOADDRESS = match_obj.group('MAILTOADDRESS')
+        mailto_address = match_obj.group('MAILTOADDRESS')
     except AttributeError: 
-        MAILTOADDRESS = sin
+        mailto_address = sin
 
-    return MAILTOADDRESS
-
-def parse_error_component(in_string):
-    '''
-    for line in instr:
-        print line
-    '''
-    dbg_var = -1 
+    return mailto_address
 
 def parse_error_component1(in_msg, dic_diag):
-    dbg_var = -1 
+    '''
+    This is an unused function which should be refactored
+    into a more general purpose function or removed
+    '''
     #Do stupid stuff to convert
     #a generator to a list
     list_parts = []
     return dic_diag
 
 def find_email(instr):
-    ERR1 ="Found zero email addresses so don't know what to do" 
-    ERR2 ="Found more than one email address so don't know what to do [%s]"
+    '''
+    Given a string searches for all email addresses contained
+    within the string. We assume:
+
+    * At least one email address will be found
+    * All addresses found will be identical
+
+    If this is so the email address found will be returned.
+    If this is not so errors are raised
+
+    '''
     #Following regex found at : http://stackp.online.fr/?p=19
     email_pattern = re.compile('([\w\-\.]+@(\w[\w\-]+\.)+[\w\-]+)')
-    hitCount = 0
     results = []
     #Put all email addresses found into a list. The resulting list
     #should contain one or more identical email addresses
         results.append(match[0])
 
     if len(results) == 0: 
-        raise MyError(ERR1)
+        raise FindBadAddExcptn(ERR1)
     elif len(results) > 1:
         for email_address in results[1:]:
             if email_address != results[0]:  
-                raise MyError( ERR2 % pprint.pformat(results))
+                raise FindBadAddExcptn( ERR2 % pprint.pformat(results))
 
     return results[0]
 
 
 def remove_rfc_notation(email_to_be_cleaned):
-    l = email_to_be_cleaned.split(';')
-    if len(l) == 0:
-        return l[0]
+    '''
+    Given a string which contains an email address in one of the two 
+    following formats
+
+    * a@foo.bar
+    * rfc:a@foo.bar
+
+    this function will return a@foo.bar
+    '''
+    l_em_to_be_clnd = email_to_be_cleaned.split(';')
+    if len(l_em_to_be_clnd) == 0:
+        return l_em_to_be_clnd[0]
     else:
-        return l[1]
+        return l_em_to_be_clnd[1]
 
-def process_email_headers(file_name, p, comp_obj, csv_dict_wrtr):
-    ERR1 = "Found a duplicate header when parsing error email"
+def process_email_headers(file_name, path_em_file, csv_dict_wrtr):
+    '''
+    Given the text of a SMTP 'bounce message' writes a CSV row 
+    to match the headers in the global variable HDR_OUTPUT_COLS.
+
+    It does this by finding the 'message/delivery-status' part of 
+    the entire email and parsing the headers.
+
+    A 'message/delivery-status' part of a 'bounce email' looks a 
+    little like this :
+    
+    ==START=====================================================
+
+    Content-Description: Delivery report
+    Content-Type: message/delivery-status
+
+    Reporting-MTA: dns; a.b.web              
+    X-Postfix-Queue-ID: 808F17F8080
+    X-Postfix-Sender: rfc822; someone@c.d.web
+    Arrival-Date: Tue,  8 May 2012 16:30:12 -0700 (PDT)
+
+    Final-Recipient: rfc822; john.smith@e.web
+    Original-Recipient: rfc822;john.smith@e.web
+    Action: failed
+    Status: 5.0.0
+    Remote-MTA: dns; smtp.e.web
+    Diagnostic-Code: smtp; 550 <john.smith@e.web>, Recipient unknown
+    ==END  =====================================================
+
+    NB: All sorts of assumptions are made about the structure of the 
+    bounce message which seem to hold true for a large sample I have 
+    used in testing but it seems likely that somewhere there are 'bounce
+    messages' which follow different conventions. In particular I suspect
+    that were the original email message to be something other than a two
+    part multipart email message there might be problems
+
+    '''
     print "About to process : %s" % file_name
-    f=file(p)
-    msg = email.message_from_string(f.read())
+    em_file = file(path_em_file)
+    em_msg = email.message_from_string(em_file.read())
     try:
-        msg_debug = msg.get_payload()[1]
+        #Get the second email.message.Message 
+        #from the list of email.message.Message
+        #in the email message contained in em_file
+        em_msg_dlv_status = em_msg.get_payload()[1]
     except IndexError:
         print "File %s is not a recognised format [a]" % file_name
     else:
-        #Do stupid stuff to convert
-        #a generator to a list
-        list_parts = []
+        #Convert the generator of email.message.Message objects returned
+        #by .walk() to a list of email.message.Message
+        lst_em_msg_dlv_status = []
         try:
-            for elem in msg_debug.walk():
-                list_parts.append(elem)
+            for elem in em_msg_dlv_status.walk():
+                lst_em_msg_dlv_status.append(elem)
         except AttributeError:
             print "File %s is not a recognised format [b]" % file_name
         else:
-            lstHeadersKeyValues = list_parts[2].items()
-            dic_diag = {'SOURCE-FILENAME': file_name}
-            for headerKeyValue in lstHeadersKeyValues:
-                headerName = headerKeyValue[0].upper()
-                headerValue = headerKeyValue[1]
-                headerValue = headerValue.replace("\r","")
-                headerValue = headerValue.replace("\n","")
+            #Get the message's field headers and values
+            lst_hdrs_kv_pairs = lst_em_msg_dlv_status[2].items()
+            #Populate the dic_diag dictionary using the header
+            #names as keys and the header values as element values
+            dic_diag = {}
+            for hdr_kv in lst_hdrs_kv_pairs:
+                hdr_name = hdr_kv[0].upper()
+                hdr_val = hdr_kv[1]
+                hdr_val = hdr_val.replace("\r","")
+                hdr_val = hdr_val.replace("\n","")
                 
-                if dic_diag.has_key(headerName):
-                    raise MyError(ERR1 % pprint.pformat(lstHeadersKeyValues))
+                if dic_diag.has_key(hdr_name):
+                    raise FindBadAddExcptn(ERR1 % pprint.pformat(lst_hdrs_kv_pairs))
                 else:
-                    dic_diag[headerName] = headerValue 
+                    dic_diag[hdr_name] = hdr_val 
+            #Add a couple of non-header derived values
+            dic_diag['SOURCE-FILENAME'] = file_name
             dic_diag['HUM-READ-EMAIL-ADDR'] = \
                     remove_rfc_notation(dic_diag['FINAL-RECIPIENT'])
+            #Write the dictionary as a CSV row
             csv_dict_wrtr.writerow(dic_diag)
 
 def process_file(   file_name, 
-                    p, 
+                    path_em_file, 
                     comp_obj, 
                     csv_row_wrtr, 
                     comp_obj_verbose_msg, 
                     dic_diag, 
                     use_alt_email_regex = True):
+    '''
+    This is an unused function which should be refactored
+    into a more general purpose function or removed
+    '''
     print "About to process : %s" % file_name
     pprint.pprint(dic_diag)
-    f=file(p)
-    msg = email.message_from_string(f.read())
-    err_txt = msg.get_payload()[0]
+    em_file = file(path_em_file)
+    em_msg = email.message_from_string(em_file.read())
+    err_txt = em_msg.get_payload()[0]
     print "(" * 50
-    msg_debug = msg.get_payload()[1]
+    msg_debug = em_msg.get_payload()[1]
     print "Dump of headers STARTS"
     pprint.pprint(msg_debug.items())
     print "Dump of headers STOPS"
     dic_diag = parse_error_component1(msg_debug, dic_diag)
     print "Walk of message ENDS  "
     print ")" * 50
-    # method 1: using a compile object
-    '''
-    print err_txt
-    print type(err_txt)
-    import pprint       
-    pprint.pprint(dir(err_txt))
-    '''
+    #
     good_err_msg = False
     good_email = False
     # Retrieve group(s) by name
         print "Unexpected error:", sys.exc_info()[0]
         raise
     else:
-        full_error_msg = strip_line_feeds(fullErrorMessage)
+        full_error_msg = strip_line_feeds(full_error_msg)
         good_err_msg = True 
 
     
         print "+" * 60
 
 
-def process_file_plain(p, comp_obj):
-    f=file(p)
-    msg = email.message_from_string(f.read())
-    err_txt = msg.get_payload()[0]
-    return err_txt 
+def build_ignore_list():
+    '''
+    Returns a hard-coded list of file names which will be ignored
+    in subsequent processing
 
-def build_ignore_list():
+    This is, effectively, an unused function which should be refactored
+    into a more general purpose function or removed
+    '''
     lst = []
     '''
     lst.append('1306107016.V811I65cc3caM321663.diezel')
     return lst
 
 def main():
+    '''
+    The main() function
+
+    Needs work in order that the location of email files to be parsed
+    and the location of output files may be specified via command
+    line params
+    '''
     lst_files_to_ignore = build_ignore_list() 
-    hdr_output_cols = ['HUM-READ-EMAIL-ADDR',
-                        'ACTION', 
-                        'STATUS', 
-                        'DIAGNOSTIC-CODE', 
-                        'FINAL-RECIPIENT',
-                        'ORIGINAL-RECIPIENT', 
-                        'SOURCE-FILENAME', 
-                        'REMOTE-MTA', 
-                        'LAST-ATTEMPT-DATE',
-                        'WILL-RETRY-UNTIL'] 
     path = 'C:/usr/rshea/mytemp/20110609/NZLPProblemEmails-20120510/'
-    rawstr = r"""^(?P<fullErrorMessage>[<](?P<emailAddress>.+)[>].*)"""
+    #rawstr = r"""^(?P<fullErrorMessage>[<](?P<emailAddress>.+)[>].*)"""
 
-    rw_str_verbose_msg = r"""<head>.*</head><body>.+mailto.*?>(?P<MAILTOADDRESS>.*?)<.*</body>"""
+    #rw_str_verbose_msg = r"""<head>.*</head><body>.+mailto.*?>(?P<MAILTOADDRESS>.*?)<.*</body>"""
     listing = os.listdir(path)
-    comp_obj = re.compile(rawstr,  re.MULTILINE| re.DOTALL)
-    comp_obj_verbose_msg = re.compile( rw_str_verbose_msg,  \
-                                        re.IGNORECASE|re.MULTILINE| re.DOTALL)
+    #comp_obj = re.compile(rawstr,  re.MULTILINE| re.DOTALL)
+    #comp_obj_verbose_msg = re.compile( rw_str_verbose_msg,  \
+    #                                    re.IGNORECASE|re.MULTILINE| re.DOTALL)
 
-    csv_row_wrtr = csv.writer(
-            open('NZLP-bademailaddresses-20120510.csv', 'wb'), \
-            dialect='excel')
+    #Create a csv.DictWriter to write output to
     csv_dict_wrtr = csv.DictWriter( \
             open('NZLP-bademailaddresses-headers-20120510.csv', 'wb'), \
-            hdr_output_cols, \
+            HDR_OUTPUT_COLS, \
             restval='N/A', \
             dialect='excel')
-    csv_dict_wrtr.writerow(dict(zip(hdr_output_cols,hdr_output_cols)))
+    #Write the initial headers
+    csv_dict_wrtr.writerow(dict(zip(HDR_OUTPUT_COLS, HDR_OUTPUT_COLS)))
 
-    '''
-    fileCnt = 1  
-    for infileName in listing:
-        fileCnt += 1 
-        if infileName in lst_files_to_ignore:
+    dic_diag = {}
+    for in_file_name in listing:
+        if in_file_name in lst_files_to_ignore:
             pass
         else:
-            infileFullPath = "%s/%s" % (path, infileName)
-            process_file(infileName, infileFullPath, \
-                        comp_obj, csv_row_wrtr, \
-                        comp_obj_verbose_msg, dic_diag, True)
-    '''
-
-    dic_diag = {}
-    for infileName in listing:
-        if infileName in lst_files_to_ignore:
-            pass
-        else:
-            infileFullPath = "%s/%s" % (path, infileName)
-            process_email_headers(infileName, infileFullPath, \
-                    comp_obj, csv_dict_wrtr)
+            in_file_path = "%s/%s" % (path, in_file_name)
+            process_email_headers(in_file_name, in_file_path, csv_dict_wrtr)
     pprint.pprint(dic_diag)
 
 if __name__ == "__main__":