Source

smtpErrorAnalysis / smtpErrorAnalysis / findBadAddresses.py

Diff from to

File smtpErrorAnalysis/findBadAddresses.py

                     'REMOTE-MTA', 
                     'LAST-ATTEMPT-DATE',
                     'WILL-RETRY-UNTIL'] 
+# Maps header names found in a 'STYLE 2' bounce message to the output
+# column names they populate. Keys must be upper case because
+# pre_process_key_values() upper-cases every header name before
+# pre_process_headers_for_style2() looks it up here.
+# NOTE(review): 'TEMPKEY' looks like a placeholder - confirm which
+# header is meant to supply 'DIAGNOSTIC-CODE'.
+STYLE2_MAPPINGS = {'TEMPKEY': 'DIAGNOSTIC-CODE',
+                   'X-FAILED-RECIPIENTS': 'FINAL-RECIPIENT'}
+class EmailType:
+    '''
+    Integer constants naming the styles of bounce message:
+    STYLE1 (0), STYLE2 (1) and UNKNOWN (2).
+    '''
+    STYLE1, STYLE2, UNKNOWN = range(3)
+class HeaderValue(object):
+    '''
+    Represents a 'header/value' pair in an
+    email etc
+
+    `lst` is a two element sequence: the header name (n=0)
+    followed by the header value (n=1).
+    '''
+    def __init__(self, lst):
+        self.headerName = lst[0]
+        self.value = lst[1]
+
+    def __repr__(self):
+        # Without this, pprint.pformat() of a list of HeaderValue
+        # objects (as done when reporting a duplicated header) shows
+        # only object addresses rather than the offending headers.
+        return 'HeaderValue([%r, %r])' % (self.headerName, self.value)
+
+
+
 class FindBadAddExcptn(Exception):
     '''Base class for errors in this script.'''
     def __init__(self, value):
     '''
     string = string.replace("\r","")
     string = string.replace("\n","")
-    string = string.strip()
+    vng = string.strip()
     return string
 
 def find_email(instr):
     Given the text of a SMTP 'bounce message' writes a CSV row 
     to match the headers in the global variable HDR_OUTPUT_COLS.
 
-    It does this by finding the 'message/delivery-status' part of 
+    It does this for two styles of 'bounce message'.
+
+    STYLE 1
+    =======
+
+    'STYLE1' is a multi-part mime message.
+
+    Parsing is done by finding the 'message/delivery-status' part of 
     the entire email and parsing the headers.
 
     A 'message/delivery-status' part of a 'bounce email' looks a 
         Remote-MTA: dns; smtp.e.web
         Diagnostic-Code: smtp; 550 <john.smith@e.web>, Recipient unknown
 
+    STYLE 2
+    =======
+
+    'STYLE2' is not a multi-part MIME message. It consists of:
+      
+       * Email headers
+       * Text detailing nature of the problem encountered
+       * A copy of the original email message as sent
+
+
+    It's not possible to obtain information as rich from 'STYLE 2' emails
+    as from 'STYLE 1' emails, as they are significantly less informative.
+    Nevertheless it is possible to extract a useful subset of the
+    information derived from 'STYLE 1' emails.
+
+    The earlier part of such a message looks a little like this ::
+
+        Return-Path: <>
+        X-Original-To: someone@c.d.web
+        Delivered-To: someone@c.d.web
+        Received: from alpha.g.h.web (beta.g.h.web [101.101.101.101])
+                by c.d.web (Postfix) with ESMTP id 010101010
+                for <someone@c.d.web>; Tue,  8 May 2012 21:00:38 -0700 (PDT)
+        Received: from unknown (HELO alpha.i.j.web) ([102.102.102.102])
+          by gamma.i.j.web with ESMTP; 09 May 2012 16:00:24 +1200
+        Received: from mailuser by alpha.i.j.web with LOCAL (Exim 4.69)
+                id 9ABc2f-111111-AA
+                for someone@c.d.web; Wed, 09 May 2012 16:00:24 +1200
+        X-Failed-Recipients: John.Smith@e.web
+        Auto-Submitted: auto-replied
+        From: Mail Delivery System <Mailer-Daemon@i.j.web>
+        To: someone@c.d.web
+        Subject: Mail delivery failed: returning message to sender
+        Message-Id: <E1SRy4e-11111b-DW@alpha.i.j.web>
+        Date: Wed, 09 May 2012 16:00:24 +1200
+
+        This message was created automatically by mail delivery software.
+
+        A message that you sent could not be delivered to one or more of its
+        recipients. This is a permanent error. The following address(es) failed:
+
+          save to inbox
+            generated by John.Smith@e.web
+            mailbox is full: retry timeout exceeded
+
+        ------ This is a copy of the message, including all the headers. ------
+
+
+
+
     NB: All sorts of assumptions are made about the structure of the 
     bounce message which seem to hold true for a large sample I have 
     used in testing but it seems likely that somewhere there are 'bounce
     if file_name == '''1336519817.V811I75c5d67M552469.diezel''':
         print "Debug here"
 
+    em_style = EmailType.UNKNOWN
     em_file = file(path_em_file)
     em_msg = email.message_from_string(em_file.read())
     '''
     email_parts = em_msg.get_payload()
     bln_got_notification = False
     bln_got_dlv_report = False
+
     if em_msg.is_multipart():
+        em_style = EmailType.STYLE1
         for email_part in email_parts:
             if email_part.has_key('Content-Description'):
                 if email_part['Content-Description'] == 'Notification':
                         email_part_dlv_report = email_part
                         bln_got_dlv_report = True  
     elif em_msg.has_key('X-Failed-Recipients'):
-        '''
-        TODO:There is another class of SMTP bounceback message which is not
-        multipart and which could be processed here but I'm not going to 
-        try to do that at the moment
-        '''
-        print "1" * 50
-        print "File %s is potentially a unsupported format [d]" % file_name
-        for hdr_kv in em_msg.items():
-            hdr_name = hdr_kv[0].upper()
-            hdr_val = hdr_kv[1]
-            hdr_val = hdr_val.replace("\r","")
-            hdr_val = hdr_val.replace("\n","")
-            print "%s : %s" % (hdr_name, hdr_val)
-        '''
-        for i in em_msg.items():
-            print "%s -> %s" % (i[0],i[1])
-        '''
-        print "2" * 50
+        em_style = EmailType.STYLE2
+        email_style2 = em_msg
+#        '''
+#        TODO:There is another class of SMTP bounceback message which is not
+#        multipart and which could be processed here but I'm not going to 
+#        try to do that at the moment
+#        '''
+#        print "1" * 50
+#        print "File %s is potentially a unsupported format [d]" % file_name
+#        for hdr_kv in em_msg.items():
+#            hdr_name = hdr_kv[0].upper()
+#            hdr_val = hdr_kv[1]
+#            hdr_val = hdr_val.replace("\r","")
+#            hdr_val = hdr_val.replace("\n","")
+#            print "%s : %s" % (hdr_name, hdr_val)
+#        dic_test = None
+#        '''
+#        for i in em_msg.items():
+#            print "%s -> %s" % (i[0],i[1])
+#        '''
+#        print "2" * 50
     else:
         print "File %s is not a supported format [a]" % file_name
         '''
     that's been found. At some point we might also pull some stuff out 
     'Notification'
     '''
-    if bln_got_dlv_report == True:  
-        #Convert the generator of email.message.Message objects returned
-        #by .walk() to a list of email.message.Message
-        try:
-            lst_email_part_dlv_report = convert_gen_to_list(email_part_dlv_report.walk()) 
-        except AttributeError:
-            print "File %s is not a supported format [b]" % file_name
+    if em_style == EmailType.STYLE1
+        if bln_got_dlv_report == True:  
+            #Convert the generator of email.message.Message objects returned
+            #by .walk() to a list of email.message.Message
+            try:
+                lst_email_part_dlv_report = convert_gen_to_list(email_part_dlv_report.walk()) 
+            except AttributeError:
+                print "File %s is not a supported format [b]" % file_name
+            else:
+                '''
+                The effect of 'walking' the email.message.Message that is 
+                email_part_dlv_report is three other email.message.Message
+                objects which correspond to the three blocks of headers seen
+                above. That is :
+
+                n=0 email.message.Message:
+                    Content-Description: Delivery report
+                    Content-Type: message/delivery-status
+
+                n=1 email.message.Message:
+                    Reporting-MTA: dns; a.b.web              
+                    X-Postfix-Queue-ID: 808F17F8080
+                    X-Postfix-Sender: rfc822; someone@c.d.web
+                    Arrival-Date: Tue,  8 May 2012 16:30:12 -0700 (PDT)
+
+                n=2 email.message.Message:
+                    Final-Recipient: rfc822; john.smith@e.web
+                    Original-Recipient: rfc822;john.smith@e.web
+                    Action: failed
+                    Status: 5.0.0
+                    Remote-MTA: dns; smtp.e.web
+                    Diagnostic-Code: smtp; 550 <john.smith@e.web>, Recipient unknown
+
+                For our current purposes we are only interested in the content of the
+                n=2 email.message.Message and so the others are ignored
+                '''
+                lst_hdrs_kv_pairs = lst_email_part_dlv_report[2].items()
+                lst_hd_val = pre_process_key_values(lst_hdrs_kv_pairs)
+
+                #Populate the dic_header_vals dictionary using the header
+                #names as keys and the header values of element values
+                dic_header_vals = populate_header_val_dict(em_style, lst_hd_val)
+
+                #Add a couple of non-header derived values and write the row
+                common_final_process_and_write(em_style, lst_hdrs_kv_pairs, file_name, csv_dict_wrtr)
         else:
-            '''
-            The effect of 'walking' the email.message.Message that is 
-            email_part_dlv_report is three other email.message.Message
-            objects which correspond to the three blocks of headers seen
-            above. That is :
+            print "File %s is not a supported format [c]" % file_name
+    elif em_style == EmailType.STYLE2
+        lst_hdrs_kv_pairs = email_style2.items()
 
-            n=0 email.message.Message:
-                Content-Description: Delivery report
-                Content-Type: message/delivery-status
+        #Add a couple of non-header derived values and write the row
+        common_final_process_and_write(dic_header_vals, file_name, csv_dict_wrtr)
+    else:
+        print "File %s is not a supported format [d]" % file_name
 
-            n=1 email.message.Message:
-                Reporting-MTA: dns; a.b.web              
-                X-Postfix-Queue-ID: 808F17F8080
-                X-Postfix-Sender: rfc822; someone@c.d.web
-                Arrival-Date: Tue,  8 May 2012 16:30:12 -0700 (PDT)
+def common_final_process_and_write(em_style, lst_hdrs_kv_pairs, file_name, csv_dict_wrtr):
+    '''
+    Builds one CSV row from the header (name, value) tuples in
+    `lst_hdrs_kv_pairs` and writes it using `csv_dict_wrtr`.
+
+    The tuples are passed through pre_process_key_values() and the
+    result is turned into a dictionary by populate_header_val_dict(),
+    which is also given `em_style`. Two values which are not taken
+    directly from a header are then added to the dictionary:
+
+    * 'SOURCE-FILENAME' : `file_name`
+    * 'HUM-READ-EMAIL-ADDR' : the result of remove_rfc_notation() on
+      the 'FINAL-RECIPIENT' value
+    '''
+    dic_row = populate_header_val_dict(
+            em_style, pre_process_key_values(lst_hdrs_kv_pairs))
+    final_recipient = dic_row['FINAL-RECIPIENT']
+    dic_row['SOURCE-FILENAME'] = file_name
+    dic_row['HUM-READ-EMAIL-ADDR'] = remove_rfc_notation(final_recipient)
+    #Write the dictionary as a CSV row
+    csv_dict_wrtr.writerow(dic_row)
 
-            n=2 email.message.Message:
-                Final-Recipient: rfc822; john.smith@e.web
-                Original-Recipient: rfc822;john.smith@e.web
-                Action: failed
-                Status: 5.0.0
-                Remote-MTA: dns; smtp.e.web
-                Diagnostic-Code: smtp; 550 <john.smith@e.web>, Recipient unknown
+def pre_process_key_values(lst_kv):
+    '''
+    Takes a list of tuples, `lst_kv`, and:
+    
+    * Uppercases the first element of each tuple
+    * Strips LF/CR from second element of each tuple
+    
+    Returns a list of HeaderValue objects, one per input
+    tuple and in the same order as the input.
+    '''

-            For our current purposes we are only interested in the content of the
-            n=2 email.message.Message and so the others are ignored
-            '''
-            lst_hdrs_kv_pairs = lst_email_part_dlv_report[2].items()
+    lst_kv_out = []
+    for hdr_kv in lst_kv:
+        hdr_name = hdr_kv[0].upper()
+        #Header values can be folded over several lines so remove
+        #the line endings to leave a single line value
+        hdr_val = hdr_kv[1].replace("\r", "").replace("\n", "")
+        lst_kv_out.append(HeaderValue([hdr_name, hdr_val]))
+    return lst_kv_out
 
-            #Populate the dic_header_vals dictionary using the header
-            #names as keys and the header values of element values
-            dic_header_vals = populate_header_val_dict(lst_hdrs_kv_pairs)
-            #Add a couple of non-header derived values
-            dic_header_vals['SOURCE-FILENAME'] = file_name
-            dic_header_vals['HUM-READ-EMAIL-ADDR'] = \
-                    remove_rfc_notation(dic_header_vals['FINAL-RECIPIENT'])
-            #Write the dictionary as a CSV row
-            csv_dict_wrtr.writerow(dic_header_vals)
-    else:
-        print "File %s is not a supported format [c]" % file_name
+def pre_process_headers_for_style2(lst_hv):
+    '''
+    Takes a list of HeaderValue objects, `lst_hv` 
+    and creates another list of HeaderValue objects 
+    with modified membership set and content.

+    In summary we're dropping some elements of the
+    input list and for those we don't drop we're 
+    modifying the 'headerName' property of the resulting
+    HeaderValue instance.
+    
+    Each element of the input, `lst_hv`, is tested 
+    to see if its 'headerName' value exists as a key of 
+    the constant dictionary `STYLE2_MAPPINGS`. If it does then
+    a modified form of that HeaderValue object is added to the 
+    output list of HeaderValue objects.

-def populate_header_val_dict(lst_kv):
+    The output HeaderValue object has a 'headerName' property 
+    corresponding to the value of the found element in `STYLE2_MAPPINGS`
+    and a 'value' property corresponding to the 'value' element
+    of the input HeaderValue object.
+    
+    '''
+    lst_hv_out = []
+    for hv in lst_hv:
+        if hv.headerName in STYLE2_MAPPINGS:
+            #list.append() takes exactly one argument so the mapped
+            #name and the original value are wrapped in a new
+            #HeaderValue (the input object is left unmodified)
+            lst_hv_out.append(
+                HeaderValue([STYLE2_MAPPINGS[hv.headerName], hv.value]))
+    return lst_hv_out
+
+def populate_header_val_dict(em_style, lst_hv):
     '''
     Populate dictionary with lst of tuples. Using upper-cased n=0 
     elem of each tuple as key and n=1 elem (after stripping lined 
     twice
     '''
     dic_header_vals = {}
-    for hdr_kv in lst_kv:
-        hdr_name = hdr_kv[0].upper()
-        hdr_val = hdr_kv[1]
-        hdr_val = hdr_val.replace("\r","")
-        hdr_val = hdr_val.replace("\n","")
-        
-        if dic_header_vals.has_key(hdr_name):
+
+    if em_style == EmailType.STYLE2:
+        lst_hv = pre_process_headers_for_style2(lst_hv)
+
+    for hv in lst_hv:
+        if dic_header_vals.has_key(hv.headerName):
             raise FindBadAddExcptn(
-                    ERR3 % pprint.pformat(lst_kv))
+                    ERR3 % pprint.pformat(lst_hv))
         else:
-            dic_header_vals[hdr_name] = hdr_val 
+            dic_header_vals[hv.headerName] = hv.value 
     return dic_header_vals
 
 def convert_gen_to_list(gen):