Commits

Richard Shea committed 188aa61

Comments (0)

Files changed (1)

smtpErrorAnalysis/findBadAddresses.py

 from optparse import OptionParser
 ERR1 = "Found zero email addresses so don't know what to do" 
 ERR2 = "Found more than one email address so don't know what to do [%s]"
+ERR3 = "Found duplicate headers so don't know what to do [%s]"
 HDR_OUTPUT_COLS = [ 'HUM-READ-EMAIL-ADDR',
                     'ACTION', 
                     'STATUS', 
     else:
         return l_em_to_be_clnd[1]
 
def parse_email_for_dlv_stat_info(file_name, path_em_file,
                                    csv_dict_wrtr, options):
    '''
    Parse one SMTP 'bounce message' file and write the headers of its
    delivery report as a single CSV row.

    file_name     -- name of the message file; used in the printed
                     messages and stored in the row as 'SOURCE-FILENAME'
    path_em_file  -- path the message text is read from
    csv_dict_wrtr -- object whose writerow() is given the row dictionary
    options       -- object whose 'verbose' attribute switches on a
                     progress message

    Only multipart messages which contain a delivery report part are
    turned into a row. For anything else a 'not a supported format'
    message is printed and nothing is written. Returns None.

    Raises KeyError if the delivery report has no Final-Recipient header
    and lets any exception from populate_header_val_dict (duplicate
    headers) propagate.
    '''
    if options.verbose:
        print("About to process : %s" % file_name)

    # Read the whole message inside a 'with' so the file handle is closed
    # again before any parsing is done
    with open(path_em_file) as em_file:
        em_msg = email.message_from_string(em_file.read())

    # The part of the message we want is the delivery report. It is
    # recognised by a content-description of 'Delivery report' or, while
    # no report has been found yet, by a content-type of
    # 'message/delivery-status'. Parts with a content-description of
    # 'Notification' are not used at the moment.
    email_part_dlv_report = None
    if em_msg.is_multipart():
        for email_part in em_msg.get_payload():
            # Indexing a message with a header it does not have gives
            # None, so no separate presence test is needed here
            if email_part['Content-Description'] == 'Delivery report':
                email_part_dlv_report = email_part

            if email_part_dlv_report is None:
                content_type = email_part['Content-Type']
                if (content_type is not None and
                        content_type.split(";")[0] ==
                        'message/delivery-status'):
                    email_part_dlv_report = email_part
    elif 'X-Failed-Recipients' in em_msg:
        # TODO:There is another class of SMTP bounceback message which is
        # not multipart and which could be processed here but that is not
        # attempted at the moment. The headers are dumped so the message
        # can be inspected by hand.
        print("1" * 50)
        print("File %s is potentially a unsupported format [d]" % file_name)
        for hdr_name, hdr_val in em_msg.items():
            hdr_val = hdr_val.replace("\r", "").replace("\n", "")
            print("%s : %s" % (hdr_name.upper(), hdr_val))
        print("2" * 50)
    else:
        print("File %s is not a supported format [a]" % file_name)

    if email_part_dlv_report is None:
        print("File %s is not a supported format [c]" % file_name)
        return

    # Convert the generator of email.message.Message objects returned
    # by .walk() to a list of email.message.Message
    try:
        lst_email_part_dlv_report = convert_gen_to_list(
                email_part_dlv_report.walk())
    except AttributeError:
        print("File %s is not a supported format [b]" % file_name)
        return

    # The effect of 'walking' the email.message.Message that is
    # email_part_dlv_report is three email.message.Message objects
    # which correspond to three blocks of headers. That is :
    #
    # n=0 email.message.Message:
    #     Content-Description: Delivery report
    #     Content-Type: message/delivery-status
    #
    # n=1 email.message.Message:
    #     Reporting-MTA: dns; a.b.web
    #     X-Postfix-Queue-ID: 808F17F8080
    #     X-Postfix-Sender: rfc822; someone@c.d.web
    #     Arrival-Date: Tue,  8 May 2012 16:30:12 -0700 (PDT)
    #
    # n=2 email.message.Message:
    #     Final-Recipient: rfc822; john.smith@e.web
    #     Original-Recipient: rfc822;john.smith@e.web
    #     Action: failed
    #     Status: 5.0.0
    #     Remote-MTA: dns; smtp.e.web
    #     Diagnostic-Code: smtp; 550 <john.smith@e.web>, Recipient unknown
    #
    # For our current purposes we are only interested in the content of
    # the n=2 email.message.Message and so the others are ignored
    if len(lst_email_part_dlv_report) < 3:
        # A report without the n=2 block has nothing we can use; say so
        # rather than letting an IndexError end the whole run
        print("File %s is not a supported format [e]" % file_name)
        return
    lst_hdrs_kv_pairs = lst_email_part_dlv_report[2].items()

    # Populate the dic_header_vals dictionary using the header
    # names as keys and the header values as element values
    dic_header_vals = populate_header_val_dict(lst_hdrs_kv_pairs)
    # Add a couple of non-header derived values
    dic_header_vals['SOURCE-FILENAME'] = file_name
    dic_header_vals['HUM-READ-EMAIL-ADDR'] = \
            remove_rfc_notation(dic_header_vals['FINAL-RECIPIENT'])
    # Write the dictionary as a CSV row
    csv_dict_wrtr.writerow(dic_header_vals)
 
 
def populate_header_val_dict(lst_kv):
    '''
    Build a dictionary from a list of (header name, header value) tuples.

    The upper-cased header name is used as the key and the header value,
    with every carriage return and line feed removed, as the value.

    lst_kv -- list of two element tuples, n=0 elem being the header name
              and n=1 elem the header value

    Returns the populated dictionary (empty if lst_kv is empty).

    Raises FindBadAddExcptn, with the ERR3 message listing the whole
    input, if the input list would generate the same key twice.
    '''
    dic_header_vals = {}
    for hdr_name, hdr_val in lst_kv:
        hdr_name = hdr_name.upper()
        # Folded headers carry embedded line breaks; drop them so the
        # value stays on one line
        hdr_val = hdr_val.replace("\r", "").replace("\n", "")

        # 'in' rather than dict.has_key(), which no longer exists in
        # Python 3
        if hdr_name in dic_header_vals:
            raise FindBadAddExcptn(
                    ERR3 % pprint.pformat(lst_kv))
        dic_header_vals[hdr_name] = hdr_val
    return dic_header_vals
+
def convert_gen_to_list(gen):
    '''
    Converts a generator (or any other iterable) to a list

    gen -- the generator to be consumed

    Returns a new list holding every element the generator yields, in
    the order they were yielded.
    '''
    # The built-in does exactly what the hand-written append loop did
    return list(gen)
+
 def build_ignore_list():
     '''
     Returns a hard-coded list of file names which will be ignored
             pass
         else:
             in_file_path = "%s/%s" % (options.inbox, in_file_name)
-            parse_email_for_del_stat_part(  in_file_name, 
+            parse_email_for_dlv_stat_info(  in_file_name, 
                                             in_file_path, 
                                             csv_dict_wrtr,
                                             options)