Commits

Richard Shea  committed 1b8f18c

Happy to call this 0.1.0 !

  • Participants
  • Parent commits 09b751b

Comments (0)

Files changed (1)

File findBadAddresses.py

+'''
+Parses a directory of email messages for 'bounce messages', then parses
+each bounce message for the details that allow the problems to be
+analysed.
+
+The particular focus is on emails that bounced because the sender used
+an invalid address.
+
+'''
 import os
 import email
 import csv
-import sys
 import re
 import pprint
 ERR1 = "Found zero email addresses so don't know what to do" 
 class FindBadAddExcptn(Exception):
     '''Base class for errors in this script.'''
     def __init__(self, value):
+        super(FindBadAddExcptn, self).__init__(value)
         self.value = value
     def __str__(self):
         return repr(self.value)
     else:
         return l_em_to_be_clnd[1]
 
-def process_email_headers(file_name, path_em_file, csv_dict_wrtr):
+def parse_email_for_del_stat_part(file_name, path_em_file, csv_dict_wrtr):
     '''
     Given the text of an SMTP 'bounce message', writes a CSV row
     to match the headers in the global variable HDR_OUTPUT_COLS.
                 hdr_val = hdr_val.replace("\n","")
                 
                 if dic_diag.has_key(hdr_name):
-                    raise FindBadAddExcptn(ERR1 % pprint.pformat(lst_hdrs_kv_pairs))
+                    raise FindBadAddExcptn(
+                            ERR1 % pprint.pformat(lst_hdrs_kv_pairs))
                 else:
                     dic_diag[hdr_name] = hdr_val 
             #Add a couple of non-header derived values
             #Write the dictionary as a CSV row
             csv_dict_wrtr.writerow(dic_diag)
 
-def process_file(   file_name, 
-                    path_em_file, 
-                    comp_obj, 
-                    csv_row_wrtr, 
-                    comp_obj_verbose_msg, 
-                    dic_diag, 
-                    use_alt_email_regex = True):
-    '''
-    This is an unused function which should be refactored
-    into a more general purpose function or removed
-    '''
-    print "About to process : %s" % file_name
-    pprint.pprint(dic_diag)
-    em_file = file(path_em_file)
-    em_msg = email.message_from_string(em_file.read())
-    err_txt = em_msg.get_payload()[0]
-    print "(" * 50
-    msg_debug = em_msg.get_payload()[1]
-    print "Dump of headers STARTS"
-    pprint.pprint(msg_debug.items())
-    print "Dump of headers STOPS"
-    print "Walk of message STARTS"
-    dic_diag = parse_error_component1(msg_debug, dic_diag)
-    print "Walk of message ENDS  "
-    print ")" * 50
-    #
-    good_err_msg = False
-    good_email = False
-    # Retrieve group(s) by name
-    match_obj = comp_obj.search(err_txt.as_string())
-    try:
-        full_error_msg = match_obj.group('fullErrorMessage')
-    #except AttributeError as (errno, strerror):
-    except AttributeError:
-        print "Error 1" 
-    except:
-        print "Unexpected error:", sys.exc_info()[0]
-        raise
-    else:
-        full_error_msg = strip_line_feeds(full_error_msg)
-        good_err_msg = True 
-
-    
-    if use_alt_email_regex == True:
-        try:
-            email_address = find_email(err_txt.as_string())
-        except AttributeError:
-            print "Error 1" 
-        except:
-            print "Unexpected error:", sys.exc_info()[0]
-            raise
-        else:
-            good_email = True
-    else:
-        try:
-            email_address = match_obj.group('emailAddress')
-        except AttributeError:
-            print "Error 1" 
-        except:
-            print "Unexpected error:", sys.exc_info()[0]
-            raise
-        else:
-            email_address = \
-                    strip_from_verbose_message(email_address, comp_obj_verbose_msg)
-            email_address = strip_line_feeds(email_address)
-            good_email = True
-
-    if good_email == True and good_err_msg:
-        print "%s -> %s" % (email_address, full_error_msg)
-        csv_row_wrtr.writerow([file_name, email_address, full_error_msg])
-    else:
-        print "+" * 60
-        print err_txt.as_string()
-        print "+" * 60
-
 
 def build_ignore_list():
     '''
     into a more general purpose function or removed
     '''
     lst = []
-    '''
-    lst.append('1306107016.V811I65cc3caM321663.diezel')
-    lst.append('1306107022.V811I65cc3e2M13969.diezel')
-    lst.append('1306107023.V811I65cc3e4M209786.diezel')
-    lst.append('1306107024.V811I65cc3e5M217596.diezel')
-    lst.append('1306139420.V811I65cc4e2M320634.diezel')
-    lst.append('1306139432.V811I65cc4efM494423.diezel')
-    lst.append('1306144832.V811I65cc4f6M753347.diezel')
-    lst.append('1306148434.V811I65cc518M29274.diezel')
-    lst.append('1306159236.V811I65cc556M88068.diezel')
-    #New Ignores
-    #Seems to have dodgy headers
-    #lst.append('1336530626.V811Ia5c41b8M190259.diezel')
-    #lst.append('1336545015.V811Ia5c4247M447388.diezel')
-    #Can't seem to walk the next two - possibly not an error email ?
-    lst.append('1336536039.V811Ia5c41e9M291749.diezel')
-    lst.append('1336536446.V811Ia5c41eaM317118.diezel')
-    #Another formatting problem but not the same as the last one
-    #again possilby not an error message at all
-    lst.append('1336539641.V811Ia5c4213M679074.diezel')
-    lst.append('1336541421.V811Ia5c4223M269284.diezel')
-    lst.append('1336545017.V811Ia5c424fM26027.diezel')
-    lst.append('1336545027.V811Ia5c4255M679749.diezel')
-    lst.append('1336550426.V811Ia5c4279M556351.diezel')
-    lst.append('1336561380.V811Ia5c42c5M700541.diezel')
-    '''
     return lst
 
 def main():
     '''
     lst_files_to_ignore = build_ignore_list() 
     path = 'C:/usr/rshea/mytemp/20110609/NZLPProblemEmails-20120510/'
-    #rawstr = r"""^(?P<fullErrorMessage>[<](?P<emailAddress>.+)[>].*)"""
-
-    #rw_str_verbose_msg = r"""<head>.*</head><body>.+mailto.*?>(?P<MAILTOADDRESS>.*?)<.*</body>"""
     listing = os.listdir(path)
-    #comp_obj = re.compile(rawstr,  re.MULTILINE| re.DOTALL)
-    #comp_obj_verbose_msg = re.compile( rw_str_verbose_msg,  \
-    #                                    re.IGNORECASE|re.MULTILINE| re.DOTALL)
 
     #Create a csv.DictWriter to write output to
     csv_dict_wrtr = csv.DictWriter( \
             HDR_OUTPUT_COLS, \
             restval='N/A', \
             dialect='excel')
+
     #Write the initial headers
     csv_dict_wrtr.writerow(dict(zip(HDR_OUTPUT_COLS, HDR_OUTPUT_COLS)))
 
-    dic_diag = {}
+    #Process each file in turn
     for in_file_name in listing:
         if in_file_name in lst_files_to_ignore:
             pass
         else:
             in_file_path = "%s/%s" % (path, in_file_name)
-            process_email_headers(in_file_name, in_file_path, csv_dict_wrtr)
-    pprint.pprint(dic_diag)
+            parse_email_for_del_stat_part(  in_file_name, 
+                                            in_file_path, 
+                                            csv_dict_wrtr)
 
 if __name__ == "__main__":
     main()