Richard Shea committed 9053f35

In the middle of a monster cleanup to make the code PEP8 compliant. This commit is a working version and I wanted to snapshot it before I threw away the original way the parsing was done, in case I want to come back to it later.

Comments (0)

Files changed (1)

findBadAddresses.py

+import os
+import email
+import csv
+import sys
+import re
+import pprint
 '''
 f=file(fp)
 fp="./1306429999.V811I65cc573M345725.diezel"
         return repr(self.value)
 
 
-def stripLineFeeds(string):
+def strip_line_feeds(string):
     string = string.replace("\r","")
     string = string.replace("\n","")
     string = string.strip()
     return string
 
-def stripFromVerboseMessage(sIn, compile_objVerboseMsg):
-    match_obj = compile_objVerboseMsg.search(sIn)
+def strip_from_verbose_message(sin, comp_obj_verbose_msg):
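+    # Extract the MAILTOADDRESS group from an HTML "verbose" bounce body;
+    # return the input unchanged when the regex does not match.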
+    match_obj = comp_obj_verbose_msg.search(sin)
     try:
         MAILTOADDRESS = match_obj.group('MAILTOADDRESS')
     except AttributeError: 
-        MAILTOADDRESS = sIn
+        MAILTOADDRESS = sin
 
     return MAILTOADDRESS
 
-def parseErrorComponent(inString):
+def parse_error_component(in_string):
     '''
-    for line in inString:
+    for line in instr:
         print line
     '''
-    debugVar = -1 
+    dbg_var = -1 
 
-def parseErrorComponent1(inMsg, dicDiag):
-    '''
-    for line in inString:
-        print line
-    '''
-    debugVar = -1 
-    '''
-    for part in inMsg.walk():
-        print ""
-        print part.items()
-        print ""
-    '''
+def parse_error_component1(in_msg, dic_diag):
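+    # Accumulate the headers of the message's third part into dic_diag,
+    # one list of values per header name.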
+    dbg_var = -1 
     #Do stupid stuff to convert
     #a generator to a list
-    lParts = []
-    for lPart in inMsg.walk():
-        lParts.append(lPart)
+    list_parts = []
+    for elem in in_msg.walk():
+        list_parts.append(elem)
 
-    items = lParts[2].items()
+    items = list_parts[2].items()
     print items
     for item in items:
-        if dicDiag.has_key(item[0]):
-            dicDiag[item[0]].append(item[1]) 
+        if dic_diag.has_key(item[0]):
+            dic_diag[item[0]].append(item[1]) 
         else:
-            dicDiag[item[0]] = [] 
+            dic_diag[item[0]] = [item[1]]
 
-    return dicDiag
+    return dic_diag
 
-def findEmail(inString):
+def find_email(instr):
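+    # Return the single email address found in instr; raise MyError when no
+    # address is found or when two differing addresses are found.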
+    ERR1 = "Found zero email addresses so don't know what to do"
+    ERR2 = "Found more than one email address so don't know what to do [%s]"
     #Following regex found at : http://stackp.online.fr/?p=19
     email_pattern = re.compile('([\w\-\.]+@(\w[\w\-]+\.)+[\w\-]+)')
     hitCount = 0
     results = []
     #Put all email addresses found into a list. The resulting list
     #should contain one or more identical email addresses
-    for match in email_pattern.findall(inString):
+    for match in email_pattern.findall(instr):
         results.append(match[0])
 
     if len(results) == 0: 
-        raise MyError("Found zero email addresses so don't know what to do")
+        raise MyError(ERR1)
     elif len(results) > 1:
-        for emailAddress in results[1:]:
-            if emailAddress != results[0]:  
-                print "+" * 50
-                print inString
-                print "-" * 50
-                pprint.pprint(results)
-                print "^" * 50
-                raise MyError("Found more than one email address so don't know what to do [%s]" % pprint.pformat(results))
+        for email_address in results[1:]:
+            if email_address != results[0]:  
+                raise MyError(ERR2 % pprint.pformat(results))
 
     return results[0]
 
 
-def removeRFCNotation(emailToBeCleaned):
-    l = emailToBeCleaned.split(';')
+def remove_rfc_notation(email_to_be_cleaned):
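+    # Strip the leading "rfc822;" notation from a Final-Recipient value and
+    # return just the address portion.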
+    l = email_to_be_cleaned.split(';')
     if len(l) == 1:
         return l[0]
     else:
         return l[1]
 
-def processEmailHeaders(fileName, p, compile_obj, headerDictWriter):
-    print "About to process : %s" % fileName
+def process_email_headers(file_name, p, comp_obj, csv_dict_wrtr):
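+    # Parse one bounce file, collect the delivery-status headers from its
+    # debug part, add the source file name and a cleaned address, then write
+    # the row out through the csv.DictWriter.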
+    ERR1 = "Found a duplicate header when parsing error email"
+    print "About to process : %s" % file_name
     f=file(p)
     msg = email.message_from_string(f.read())
     try:
-        msgDebug = msg.get_payload()[1]
+        msg_debug = msg.get_payload()[1]
     except IndexError:
-        print "File %s is not a recognised format [a]" % fileName
+        print "File %s is not a recognised format [a]" % file_name
     else:
         #Do stupid stuff to convert
         #a generator to a list
-        lParts = []
+        list_parts = []
         try:
-            for lPart in msgDebug.walk():
-                lParts.append(lPart)
+            for elem in msg_debug.walk():
+                list_parts.append(elem)
         except AttributeError:
-            print "File %s is not a recognised format [b]" % fileName
+            print "File %s is not a recognised format [b]" % file_name
         else:
-            lstHeadersKeyValues = lParts[2].items()
-            dicDiag = {'SOURCE-FILENAME': fileName}
+            lstHeadersKeyValues = list_parts[2].items()
+            dic_diag = {'SOURCE-FILENAME': file_name}
             for headerKeyValue in lstHeadersKeyValues:
                 headerName = headerKeyValue[0].upper()
                 headerValue = headerKeyValue[1]
                 headerValue = headerValue.replace("\r","")
                 headerValue = headerValue.replace("\n","")
                 
-                if dicDiag.has_key(headerName):
-                    raise MyError("Found a duplicate header when parsing error email" % pprint.pformat(lstHeadersKeyValues))
+                if dic_diag.has_key(headerName):
+                    raise MyError(ERR1 % pprint.pformat(lstHeadersKeyValues))
                 else:
-                    dicDiag[headerName] = headerValue 
-            dicDiag['HUM-READ-EMAIL-ADDR'] = removeRFCNotation(dicDiag['FINAL-RECIPIENT'])
-            headerDictWriter.writerow(dicDiag)
+                    dic_diag[headerName] = headerValue 
+            dic_diag['HUM-READ-EMAIL-ADDR'] = \
+                    remove_rfc_notation(dic_diag['FINAL-RECIPIENT'])
+            csv_dict_wrtr.writerow(dic_diag)
 
-def processFile(fileName, p, compile_obj, spamWriter, compile_objVerboseMsg, dicDiag, useAlternateEmailRegex = True):
-    print "About to process : %s" % fileName
-    pprint.pprint(dicDiag)
+def process_file(   file_name, 
+                    p, 
+                    comp_obj, 
+                    csv_row_wrtr, 
+                    comp_obj_verbose_msg, 
+                    dic_diag, 
+                    use_alt_email_regex = True):
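+    # Parse one bounce file: pull the full error line with comp_obj, find the
+    # failing address (via find_email or the regex's emailAddress group) and
+    # write a [file_name, address, error] row through csv_row_wrtr.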
+    print "About to process : %s" % file_name
+    pprint.pprint(dic_diag)
     f=file(p)
     msg = email.message_from_string(f.read())
-    sErrText = msg.get_payload()[0]
+    err_txt = msg.get_payload()[0]
     print "(" * 50
-    msgDebug = msg.get_payload()[1]
+    msg_debug = msg.get_payload()[1]
     print "Dump of headers STARTS"
-    pprint.pprint(msgDebug.items())
+    pprint.pprint(msg_debug.items())
     print "Dump of headers STOPS"
     print "Walk of message STARTS"
-    dicDiag = parseErrorComponent1(msgDebug, dicDiag)
+    dic_diag = parse_error_component1(msg_debug, dic_diag)
     print "Walk of message ENDS  "
     print ")" * 50
     # method 1: using a compile object
     '''
-    print sErrText
-    print type(sErrText)
+    print err_txt
+    print type(err_txt)
     import pprint       
-    pprint.pprint(dir(sErrText))
+    pprint.pprint(dir(err_txt))
     '''
-    blnGoodErrMsg = False
-    blnGoodEmail = False
+    good_err_msg = False
+    good_email = False
     # Retrieve group(s) by name
-    match_obj = compile_obj.search(sErrText.as_string())
+    match_obj = comp_obj.search(err_txt.as_string())
     try:
-        fullErrorMessage = match_obj.group('fullErrorMessage')
+        full_error_msg = match_obj.group('fullErrorMessage')
     #except AttributeError as (errno, strerror):
     except AttributeError:
         print "Error 1" 
         print "Unexpected error:", sys.exc_info()[0]
         raise
     else:
-        fullErrorMessage = stripLineFeeds(fullErrorMessage)
-        blnGoodErrMsg = True 
+        full_error_msg = strip_line_feeds(full_error_msg)
+        good_err_msg = True 
 
     
-    if useAlternateEmailRegex == True:
+    if use_alt_email_regex:
         try:
-            emailAddress = findEmail(sErrText.as_string())
+            email_address = find_email(err_txt.as_string())
         except AttributeError:
             print "Error 1" 
         except:
             print "Unexpected error:", sys.exc_info()[0]
             raise
         else:
-            blnGoodEmail = True
+            good_email = True
     else:
         try:
-            emailAddress = match_obj.group('emailAddress')
+            email_address = match_obj.group('emailAddress')
         except AttributeError:
             print "Error 1" 
         except:
             print "Unexpected error:", sys.exc_info()[0]
             raise
         else:
-            emailAddress = stripFromVerboseMessage(emailAddress, compile_objVerboseMsg)
-            emailAddress = stripLineFeeds(emailAddress)
-            blnGoodEmail = True
+            email_address = \
+                    strip_from_verbose_message(email_address, comp_obj_verbose_msg)
+            email_address = strip_line_feeds(email_address)
+            good_email = True
 
-    if blnGoodEmail == True and blnGoodErrMsg:
-        print "%s -> %s" % (emailAddress, fullErrorMessage)
-        spamWriter.writerow([fileName, emailAddress, fullErrorMessage])
+    if good_email and good_err_msg:
+        print "%s -> %s" % (email_address, full_error_msg)
+        csv_row_wrtr.writerow([file_name, email_address, full_error_msg])
     else:
         print "+" * 60
-        print sErrText.as_string()
+        print err_txt.as_string()
         print "+" * 60
 
 
-def processFilePlain(p, compile_obj):
+def process_file_plain(p, comp_obj):
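+    # Return the first payload part (the plain error text) of the file at p.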
     f=file(p)
     msg = email.message_from_string(f.read())
-    sErrText = msg.get_payload()[0]
-    return sErrText 
+    err_txt = msg.get_payload()[0]
+    return err_txt 
 
-def buildIgnoreList():
+def build_ignore_list():
     lst = []
     '''
     lst.append('1306107016.V811I65cc3caM321663.diezel')
     lst.append('1336561380.V811Ia5c42c5M700541.diezel')
     '''
     return lst
-import os
-import email
-import csv
-import sys
-lstFilesToIgnore = buildIgnoreList()
-headerOutputCols = ['HUM-READ-EMAIL-ADDR','ACTION', 'STATUS', 'DIAGNOSTIC-CODE', 'FINAL-RECIPIENT','ORIGINAL-RECIPIENT', 'SOURCE-FILENAME', 'REMOTE-MTA', 'LAST-ATTEMPT-DATE','WILL-RETRY-UNTIL'] 
-path = 'C:/usr/rshea/mytemp/20110609/NZLPProblemEmails-20120510/'
-rawstr = r"""^(?P<fullErrorMessage>[<](?P<emailAddress>.+)[>].*)"""
 
-rawstrVerboseMsg = r"""<head>.*</head><body>.+mailto.*?>(?P<MAILTOADDRESS>.*?)<.*</body>"""
-listing = os.listdir(path)
-compile_obj = re.compile(rawstr,  re.MULTILINE| re.DOTALL)
-compile_objVerboseMsg = re.compile(rawstrVerboseMsg,  re.IGNORECASE|re.MULTILINE| re.DOTALL)
+def main():
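+    # Walk the bounce mail directory and write the delivery-status headers of
+    # each recognisable file to a CSV; the earlier per-message parsing loop is
+    # kept below in a commented-out block.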
+    lst_files_to_ignore = build_ignore_list() 
+    hdr_output_cols = ['HUM-READ-EMAIL-ADDR',
+                        'ACTION', 
+                        'STATUS', 
+                        'DIAGNOSTIC-CODE', 
+                        'FINAL-RECIPIENT',
+                        'ORIGINAL-RECIPIENT', 
+                        'SOURCE-FILENAME', 
+                        'REMOTE-MTA', 
+                        'LAST-ATTEMPT-DATE',
+                        'WILL-RETRY-UNTIL'] 
+    path = 'C:/usr/rshea/mytemp/20110609/NZLPProblemEmails-20120510/'
+    rawstr = r"""^(?P<fullErrorMessage>[<](?P<emailAddress>.+)[>].*)"""
 
-spamWriter = csv.writer(open('NZLP-bademailaddresses-20120510.csv', 'wb'), dialect='excel')
-headerDictWriter = csv.DictWriter(open('NZLP-bademailaddresses-headers-20120510.csv', 'wb'), headerOutputCols, restval='N/A', dialect='excel')
-#headerDictWriter.writeheader()
-headerDictWriter.writerow(dict(zip(headerOutputCols,headerOutputCols)))
+    rw_str_verbose_msg = r"""<head>.*</head><body>.+mailto.*?>(?P<MAILTOADDRESS>.*?)<.*</body>"""
+    listing = os.listdir(path)
+    comp_obj = re.compile(rawstr, re.MULTILINE | re.DOTALL)
+    comp_obj_verbose_msg = re.compile(rw_str_verbose_msg,
+                                      re.IGNORECASE | re.MULTILINE | re.DOTALL)
 
-'''
-fileCnt = 1  
-for infileName in listing:
-    fileCnt += 1 
-    if infileName in lstFilesToIgnore:
-        pass
-    else:
-        infileFullPath = "%s/%s" % (path, infileName)
-        processFile(infileName, infileFullPath,compile_obj, spamWriter, compile_objVerboseMsg, dicDiag, True)
-'''
+    csv_row_wrtr = csv.writer(
+            open('NZLP-bademailaddresses-20120510.csv', 'wb'),
+            dialect='excel')
+    csv_dict_wrtr = csv.DictWriter(
+            open('NZLP-bademailaddresses-headers-20120510.csv', 'wb'),
+            hdr_output_cols,
+            restval='N/A',
+            dialect='excel')
+    csv_dict_wrtr.writerow(dict(zip(hdr_output_cols, hdr_output_cols)))
 
-dicDiag = {}
-for infileName in listing:
-    if infileName in lstFilesToIgnore:
-        pass
-    else:
-        infileFullPath = "%s/%s" % (path, infileName)
-        if infileName == '''1336519819.V811I75c5d6dM736965.diezel''':
-            debugVar = 1
-        processEmailHeaders(infileName, infileFullPath,compile_obj, headerDictWriter)
-pprint.pprint(dicDiag)
+    '''
+    fileCnt = 1  
+    for infileName in listing:
+        fileCnt += 1 
+        if infileName in lst_files_to_ignore:
+            pass
+        else:
+            infileFullPath = "%s/%s" % (path, infileName)
+            process_file(infileName, infileFullPath, \
+                        comp_obj, csv_row_wrtr, \
+                        comp_obj_verbose_msg, dic_diag, True)
+    '''
 
+    dic_diag = {}
+    for infileName in listing:
+        if infileName in lst_files_to_ignore:
+            pass
+        else:
+            infileFullPath = "%s/%s" % (path, infileName)
+            process_email_headers(infileName, infileFullPath, \
+                    comp_obj, csv_dict_wrtr)
+    pprint.pprint(dic_diag)
 
+if __name__ == "__main__":
+    main()
+
+
+