Commits

Richard Shea committed 1aa4369

A good (albeit rough) version of findBadAddresses.py

Comments (0)

Files changed (1)

findBadAddresses.py

     return results[0]
 
 
+def removeRFCNotation(emailToBeCleaned):
+    """Return the address part of an 'address-type;address' string.
+
+    The value is split on ';'. When a separator is present the text
+    after the first ';' (and before any second one) is returned, e.g.
+    'rfc822;user@example.com' gives 'user@example.com'. When there is no
+    ';' the input is returned unchanged.
+
+    Surrounding whitespace is not stripped from the result.
+    """
+    parts = emailToBeCleaned.split(';')
+    # str.split always yields at least one element, so "no separator"
+    # shows up as a one-element list, never as an empty one.
+    if len(parts) == 1:
+        return parts[0]
+    return parts[1]
+
+def processEmailHeaders(fileName, p, compile_obj, headerDictWriter):
+    """Write the headers of one part of an error email as a single CSV row.
+
+    The file at ``p`` is parsed as an email message. The second element
+    of its payload is walked, and the header fields of the third part
+    produced by that walk are collected into a dict keyed by upper-cased
+    header name. The row also carries 'SOURCE-FILENAME' (``fileName``)
+    and 'HUM-READ-EMAIL-ADDR' (the 'FINAL-RECIPIENT' value passed through
+    removeRFCNotation), and is handed to ``headerDictWriter.writerow``.
+
+    Arguments:
+        fileName         -- name used in the progress/diagnostic output
+                            and stored under 'SOURCE-FILENAME'
+        p                -- path of the file to read
+        compile_obj      -- not used by this function; kept so the
+                            signature matches the existing call site
+        headerDictWriter -- object with a writerow(dict) method
+
+    A file whose payload has no second element, or whose second element
+    cannot be walked, is reported on stdout and skipped.
+
+    Raises:
+        MyError    -- the same header name occurs twice
+        IndexError -- the walk produced fewer than three parts
+        KeyError   -- there is no FINAL-RECIPIENT header
+    """
+    print "About to process : %s" % fileName
+    # try/finally rather than relying on garbage collection, so the
+    # handle is released even when reading fails.
+    f = open(p)
+    try:
+        msg = email.message_from_string(f.read())
+    finally:
+        f.close()
+
+    try:
+        msgDebug = msg.get_payload()[1]
+    except IndexError:
+        print "File %s is not a recognised format [a]" % fileName
+        return
+
+    try:
+        # walk() is a generator; materialise it so parts can be indexed.
+        lParts = list(msgDebug.walk())
+    except AttributeError:
+        # msgDebug is not a message object, e.g. because get_payload()
+        # returned a string rather than a list of parts.
+        print "File %s is not a recognised format [b]" % fileName
+        return
+
+    lstHeadersKeyValues = lParts[2].items()
+    dicDiag = {'SOURCE-FILENAME': fileName}
+    for headerName, headerValue in lstHeadersKeyValues:
+        headerName = headerName.upper()
+        # Drop CR/LF so a multi-line header value stays on one CSV line.
+        headerValue = headerValue.replace("\r", "").replace("\n", "")
+        if headerName in dicDiag:
+            # The message needs the %s placeholder: without it the %
+            # operator raises TypeError before MyError can be raised.
+            raise MyError(
+                "Found a duplicate header when parsing error email : %s"
+                % pprint.pformat(lstHeadersKeyValues))
+        dicDiag[headerName] = headerValue
+    dicDiag['HUM-READ-EMAIL-ADDR'] = removeRFCNotation(dicDiag['FINAL-RECIPIENT'])
+    headerDictWriter.writerow(dicDiag)
+
 def processFile(fileName, p, compile_obj, spamWriter, compile_objVerboseMsg, dicDiag, useAlternateEmailRegex = True):
     print "About to process : %s" % fileName
     pprint.pprint(dicDiag)
 
 def buildIgnoreList():
+    """Return the list of input file names that the main loop skips.
+
+    As shown here every entry is disabled, so the returned list is empty.
+    """
     lst = []
+    # Everything between the two ''' markers is a bare string literal, not
+    # code: none of the append() calls inside it run.
+    '''
     lst.append('1306107016.V811I65cc3caM321663.diezel')
     lst.append('1306107022.V811I65cc3e2M13969.diezel')
     lst.append('1306107023.V811I65cc3e4M209786.diezel')
     lst.append('1306144832.V811I65cc4f6M753347.diezel')
     lst.append('1306148434.V811I65cc518M29274.diezel')
     lst.append('1306159236.V811I65cc556M88068.diezel')
+    #New Ignores
+    #Seems to have dodgy headers
+    #lst.append('1336530626.V811Ia5c41b8M190259.diezel')
+    #lst.append('1336545015.V811Ia5c4247M447388.diezel')
+    #Can't seem to walk the next two - possibly not an error email ?
+    lst.append('1336536039.V811Ia5c41e9M291749.diezel')
+    lst.append('1336536446.V811Ia5c41eaM317118.diezel')
+    #Another formatting problem but not the same as the last one
+    #again possilby not an error message at all
+    lst.append('1336539641.V811Ia5c4213M679074.diezel')
+    lst.append('1336541421.V811Ia5c4223M269284.diezel')
+    lst.append('1336545017.V811Ia5c424fM26027.diezel')
+    lst.append('1336545027.V811Ia5c4255M679749.diezel')
+    lst.append('1336550426.V811Ia5c4279M556351.diezel')
+    lst.append('1336561380.V811Ia5c42c5M700541.diezel')
+    '''
     return lst
 import os
 import email
 import csv
 import sys
 lstFilesToIgnore = buildIgnoreList()
-
+# Column order of the header CSV. Columns missing from a row are written
+# as 'N/A' (restval below).
+# NOTE(review): csv.DictWriter defaults to extrasaction='raise', so a row
+# containing a key that is not listed here raises ValueError - confirm
+# that is intended.
+headerOutputCols = ['HUM-READ-EMAIL-ADDR','ACTION', 'STATUS', 'DIAGNOSTIC-CODE', 'FINAL-RECIPIENT','ORIGINAL-RECIPIENT', 'SOURCE-FILENAME', 'REMOTE-MTA', 'LAST-ATTEMPT-DATE','WILL-RETRY-UNTIL'] 
 path = 'C:/usr/rshea/mytemp/20110609/NZLPProblemEmails-20120510/'
 rawstr = r"""^(?P<fullErrorMessage>[<](?P<emailAddress>.+)[>].*)"""
 
 compile_objVerboseMsg = re.compile(rawstrVerboseMsg,  re.IGNORECASE|re.MULTILINE| re.DOTALL)
 
 spamWriter = csv.writer(open('NZLP-bademailaddresses-20120510.csv', 'wb'), dialect='excel')
+headerDictWriter = csv.DictWriter(open('NZLP-bademailaddresses-headers-20120510.csv', 'wb'), headerOutputCols, restval='N/A', dialect='excel')
+#headerDictWriter.writeheader()
+# Write the column names as the first row by mapping each name to itself.
+headerDictWriter.writerow(dict(zip(headerOutputCols,headerOutputCols)))
 
+# The earlier processFile() loop is disabled by wrapping it in a bare
+# string literal; nothing between the two ''' markers runs.
+'''
 fileCnt = 1  
-dicDiag = {}
 for infileName in listing:
     fileCnt += 1 
     if infileName in lstFilesToIgnore:
     else:
         infileFullPath = "%s/%s" % (path, infileName)
         processFile(infileName, infileFullPath,compile_obj, spamWriter, compile_objVerboseMsg, dicDiag, True)
+'''
+
+dicDiag = {}
+# Run the header extraction on every name in listing (defined outside this
+# view) that is not on the ignore list.
+for infileName in listing:
+    if infileName in lstFilesToIgnore:
+        pass
+    else:
+        # path already ends in '/', so the joined path contains '//'.
+        infileFullPath = "%s/%s" % (path, infileName)
+        if infileName == '''1336519819.V811I75c5d6dM736965.diezel''':
+            # debugVar is not read anywhere in this view; presumably a
+            # spot to hang a debugger breakpoint for this one file.
+            debugVar = 1
+        processEmailHeaders(infileName, infileFullPath,compile_obj, headerDictWriter)
+# As shown here nothing in the loop above writes to this module-level
+# dicDiag (processEmailHeaders builds its own local dict), so this prints
+# the empty dict.
 pprint.pprint(dicDiag)