# Source: smtpErrorAnalysis / findBadAddresses.py (full commit)
'''
f=file(fp)
fp="./1306429999.V811I65cc573M345725.diezel"
msg = email_message_from_string(f)
print msg.as_string()
print msg.get_payload()[0]
'''
import re
import pprint
class MyError(Exception):
    """Script-specific exception that carries an arbitrary payload.

    The payload passed to the constructor is stored on ``value``; the
    string form of the exception is the ``repr()`` of that payload.
    """

    def __init__(self, value):
        self.value = value

    def __str__(self):
        # "%r" formatting yields exactly repr(self.value).
        return "%r" % (self.value,)


def stripLineFeeds(string):
    """Return ``string`` with all CR and LF characters removed and both ends trimmed.

    Line breaks are deleted outright (not replaced by a space), so text on
    either side of a break is joined together.
    """
    withoutBreaks = string.replace("\r", "").replace("\n", "")
    return withoutBreaks.strip()

def stripFromVerboseMessage(sIn, compile_objVerboseMsg):
    """Extract the ``MAILTOADDRESS`` group from ``sIn`` if the pattern matches.

    sIn                   -- text to search.
    compile_objVerboseMsg -- compiled regex exposing a named group
                             ``MAILTOADDRESS``.

    Returns the captured group when the pattern is found, otherwise
    returns ``sIn`` unchanged.
    """
    found = compile_objVerboseMsg.search(sIn)
    if found is None:
        # No verbose wrapper around the address: hand the input back as-is.
        return sIn
    return found.group('MAILTOADDRESS')

def parseErrorComponent(inString):
    """Placeholder that accepts ``inString`` and does nothing with it.

    Always returns None.
    """
    return None

def parseErrorComponent1(inMsg, dicDiag):
    """Accumulate the headers of the third walked part of ``inMsg`` in ``dicDiag``.

    inMsg   -- object with a ``walk()`` method yielding message parts (the
               caller passes the second payload part of a parsed e-mail).
    dicDiag -- dict mapping header name -> list of values seen so far; it is
               updated in place.

    The (name, value) pairs of the part at index 2 of the walk are printed
    and then appended to ``dicDiag``. Every value is recorded, including
    the first one seen for a given header name.

    Returns the same ``dicDiag`` object.
    Raises IndexError when the walk yields fewer than three parts.
    """
    # walk() is a generator; materialise it so a part can be picked by index.
    lParts = list(inMsg.walk())

    items = lParts[2].items()
    print(items)
    for headerName, headerValue in items:
        # setdefault creates the list on first sight AND still records the
        # value, so the first occurrence of a header is not lost.
        dicDiag.setdefault(headerName, []).append(headerValue)

    return dicDiag

def findEmail(inString):
    """Return the single e-mail address mentioned in ``inString``.

    Every address-like substring is collected. The text may repeat the same
    address any number of times, but all occurrences must be identical.

    Returns the address as a string.
    Raises MyError when no address is found, or when two different
    addresses are found (the text and the candidates are printed first to
    help diagnosis).
    """
    # Following regex found at : http://stackp.online.fr/?p=19
    # Raw string so the backslash escapes reach the regex engine untouched.
    email_pattern = re.compile(r'([\w\-\.]+@(\w[\w\-]+\.)+[\w\-]+)')

    # findall() returns one tuple per hit (two capture groups); element 0
    # is the whole address.
    results = [match[0] for match in email_pattern.findall(inString)]

    if not results:
        raise MyError("Found zero email addresses so don't know what to do")

    if any(emailAddress != results[0] for emailAddress in results[1:]):
        print("+" * 50)
        print(inString)
        print("-" * 50)
        pprint.pprint(results)
        print("^" * 50)
        raise MyError("Found more than one email address so don't know what to do [%s]" % pprint.pformat(results))

    return results[0]


def removeRFCNotation(emailToBeCleaned):
    """Drop a leading ``type;`` prefix from a recipient header value.

    For a value such as ``"rfc822; user@example.com"`` the segment that
    follows the first ``;`` is returned exactly as it appears (surrounding
    whitespace is NOT stripped). If further ``;`` separators are present,
    only the segment between the first and second one is returned.

    A value with no ``;`` at all is returned unchanged instead of raising
    IndexError.
    """
    segments = emailToBeCleaned.split(';')
    # split() always yields at least one element, so "no separator" means
    # exactly one segment.
    if len(segments) == 1:
        return segments[0]
    return segments[1]

def processEmailHeaders(fileName, p, compile_obj, headerDictWriter):
    """Write one row of delivery-report headers for the e-mail stored at ``p``.

    fileName         -- name used in progress messages and stored in the
                        row under 'SOURCE-FILENAME'.
    p                -- path of the e-mail file to read.
    compile_obj      -- not used by this function.
    headerDictWriter -- object with a ``writerow(dict)`` method.

    The second payload part of the message is walked, and the headers of
    the third walked part become the row: names are upper-cased and CR/LF
    are removed from values. 'HUM-READ-EMAIL-ADDR' is derived from the
    'FINAL-RECIPIENT' header via removeRFCNotation().

    Files that do not have this structure are reported on stdout as
    "not a recognised format" ([a] no second payload part, [b] second part
    cannot be walked, [c] fewer than three walked parts), and nothing is
    written for them.

    Raises MyError when the same header name occurs twice, and KeyError
    when there is no 'FINAL-RECIPIENT' header.
    """
    print("About to process : %s" % fileName)
    # Context manager so the handle is closed even if parsing fails.
    with open(p) as f:
        msg = email.message_from_string(f.read())

    try:
        msgDebug = msg.get_payload()[1]
    except IndexError:
        print("File %s is not a recognised format [a]" % fileName)
        return

    try:
        # walk() is a generator; a list is needed to pick a part by index.
        lParts = list(msgDebug.walk())
    except AttributeError:
        print("File %s is not a recognised format [b]" % fileName)
        return

    if len(lParts) < 3:
        # Same treatment as [a]/[b]: report and skip rather than abort the
        # whole run with an IndexError.
        print("File %s is not a recognised format [c]" % fileName)
        return

    lstHeadersKeyValues = lParts[2].items()
    dicDiag = {'SOURCE-FILENAME': fileName}
    for rawName, rawValue in lstHeadersKeyValues:
        headerName = rawName.upper()
        headerValue = rawValue.replace("\r", "").replace("\n", "")

        if headerName in dicDiag:
            raise MyError("Found a duplicate header when parsing error email [%s]" % pprint.pformat(lstHeadersKeyValues))
        dicDiag[headerName] = headerValue

    dicDiag['HUM-READ-EMAIL-ADDR'] = removeRFCNotation(dicDiag['FINAL-RECIPIENT'])
    headerDictWriter.writerow(dicDiag)

def processFile(fileName, p, compile_obj, spamWriter, compile_objVerboseMsg, dicDiag, useAlternateEmailRegex = True):
    """Extract the failing address and error text from the e-mail at ``p``.

    fileName               -- name used in progress output and in the row.
    p                      -- path of the e-mail file to read.
    compile_obj            -- compiled regex with named groups
                              'fullErrorMessage' and 'emailAddress',
                              searched in the first payload part.
    spamWriter             -- object with a ``writerow(list)`` method.
    compile_objVerboseMsg  -- compiled regex handed to
                              stripFromVerboseMessage().
    dicDiag                -- diagnostic dict updated in place by
                              parseErrorComponent1().
    useAlternateEmailRegex -- when true the address is located with
                              findEmail(); otherwise the 'emailAddress'
                              group of ``compile_obj`` is used.

    When both the address and the error message are found, the row
    [fileName, emailAddress, fullErrorMessage] is written; otherwise the
    first payload part is dumped to stdout between '+' rulers.

    Returns None. IndexError is raised if the message has fewer than two
    payload parts; unexpected errors during extraction are printed and
    re-raised.
    """
    print("About to process : %s" % fileName)
    pprint.pprint(dicDiag)
    # Context manager so the handle is closed even if parsing fails.
    with open(p) as f:
        msg = email.message_from_string(f.read())
    sErrText = msg.get_payload()[0]
    print("(" * 50)
    msgDebug = msg.get_payload()[1]
    print("Dump of headers STARTS")
    pprint.pprint(msgDebug.items())
    print("Dump of headers STOPS")
    print("Walk of message STARTS")
    dicDiag = parseErrorComponent1(msgDebug, dicDiag)
    print("Walk of message ENDS  ")
    print(")" * 50)

    blnGoodErrMsg = False
    blnGoodEmail = False
    # Flatten the first part once; it is searched and possibly dumped below.
    errorText = sErrText.as_string()

    # Retrieve group(s) by name. A failed search leaves match_obj as None,
    # which surfaces as AttributeError on .group().
    match_obj = compile_obj.search(errorText)
    try:
        fullErrorMessage = match_obj.group('fullErrorMessage')
    except AttributeError:
        print("Error 1")
    except Exception:
        print("Unexpected error: %s" % (sys.exc_info()[0],))
        raise
    else:
        fullErrorMessage = stripLineFeeds(fullErrorMessage)
        blnGoodErrMsg = True

    if useAlternateEmailRegex:
        try:
            emailAddress = findEmail(errorText)
        except AttributeError:
            print("Error 1")
        except Exception:
            print("Unexpected error: %s" % (sys.exc_info()[0],))
            raise
        else:
            blnGoodEmail = True
    else:
        try:
            emailAddress = match_obj.group('emailAddress')
        except AttributeError:
            print("Error 1")
        except Exception:
            print("Unexpected error: %s" % (sys.exc_info()[0],))
            raise
        else:
            emailAddress = stripFromVerboseMessage(emailAddress, compile_objVerboseMsg)
            emailAddress = stripLineFeeds(emailAddress)
            blnGoodEmail = True

    if blnGoodEmail and blnGoodErrMsg:
        print("%s -> %s" % (emailAddress, fullErrorMessage))
        spamWriter.writerow([fileName, emailAddress, fullErrorMessage])
    else:
        # Could not extract both pieces: show the raw text for inspection.
        print("+" * 60)
        print(errorText)
        print("+" * 60)


def processFilePlain(p, compile_obj):
    """Return the first payload element of the e-mail stored at ``p``.

    p           -- path of the e-mail file to read.
    compile_obj -- not used by this function.

    The file is parsed with email.message_from_string() and element 0 of
    its payload is returned. Raises IndexError when the payload is empty.
    """
    # Context manager so the handle is closed as soon as the text is read.
    with open(p) as f:
        msg = email.message_from_string(f.read())
    return msg.get_payload()[0]

def buildIgnoreList():
    """Return the list of file names the main loop should skip.

    Every entry is currently disabled, so a new empty list is returned on
    each call. The disabled entries are kept below for reference.
    """
    # lst.append('1306107016.V811I65cc3caM321663.diezel')
    # lst.append('1306107022.V811I65cc3e2M13969.diezel')
    # lst.append('1306107023.V811I65cc3e4M209786.diezel')
    # lst.append('1306107024.V811I65cc3e5M217596.diezel')
    # lst.append('1306139420.V811I65cc4e2M320634.diezel')
    # lst.append('1306139432.V811I65cc4efM494423.diezel')
    # lst.append('1306144832.V811I65cc4f6M753347.diezel')
    # lst.append('1306148434.V811I65cc518M29274.diezel')
    # lst.append('1306159236.V811I65cc556M88068.diezel')
    # New Ignores
    # Seems to have dodgy headers
    # #lst.append('1336530626.V811Ia5c41b8M190259.diezel')
    # #lst.append('1336545015.V811Ia5c4247M447388.diezel')
    # Can't seem to walk the next two - possibly not an error email ?
    # lst.append('1336536039.V811Ia5c41e9M291749.diezel')
    # lst.append('1336536446.V811Ia5c41eaM317118.diezel')
    # Another formatting problem but not the same as the last one
    # again possibly not an error message at all
    # lst.append('1336539641.V811Ia5c4213M679074.diezel')
    # lst.append('1336541421.V811Ia5c4223M269284.diezel')
    # lst.append('1336545017.V811Ia5c424fM26027.diezel')
    # lst.append('1336545027.V811Ia5c4255M679749.diezel')
    # lst.append('1336550426.V811Ia5c4279M556351.diezel')
    # lst.append('1336561380.V811Ia5c42c5M700541.diezel')
    return []
import os
import email
import csv
import sys
# ---------------------------------------------------------------------------
# Script body: read every file in ``path`` and write one row of
# delivery-report headers per recognised e-mail to the headers CSV.
# ---------------------------------------------------------------------------
lstFilesToIgnore = buildIgnoreList()
# Column order of the headers CSV; headers missing from a message are
# written as 'N/A' (see restval below).
headerOutputCols = ['HUM-READ-EMAIL-ADDR','ACTION', 'STATUS', 'DIAGNOSTIC-CODE', 'FINAL-RECIPIENT','ORIGINAL-RECIPIENT', 'SOURCE-FILENAME', 'REMOTE-MTA', 'LAST-ATTEMPT-DATE','WILL-RETRY-UNTIL'] 
path = 'C:/usr/rshea/mytemp/20110609/NZLPProblemEmails-20120510/'
rawstr = r"""^(?P<fullErrorMessage>[<](?P<emailAddress>.+)[>].*)"""

rawstrVerboseMsg = r"""<head>.*</head><body>.+mailto.*?>(?P<MAILTOADDRESS>.*?)<.*</body>"""
listing = os.listdir(path)
compile_obj = re.compile(rawstr,  re.MULTILINE| re.DOTALL)
compile_objVerboseMsg = re.compile(rawstrVerboseMsg,  re.IGNORECASE|re.MULTILINE| re.DOTALL)

# Both output files are opened under ``with`` so they are flushed and closed
# even if processing a file raises. (Nested rather than combined so the
# statement also parses on interpreters without multi-item ``with``.)
with open('NZLP-bademailaddresses-20120510.csv', 'wb') as spamFile:
    with open('NZLP-bademailaddresses-headers-20120510.csv', 'wb') as headerFile:
        spamWriter = csv.writer(spamFile, dialect='excel')
        headerDictWriter = csv.DictWriter(headerFile, headerOutputCols, restval='N/A', dialect='excel')
        # Header row written by hand: each column name mapped to itself.
        #headerDictWriter.writeheader()
        headerDictWriter.writerow(dict(zip(headerOutputCols,headerOutputCols)))

        # Disabled alternative pass that used processFile()/spamWriter:
        #
        # fileCnt = 1
        # for infileName in listing:
        #     fileCnt += 1
        #     if infileName in lstFilesToIgnore:
        #         pass
        #     else:
        #         infileFullPath = "%s/%s" % (path, infileName)
        #         processFile(infileName, infileFullPath,compile_obj, spamWriter, compile_objVerboseMsg, dicDiag, True)

        # Only the disabled processFile() pass fills dicDiag, so it is
        # still empty when printed at the end.
        dicDiag = {}
        for infileName in listing:
            if infileName in lstFilesToIgnore:
                continue
            infileFullPath = "%s/%s" % (path, infileName)
            if infileName == '''1336519819.V811I75c5d6dM736965.diezel''':
                # No functional effect; looks like a debugger breakpoint
                # anchor for one particular file.
                debugVar = 1
            processEmailHeaders(infileName, infileFullPath,compile_obj, headerDictWriter)
        pprint.pprint(dicDiag)