Commits

Richard Shea  committed 88423c2

This is a 'working' version which I wish to snapshot in case I break things subsequently. The headers of the 'error' part of the bounce-back email are captured to a dictionary and dumped out at the end of the processing of each email.

  • Participants
  • Parent commits 2582b0a

Comments (0)

Files changed (1)

File findBadAddresses.py

 print msg.get_payload()[0]
 '''
 import re
+import pprint
class MyError(Exception):
    """Script-specific error carrying an arbitrary payload in ``value``.

    ``str()`` of the exception is the ``repr`` of the payload, so string
    payloads are shown quoted.
    """

    def __init__(self, value):
        # Forward the payload to Exception so that ``args`` is populated;
        # without this, ``args`` is empty and the payload is lost to
        # anything that relies on it (pickling, generic logging).
        super(MyError, self).__init__(value)
        self.value = value

    def __str__(self):
        return repr(self.value)
+
+
 def stripLineFeeds(string):
     string = string.replace("\r","")
     string = string.replace("\n","")
 
     return MAILTOADDRESS
 
-def processFile(fileName, p, compile_obj, spamWriter, compile_objVerboseMsg):
def parseErrorComponent(inString):
    """Placeholder parser: accept ``inString`` and do nothing.

    The argument is not inspected and the function always returns None.
    """
    # Intentionally a no-op; the unused debug local and the commented-out
    # line-printing loop that used to occupy the docstring were removed.
    return None
+
def parseErrorComponent1(inMsg, dicDiag):
    """Accumulate the headers of the third walked part of ``inMsg``.

    ``inMsg.walk()`` is materialised into a list and the element at
    index 2 is selected (presumably the per-recipient block of a
    delivery-status report -- confirm against real bounce messages).
    Every ``(name, value)`` pair returned by that part's ``items()`` is
    appended to the list stored under ``name`` in ``dicDiag``.

    Parameters:
        inMsg: object exposing ``walk()`` whose parts expose ``items()``
            (for example an ``email.message.Message``).
        dicDiag: dict mapping header name to a list of values; it is
            updated in place.

    Returns:
        The same ``dicDiag`` object that was passed in.

    Raises:
        IndexError: if ``inMsg.walk()`` yields fewer than three parts.
    """
    # walk() is a generator, so it has to be materialised before indexing.
    lParts = list(inMsg.walk())

    items = lParts[2].items()
    print(items)
    for name, value in items:
        # setdefault creates the list on first sight of a header AND lets
        # us record the value; previously the first value seen for each
        # header name was silently dropped.
        dicDiag.setdefault(name, []).append(value)

    return dicDiag
+
def findEmail(inString):
    """Return the single email address mentioned in ``inString``.

    All addresses matching the pattern are collected. The text may
    repeat the same address any number of times, but every occurrence
    must be identical.

    Parameters:
        inString: text to search.

    Returns:
        The first (and only distinct) address found.

    Raises:
        MyError: if no address is found, or if two different addresses
            are found (in which case the input and the matches are
            dumped to stdout first).
    """
    #Following regex found at : http://stackp.online.fr/?p=19
    # Raw string: the pattern is unchanged, but "\w", "\-" and "\." are no
    # longer invalid escape sequences in an ordinary string literal.
    email_pattern = re.compile(r'([\w\-\.]+@(\w[\w\-]+\.)+[\w\-]+)')

    # findall yields one tuple per match (one entry per group); entry 0
    # is the outer group, i.e. the whole address.
    results = [match[0] for match in email_pattern.findall(inString)]

    if not results:
        raise MyError("Found zero email addresses so don't know what to do")

    if any(emailAddress != results[0] for emailAddress in results[1:]):
        print("+" * 50)
        print(inString)
        print("-" * 50)
        pprint.pprint(results)
        print("^" * 50)
        raise MyError("Found more than one email address so don't know what to do [%s]" % pprint.pformat(results))

    return results[0]
+
+
+def processFile(fileName, p, compile_obj, spamWriter, compile_objVerboseMsg, dicDiag, useAlternateEmailRegex = True):
+    print "About to process : %s" % fileName
+    pprint.pprint(dicDiag)
     f=file(p)
     msg = email.message_from_string(f.read())
     sErrText = msg.get_payload()[0]
+    print "(" * 50
+    msgDebug = msg.get_payload()[1]
+    print "Dump of headers STARTS"
+    pprint.pprint(msgDebug.items())
+    print "Dump of headers STOPS"
+    print "Walk of message STARTS"
+    dicDiag = parseErrorComponent1(msgDebug, dicDiag)
+    print "Walk of message ENDS  "
+    print ")" * 50
     # method 1: using a compile object
     '''
     print sErrText
         blnGoodErrMsg = True 
 
     
-    try:
-        emailAddress = match_obj.group('emailAddress')
-    #except AttributeError as (errno, strerror):
-    except AttributeError:
-        print "Error 1" 
-    except:
-        print "Unexpected error:", sys.exc_info()[0]
-        raise
+    if useAlternateEmailRegex == True:
+        try:
+            emailAddress = findEmail(sErrText.as_string())
+        except AttributeError:
+            print "Error 1" 
+        except:
+            print "Unexpected error:", sys.exc_info()[0]
+            raise
+        else:
+            blnGoodEmail = True
     else:
-        emailAddress = stripFromVerboseMessage(emailAddress, compile_objVerboseMsg)
-        emailAddress = stripLineFeeds(emailAddress)
-        blnGoodEmail = True
+        try:
+            emailAddress = match_obj.group('emailAddress')
+        except AttributeError:
+            print "Error 1" 
+        except:
+            print "Unexpected error:", sys.exc_info()[0]
+            raise
+        else:
+            emailAddress = stripFromVerboseMessage(emailAddress, compile_objVerboseMsg)
+            emailAddress = stripLineFeeds(emailAddress)
+            blnGoodEmail = True
 
     if blnGoodEmail == True and blnGoodErrMsg:
         print "%s -> %s" % (emailAddress, fullErrorMessage)
 import os
 import email
 import csv
+import sys
 lstFilesToIgnore = buildIgnoreList()
 
-path = 'C:/usr/rshea/mytemp/20110609/NZLPProblemEmails/'
+path = 'C:/usr/rshea/mytemp/20110609/NZLPProblemEmails-20120510/'
 rawstr = r"""^(?P<fullErrorMessage>[<](?P<emailAddress>.+)[>].*)"""
 
 rawstrVerboseMsg = r"""<head>.*</head><body>.+mailto.*?>(?P<MAILTOADDRESS>.*?)<.*</body>"""
 compile_obj = re.compile(rawstr,  re.MULTILINE| re.DOTALL)
 compile_objVerboseMsg = re.compile(rawstrVerboseMsg,  re.IGNORECASE|re.MULTILINE| re.DOTALL)
 
-spamWriter = csv.writer(open('NZLP-bademailaddresses.csv', 'wb'), dialect='excel')
+spamWriter = csv.writer(open('NZLP-bademailaddresses-20120510.csv', 'wb'), dialect='excel')
 
 fileCnt = 1  
+dicDiag = {}
 for infileName in listing:
     fileCnt += 1 
     if infileName in lstFilesToIgnore:
         pass
     else:
         infileFullPath = "%s/%s" % (path, infileName)
-        processFile(infileName, infileFullPath,compile_obj, spamWriter, compile_objVerboseMsg)
+        processFile(infileName, infileFullPath,compile_obj, spamWriter, compile_objVerboseMsg, dicDiag, True)
+pprint.pprint(dicDiag)
 
+