Commits

Jason Baldridge committed 82b5ddf

Updated the tokenization.

Comments (0)

Files changed (1)

src/main/scala/fogbow/util/StringUtil.scala

 
 object StringCleaner {
 
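+  // Cleans up a string by replacing every character that is not a letter or
+  // number with a space, turning each standalone digit sequence into the
+  // numeric symbol "[-numeric-]", dropping runs of ten or more non-whitespace
+  // characters, and getting rid of tokens that contain mixtures of alphabetic
+  // and numeric characters.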
   def apply (raw: String, doLowerCase: Boolean = true): String = {
     val cleaned = 
-      raw.replaceAll("[^\\p{L}\\p{N}]", " ").replaceAll("""\b\d+\b""", "[-numeric-]").trim()
+      raw.replaceAll("[^\\p{L}\\p{N}]", " ").replaceAll("""\b\d+\b""", "[-numeric-]").replaceAll("""\b[^\s]{10,}\b""","").replaceAll("""\b[^\s]*\w+\d+[^\s]*\b""","").replaceAll("""\b[^\s]*\d+\w+[^\s]*\b""","").trim()
     doLowerCase match {
       case true => cleaned.toLowerCase
       case false => cleaned