Commits

Jason Baldridge committed 73aa7ab

Added StringUtil for simple tokenization.

Comments (0)

Files changed (1)

src/main/scala/fogbow/util/StringUtil.scala

+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package fogbow.util;
+
+/**
+ * Tokenizer factory that replaces every character that is not a Unicode
+ * letter (`\p{L}`) or number (`\p{N}`) with a space and then hands the
+ * result to a `java.util.StringTokenizer`.
+ *
+ * Punctuation, symbols and underscores are therefore all treated as
+ * token separators.
+ */
+object CleanStringTokenizer {
+  import java.util.{Locale, StringTokenizer}
+
+  /**
+   * Cleans `raw` and wraps the cleaned text in a `StringTokenizer`.
+   *
+   * @param raw         the text to tokenize
+   * @param doLowerCase when true (the default), the cleaned text is
+   *                    lower-cased before tokenizing
+   * @return a tokenizer over the cleaned text, built with the
+   *         single-argument `StringTokenizer` constructor
+   */
+  def apply (raw: String, doLowerCase: Boolean = true): StringTokenizer = {
+    val cleaned = raw.replaceAll("[^\\p{L}\\p{N}]", " ")
+    // Locale.ROOT keeps lower-casing independent of the JVM default locale;
+    // under a Turkish locale, for example, "I" would otherwise become the
+    // dotless "ı" and tokens would differ from machine to machine.
+    new StringTokenizer(if (doLowerCase) cleaned.toLowerCase(Locale.ROOT) else cleaned)
+  }
+
+  // Explicit one-argument overload: Java callers cannot use Scala default
+  // arguments, so this lets them omit the lower-casing flag.
+  def apply (raw: String): StringTokenizer = apply(raw, doLowerCase = true)
+
+}