Commits

firefly committed 5fc0b53

Unicode lookup module: change ugly Java tool into lookup in UnicodeData.txt.

Comments (0)

Files changed (5)

modules/05-unicode-lookup/CharDB.java

-public class CharDB {
-  public static String typeToString(int codepoint) {
-      switch (codepoint) {
-          case Character.COMBINING_SPACING_MARK:
-              return "combining spacing mark";
-          case Character.CONNECTOR_PUNCTUATION:
-              return "connector punctuation";
-          case Character.CONTROL:
-              return "control";
-          case Character.CURRENCY_SYMBOL:
-              return "currency symbol";
-          case Character.DASH_PUNCTUATION:
-              return "dash punctuation";
-          case Character.DECIMAL_DIGIT_NUMBER:
-              return "decimal digit number";
-          case Character.ENCLOSING_MARK:
-              return "enclosing mark";
-          case Character.END_PUNCTUATION:
-              return "end punctuation";
-          case Character.FINAL_QUOTE_PUNCTUATION:
-              return "final quote punctuation";
-          case Character.FORMAT:
-              return "format";
-          case Character.INITIAL_QUOTE_PUNCTUATION:
-              return "initial quote punctuation";
-          case Character.LETTER_NUMBER:
-              return "letter number";
-          case Character.LINE_SEPARATOR:
-              return "line separator";
-          case Character.LOWERCASE_LETTER:
-              return "lowercase letter";
-          case Character.MATH_SYMBOL:
-              return "math symbol";
-          case Character.MODIFIER_LETTER:
-              return "modifier letter";
-          case Character.MODIFIER_SYMBOL:
-              return "modifier symbol";
-          case Character.NON_SPACING_MARK:
-              return "non spacing mark";
-          case Character.OTHER_LETTER:
-              return "other letter";
-          case Character.OTHER_NUMBER:
-              return "other number";
-          case Character.OTHER_PUNCTUATION:
-              return "other punctuation";
-          case Character.OTHER_SYMBOL:
-              return "other symbol";
-          case Character.PARAGRAPH_SEPARATOR:
-              return "paragraph separator";
-          case Character.PRIVATE_USE:
-              return "private use";
-          case Character.SPACE_SEPARATOR:
-              return "space separator";
-          case Character.START_PUNCTUATION:
-              return "start punctuation";
-          case Character.SURROGATE:
-              return "surrogate";
-          case Character.TITLECASE_LETTER:
-              return "titlecase letter";
-          case Character.UNASSIGNED:
-              return "unassigned";
-          case Character.UPPERCASE_LETTER:
-              return "uppercase letter";
-          default:
-              return "<Missing>";
-      }
-  }
-  public static String directionalityToString(int codepoint) {
-      switch (codepoint) {
-          case Character.DIRECTIONALITY_UNDEFINED:
-              return "directionality undefined";
-          case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
-              return "directionality left to right";
-          case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
-              return "directionality right to left";
-          case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
-              return "directionality right to left arabic";
-          case Character.DIRECTIONALITY_EUROPEAN_NUMBER:
-              return "directionality european number";
-          case Character.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR:
-              return "directionality european number separator";
-          case Character.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR:
-              return "directionality european number terminator";
-          case Character.DIRECTIONALITY_ARABIC_NUMBER:
-              return "directionality arabic number";
-          case Character.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR:
-              return "directionality common number separator";
-          case Character.DIRECTIONALITY_NONSPACING_MARK:
-              return "directionality nonspacing mark";
-          case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
-              return "directionality boundary neutral";
-          case Character.DIRECTIONALITY_PARAGRAPH_SEPARATOR:
-              return "directionality paragraph separator";
-          case Character.DIRECTIONALITY_SEGMENT_SEPARATOR:
-              return "directionality segment separator";
-          case Character.DIRECTIONALITY_WHITESPACE:
-              return "directionality whitespace";
-          case Character.DIRECTIONALITY_OTHER_NEUTRALS:
-              return "directionality other neutrals";
-          case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
-              return "directionality left to right embedding";
-          case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
-              return "directionality left to right override";
-          case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
-              return "directionality right to left embedding";
-          case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
-              return "directionality right to left override";
-          case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
-              return "directionality pop directional format";
-          default:
-              return "<Missing>";
-      }
-  }
-}

modules/05-unicode-lookup/CharInfo.java

-
-public class CharInfo {
-	private static String codepointToJSON(int cp) {
-		String directionality = CharDB.directionalityToString(
-				Character.getDirectionality(cp));
-
-		String type = CharDB.typeToString(Character.getType(cp));
-
-		return jsonProp("name"           , Character.getName(cp))
-		     + jsonProp("type"           , type)
-		     + jsonProp("directionality" , directionality)
-	//	     + "\n"
-	//	     + jsonProp("isIdeographic"  , Character.isIdeographic(cp))
-	//	     + jsonProp("isAlphabetic"   , Character.isAlphabetic(cp))
-	//	     + jsonProp("isBmpCodePoint" , Character.isBmpCodePoint(cp))
-	//	     + jsonProp("isDigit"        , Character.isDigit(cp))
-	//	     + jsonProp("isWhitespace"   , Character.isWhitespace(cp))
-	//	     + "\n"
-	//	     + jsonProp("numericValue"   , Character.getNumericValue(cp))
-		     + "}";
-	}
-
-	//-- JSON output helpers ----------------------------------------
-	private static boolean first = true;
-	private static String jsonPropRaw(String name, String encodedValue) {
-		String prefix;
-
-		if (first) {
-			first = false;
-			prefix = "{";
-		} else {
-			prefix = ",";
-		}
-
-		return prefix + " \"" + name + "\" : " + encodedValue + "\n";
-	}
-
-	private static String jsonProp(String name, String value) {
-		return jsonPropRaw(name, "\"" + value + "\"");
-	}
-	private static String jsonProp(String name, double value) {
-		return jsonPropRaw(name, "" + value);
-	}
-	private static String jsonProp(String name, boolean value) {
-		return jsonPropRaw(name, "" + value);
-	}
-
-	//-- main -------------------------------------------------------
-	public static void main(String[] args) {
-		if (args.length != 1) {
-			throw new RuntimeException("Invalid argument count!");
-		}
-
-		int codepoint;
-		if (Character.codePointCount(args[0], 0, args[0].length()) == 1) {
-			codepoint = Character.codePointAt(args[0], 0);
-		} else {
-			System.err.println("Charlength, codepointlength: " +
-                                           args[0].length() + " " +
-                                           Character.codePointCount(args[0], 0, args[0].length()) + " " +
-                                           args[0]);
-			throw new RuntimeException("Invalid character count!");
-		}
-
-		System.out.println(codepointToJSON(codepoint));
-	}
-}

modules/05-unicode-lookup/index.js

 var util          = require('util')
   , child_process = require('child_process')
 
-function parseChar(chr) {
-  // Extract codepoint
-  if (match = chr.match(/^U\+([\dA-F]+)$/i)) {
-    var codepoint = parseInt(match[1], 16)
-
-  } else if (chr.match(/^U\+/)) {
-    throw new Error("Invalid U+ codepoint.")
-
-  } else {
-    var codepoint = chr.charCodeAt()
-    // TODO: Add support for >U+FFFF here too
-  }
-
-  // Create string encoding the codepoint
-  if (codepoint < 0x10000) {
-    var str = String.fromCharCode(codepoint)
-
-  } else {
-    var codepoint_ = codepoint - 0x10000
-      , lead       = (codepoint_ >> 10)   + 0xD800
-      , trail      = (codepoint_ & 0x3FF) + 0xDC00
-      , str        = String.fromCharCode(lead, trail)
-  }
-
-  // Return result
-  return { codepoint: codepoint,
-           string:    str,
-           input:     chr }
-}
-
-function lookup(chr, callback) {
+function lookup(mode, input, callback) {
   // Do the lookup
-  var proc = child_process.spawn('java', ['-cp', moduleReldir, 'CharInfo', chr])
+  var proc = child_process.spawn('sh', [ moduleReldir + '/lookup.sh', mode, input ])
 
   var errData = []
   proc.stderr.on('data', function (data) {
 
     } else if (code != 0) {
       // Abnormal exit code
-      var stderr = errData.join("")
-      callback(new Error("stderr: " + stderr))
+      if (code == 1) {
+        callback(new Error("Character not found!"))
+      } else {
+        var stderr = errData.join("")
+        callback(new Error(code + " stderr: " + stderr))
+      }
 
     } else {
       // Handle normal exit
       var stdout = outData.join("")
-      callback(null, JSON.parse(stdout))
+      callback(null, stdout.trim().split('\n').map(function (line) {
+        return line.split(';')
+      }))
     }
   })
-
-  /*
-  child_process.exec('java -cp "' + moduleReldir + '" CharInfo "' + chr + '"',
-        function(err, stdout, stderr) {
-    if (err) return callback(err)
-
-    if (stderr.length) {
-      callback(new Error("stderr: " + stderr))
-    }
-
-    callback(null, JSON.parse(stdout))
-  })
-  */
 }
 
 bot.triggers.command.add('char', {
   help: "Looks up information about a unicode character.",
-  exec: function(data) {
+  exec: function (data) {
     var self = this
       , match
+      , args = Array.prototype.slice.call(arguments, 1)
       , chrs = data.line.split(/ (?! |$)/)
 
-    if (chrs.length < 1) {
-      self.send(data.target, "Error: usage: 'char <character>' or "
-                           + "'char U+<codepoint>'")
-      return
-
-    } else if (chrs.length > 3) {
-      self.send(data.target, "Error: no more than three characters allowed.")
+    if (data.line == "") {
+      self.send(data.target, "Error: usage: 'char <char> ...', "
+                           + "'char U+<codepoint> ...' or 'char <name fragment>'")
       return
     }
 
-    chrs.forEach(function (chr) {
-      var data = parseChar(chr)
+    // U+...
+    if (args.every(isCodePoint)) {
+      chain(args.slice(0,3), function (str, cb) {
+        var cp = str.match(/[0-9A-Fa-f]+/)[0]
+        lookup('codepoint', cp, cb)
+      }, charCallback)
 
-      lookup(data.string, function (err, charinfo) {
-          if (err) throw err
+    // Codepoint names
+    } else if (args.every(isWordLike) && args[0].length >= 2) {
+      var name = args.join(" ").toUpperCase()
+      lookup('name', name, charCallback)
 
-          var codepointPretty = data.codepoint.toString(16).toUpperCase()
-          if (codepointPretty.length < 4) {
-              codepointPretty = Array(5 - codepointPretty.length).join("0") + codepointPretty
-          }
+    // Characters
+    } else {
+      args = chrs.every(isOneChar)? chrs : data.line.split("")
+      chain(args.slice(0,3), function (chr, cb) {
+        var cp = chr.charCodeAt()
+        lookup('codepoint', cp.toString(16).toUpperCase(), cb)
+      }, charCallback)
+    }
 
-          console.warn("TEST", data.string.charCodeAt(), data.string.charCodeAt(1), data.string.length)
+    function chain(arr, fn, cb) {
+      var idx = 0
+      return f(null, null)
 
-          self.reply(util.format("%s (%s) [U+%s %s]", charinfo.name,
-                                 charinfo.type, codepointPretty, data.string))
+      function f(err, res) {
+        if (err) {
+          cb(err, null)
+        } else {
+          if (idx > 0) cb(null, res)
+          if (idx < arr.length) fn(arr[idx++], f)
+        }
+      }
+    }
+
+    function charCallback(err, infos) {
+      if (err) return self.reply(String(err))
+
+      infos.slice(0,3).forEach(function (info) {
+        var chr = toChar(parseInt(info[0], 16))
+        self.reply(util.format("U+%s [%s] %s: %s ", info[0], info[2], info[1], chr))
       })
-    })
+
+      // Encode a Unicode code point as a JavaScript (UTF-16) string.
+      // Code points below U+10000 are a single code unit; anything above is
+      // split into a surrogate pair: subtract 0x10000, then put the high
+      // 10 bits in the lead surrogate and the low 10 bits in the trail one.
+      // (Masking the unshifted value with 0x3F only worked for plane 1.)
+      function toChar(cp) {
+        if (cp < 0x10000) return String.fromCharCode(cp)
+
+        var offset = cp - 0x10000
+        return String.fromCharCode(0xD800 + (offset >> 10),
+                                   0xDC00 + (offset & 0x3FF))
+      }
+    }
+
+    function isCodePoint(str) { return /^U\+[0-9A-Fa-f]+,?$/.test(str) }
+    function isWordLike(str)  { return /^[A-Za-z0-9\-]+$/.test(str) }
+    function isOneChar(str)   { return str.length == 1 }
   }
 })
 

modules/05-unicode-lookup/lookup.sh

+#!/bin/sh
+#
+# Look up records in UnicodeData.txt and print the matching lines.
+#
+# Usage: lookup.sh codepoint <hex>   record whose first field is <hex>
+#                                    (leading zeros and letter case ignored)
+#        lookup.sh name <TEXT>       records with a field exactly equal to
+#                                    TEXT; if there are none, records with a
+#                                    later field containing TEXT
+#
+# Exit status: 0 = at least one match, 1 = no match,
+#              2 = usage error or grep failure.
+
+dir="modules/05-unicode-lookup"
+database="$dir/UnicodeData.txt"
+
+# An empty search term would turn the name pattern into ";;", which matches
+# almost every record, so reject it up front.
+if [ -z "$2" ]; then
+  echo >&2 "$0: usage: $0 codepoint|name <value>"
+  exit 2
+fi
+
+case "$1" in
+  # -i: the database stores upper-case hex, callers may pass lower-case.
+  codepoint) grep -i "^0*$2;" "$database" ;;
+  name)      grep ";$2;"      "$database" || \
+             grep ";[^;]*$2"  "$database" ;;
+  # Exit non-zero (and not 1, which means "no match") so the caller does
+  # not mistake the empty output for a lookup result.
+  *)         echo >&2 "$0: unknown command: '$1'"
+             exit 2 ;;
+esac

modules/05-unicode-lookup/mkCharDB_java.sh

-#!/bin/sh
-
-function printFunc {
-	name="$1"
-	values="$2"
-
-	echo "  public static String ${name}(int codepoint) {"
-	echo "      switch (codepoint) {"
-
-	echo "${values}" | sed 's/,\s*/\n/g' | while read const; do
-		pretty="$(echo $const | tr '_[:upper:]' ' [:lower:]')"
-		echo "          case Character.${const}:"
-		echo "              return \"${pretty}\";"
-	done
-
-	echo "          default:"
-	echo "              return \"<Missing>\";"
-	echo "      }"
-	echo "  }"
-}
-
-echo "public class CharDB {"
-
-TYPES='COMBINING_SPACING_MARK, CONNECTOR_PUNCTUATION, CONTROL, CURRENCY_SYMBOL, DASH_PUNCTUATION, DECIMAL_DIGIT_NUMBER, ENCLOSING_MARK, END_PUNCTUATION, FINAL_QUOTE_PUNCTUATION, FORMAT, INITIAL_QUOTE_PUNCTUATION, LETTER_NUMBER, LINE_SEPARATOR, LOWERCASE_LETTER, MATH_SYMBOL, MODIFIER_LETTER, MODIFIER_SYMBOL, NON_SPACING_MARK, OTHER_LETTER, OTHER_NUMBER, OTHER_PUNCTUATION, OTHER_SYMBOL, PARAGRAPH_SEPARATOR, PRIVATE_USE, SPACE_SEPARATOR, START_PUNCTUATION, SURROGATE, TITLECASE_LETTER, UNASSIGNED, UPPERCASE_LETTER'
-DIRECTIONALITIES='DIRECTIONALITY_UNDEFINED, DIRECTIONALITY_LEFT_TO_RIGHT, DIRECTIONALITY_RIGHT_TO_LEFT, DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC, DIRECTIONALITY_EUROPEAN_NUMBER, DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR, DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR, DIRECTIONALITY_ARABIC_NUMBER, DIRECTIONALITY_COMMON_NUMBER_SEPARATOR, DIRECTIONALITY_NONSPACING_MARK, DIRECTIONALITY_BOUNDARY_NEUTRAL, DIRECTIONALITY_PARAGRAPH_SEPARATOR, DIRECTIONALITY_SEGMENT_SEPARATOR, DIRECTIONALITY_WHITESPACE, DIRECTIONALITY_OTHER_NEUTRALS, DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING, DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE, DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING, DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE, DIRECTIONALITY_POP_DIRECTIONAL_FORMAT'
-
-printFunc 'typeToString'           "$TYPES"
-printFunc 'directionalityToString' "$DIRECTIONALITIES"
-
-echo "}"
-