Jim Baker avatar Jim Baker committed e545087

Added category enum mapping

Comments (0)

Files changed (1)


 from com.ibm.icu.lang import UCharacter, UProperty
 from com.ibm.icu.util import VersionInfo
 from com.ibm.icu.lang.UCharacter import EastAsianWidth
-from com.ibm.icu.lang.UCharacterEnums import ECharacterDirection
+from com.ibm.icu.lang.UCharacterEnums import ECharacterCategory, ECharacterDirection
 # FIXME add __all__
 # Mapping from ICU4J enumerations for category, bidirectional, and
 # east_asian_width to the underlying property values that Python uses
-# from UnicodeData.txt required a manual mapping from ICU4J docs,
-# using this file as reference:
+# from UnicodeData.txt required a manual mapping between the following
+# two files:
+# http://icu-project.org/apiref/icu4j/constant-values.html
 # http://www.unicode.org/Public/6.3.0/ucd/PropertyValueAliases.txt
+_cat = {
+    ECharacterCategory.COMBINING_SPACING_MARK: "Mc",
+    ECharacterCategory.CONNECTOR_PUNCTUATION: "Pc",
+    ECharacterCategory.CONTROL: "Cc",
+    ECharacterCategory.CURRENCY_SYMBOL: "Sc",
+    ECharacterCategory.DASH_PUNCTUATION: "Pd",
+    ECharacterCategory.DECIMAL_DIGIT_NUMBER: "Nd",
+    ECharacterCategory.ENCLOSING_MARK: "Me",
+    ECharacterCategory.END_PUNCTUATION: "Pe",
+    ECharacterCategory.FINAL_PUNCTUATION: "Pf",
+    ECharacterCategory.FORMAT: "Cf",
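+    # Per http://icu-project.org/apiref/icu4j/com/ibm/icu/lang/UCharacterEnums.ECharacterCategory.html#GENERAL_OTHER_TYPES
+    # no characters in UnicodeData.txt have this property.
+    # NOTE(review): ICU4J appears to give GENERAL_OTHER_TYPES the same
+    # constant value as UNASSIGNED; if so, the later UNASSIGNED entry
+    # overrides this one and lookups yield "Cn" - confirm.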
+    ECharacterCategory.GENERAL_OTHER_TYPES: "Cn Not Assigned",
+    ECharacterCategory.INITIAL_PUNCTUATION: "Pi",
+    ECharacterCategory.LETTER_NUMBER: "Nl",
+    ECharacterCategory.LINE_SEPARATOR: "Zl",
+    ECharacterCategory.LOWERCASE_LETTER: "Ll",
+    ECharacterCategory.MATH_SYMBOL: "Sm",
+    ECharacterCategory.MODIFIER_LETTER: "Lm",
+    ECharacterCategory.MODIFIER_SYMBOL: "Sk",
+    ECharacterCategory.NON_SPACING_MARK: "Mn",
+    ECharacterCategory.OTHER_LETTER: "Lo",
+    ECharacterCategory.OTHER_NUMBER: "No",
+    ECharacterCategory.OTHER_PUNCTUATION: "Po",
+    ECharacterCategory.OTHER_SYMBOL: "So",
+    ECharacterCategory.PARAGRAPH_SEPARATOR: "Zp",
+    ECharacterCategory.PRIVATE_USE: "Co",
+    ECharacterCategory.SPACE_SEPARATOR: "Zs",
+    ECharacterCategory.START_PUNCTUATION: "Ps",
+    ECharacterCategory.SURROGATE: "Cs",
+    ECharacterCategory.TITLECASE_LETTER: "Lt",
+    ECharacterCategory.UNASSIGNED: "Cn",
+    ECharacterCategory.UPPERCASE_LETTER: "Lu",
 def category(unichr):
-    return UCharacter.getType(get_codepoint(unichr))
+    return _cat[UCharacter.getType(get_codepoint(unichr))]
 _dir = {
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.