Commits

Anonymous committed 7898d76

import from //branches/cupcake/...@142529


Files changed (182)

+include $(call all-subdir-makefiles)
+LOCAL_PATH:= $(call my-dir)
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := \
+         $(call all-subdir-java-files)
+
+LOCAL_PACKAGE_NAME := PinyinIME
+
+LOCAL_JNI_SHARED_LIBRARIES := libjni_pinyinime
+
+LOCAL_STATIC_JAVA_LIBRARIES := com.android.inputmethod.pinyin.lib
+
+LOCAL_CERTIFICATE := shared
+
+# Make sure our dictionary file is not compressed, so we can read it with
+# a raw file descriptor.
+LOCAL_AAPT_FLAGS := -0 .dat
+
+include $(BUILD_PACKAGE)
+
+MY_PATH := $(LOCAL_PATH)
+
+include $(MY_PATH)/jni/Android.mk
+include $(MY_PATH)/lib/Android.mk

PinyinIME/AndroidManifest.xml

+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Copyright (C) 2009 The Android Open Source Project
+
+     Licensed under the Apache License, Version 2.0 (the "License");
+     you may not use this file except in compliance with the License.
+     You may obtain a copy of the License at
+
+          http://www.apache.org/licenses/LICENSE-2.0
+
+     Unless required by applicable law or agreed to in writing, software
+     distributed under the License is distributed on an "AS IS" BASIS,
+     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     See the License for the specific language governing permissions and
+     limitations under the License.
+-->
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    package="com.android.inputmethod.pinyin"
+    android:sharedUserId="android.uid.shared">
+        <uses-permission android:name="android.permission.VIBRATE"/>
+
+        <application android:icon="@drawable/app_icon"
+          android:label="@string/ime_name">
+            <service android:name=".PinyinDecoderService"
+                android:exported="true">
+                <intent-filter>
+                    <action android:name="com.android.inputmethod.pinyin.Decoder_Service" />
+                    <category android:name="android.intent.category.DEFAULT" />
+                </intent-filter>
+            </service>
+
+            <service android:name=".PinyinIME"
+                android:label="@string/ime_name"
+                    android:permission="android.permission.BIND_INPUT_METHOD">
+                <intent-filter>
+                    <action android:name="android.view.InputMethod" />
+                </intent-filter>
+                <meta-data android:name="android.view.im" android:resource="@xml/method" />
+            </service>
+
+            <activity android:name=".SettingsActivity"
+                android:label="@string/ime_settings_activity_name">
+                <intent-filter>
+                    <action android:name="android.intent.action.MAIN"/>
+                </intent-filter>
+            </activity>
+
+        </application>
+</manifest>

PinyinIME/jni/Android.mk

+LOCAL_PATH := $(call my-dir)
+
+### shared library
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := \
+	android/com_android_inputmethod_pinyin_PinyinDecoderService.cpp \
+	share/dictbuilder.cpp \
+	share/dictlist.cpp \
+	share/dicttrie.cpp \
+	share/lpicache.cpp \
+	share/matrixsearch.cpp \
+	share/mystdlib.cpp \
+	share/ngram.cpp \
+	share/pinyinime.cpp \
+	share/searchutility.cpp \
+	share/spellingtable.cpp \
+	share/spellingtrie.cpp \
+	share/splparser.cpp \
+	share/userdict.cpp \
+	share/utf16char.cpp \
+	share/utf16reader.cpp \
+	share/sync.cpp
+
+LOCAL_C_INCLUDES += $(JNI_H_INCLUDE)
+LOCAL_LDLIBS += -lpthread
+LOCAL_MODULE := libjni_pinyinime
+LOCAL_PRELINK_MODULE := false
+LOCAL_SHARED_LIBRARIES := libcutils libutils
+
+include $(BUILD_SHARED_LIBRARY)

PinyinIME/jni/android/com_android_inputmethod_pinyin_PinyinDecoderService.cpp

+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <assert.h>
+#include <cutils/log.h>
+#include <jni.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../include/pinyinime.h"
+#include "../include/sync.h"
+#include "../include/userdict.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+using namespace ime_pinyin;
+
+#define RET_BUF_LEN 256
+
+static char16 retbuf[RET_BUF_LEN];
+static char16 (*predict_buf)[kMaxPredictSize + 1] = NULL;
+static size_t predict_len;
+
+static Sync sync_worker;
+
+static struct file_descriptor_offsets_t
+{
+  jclass mClass;
+  jfieldID mDescriptor;
+} gFileDescriptorOffsets;
+
+JNIEXPORT jboolean JNICALL nativeImOpenDecoder(JNIEnv* env, jclass jclazz,
+                                               jbyteArray fn_sys_dict,
+                                               jbyteArray fn_usr_dict) {
+  jbyte *fsd = (*env).GetByteArrayElements(fn_sys_dict, 0);
+  jbyte *fud = (*env).GetByteArrayElements(fn_usr_dict, 0);
+
+  if (im_open_decoder((const char*)fsd, (const char*)fud))
+    return JNI_TRUE;
+
+  return JNI_FALSE;
+}
+
+JNIEXPORT jboolean JNICALL nativeImOpenDecoderFd(JNIEnv* env, jclass jclazz,
+                                                 jobject fd_sys_dict,
+                                                 jlong startoffset,
+                                                 jlong length,
+                                                 jbyteArray fn_usr_dict) {
+  jint fd = env->GetIntField(fd_sys_dict, gFileDescriptorOffsets.mDescriptor);
+  jbyte *fud = (*env).GetByteArrayElements(fn_usr_dict, 0);
+
+  int newfd = dup(fd);
+  if (im_open_decoder_fd(newfd, startoffset, length, (const char*)fud)) {
+    close(newfd);
+    return JNI_TRUE;
+  }
+
+  close(newfd);
+  return JNI_FALSE;
+}
+
+JNIEXPORT void JNICALL nativeImSetMaxLens(JNIEnv* env, jclass jclazz,
+                                          jint max_sps_len,
+                                          jint max_hzs_len) {
+  im_set_max_lens(static_cast<size_t>(max_sps_len),
+                  static_cast<size_t>(max_hzs_len));
+  return;
+}
+
+JNIEXPORT jboolean JNICALL nativeImCloseDecoder(JNIEnv* env, jclass jclazz) {
+  im_close_decoder();
+  return JNI_TRUE;
+}
+
+JNIEXPORT jint JNICALL nativeImSearch(JNIEnv* env, jclass jclazz,
+                                      jbyteArray pybuf, jint pylen) {
+  jbyte *array_body = (*env).GetByteArrayElements(pybuf, 0);
+
+  if (NULL == array_body)
+    return 0;
+
+  return im_search((const char*)array_body, pylen);
+}
+
+JNIEXPORT jint JNICALL nativeImDelSearch(JNIEnv* env, jclass jclazz, jint pos,
+                                         jboolean is_pos_in_splid,
+                                         jboolean clear_fixed_this_step) {
+  return im_delsearch(pos, is_pos_in_splid, clear_fixed_this_step);
+}
+
+JNIEXPORT void JNICALL nativeImResetSearch(JNIEnv* env, jclass jclazz) {
+  im_reset_search();
+  return;
+}
+
+JNIEXPORT jint JNICALL nativeImAddLetter(JNIEnv *env, jclass clazz, jbyte ch) {
+  return im_add_letter(ch);
+}
+
+JNIEXPORT jstring JNICALL nativeImGetPyStr(JNIEnv* env, jclass jclazz,
+                                           jboolean decoded) {
+  size_t py_len;
+  const char *py = im_get_sps_str(&py_len);  // py_len gets decoded length
+  assert(NULL != py);
+  if (!decoded)
+    py_len = strlen(py);
+
+  const unsigned short *spl_start;
+  size_t len;
+  len = im_get_spl_start_pos(spl_start);
+
+  size_t i;
+  for (i = 0; i < py_len; i++)
+    retbuf[i] = py[i];
+  retbuf[i] = (char16)'\0';
+
+  jstring retstr = (*env).NewString((unsigned short*)retbuf, i);
+  return retstr;
+}
+
+JNIEXPORT jint JNICALL nativeImGetPyStrLen(JNIEnv* env, jclass jclazz,
+                                           jboolean decoded) {
+  size_t py_len;
+  const char *py = im_get_sps_str(&py_len);  // py_len gets decoded length
+  assert(NULL != py);
+  if (!decoded)
+    py_len = strlen(py);
+  return py_len;
+}
+
+JNIEXPORT jintArray JNICALL nativeImGetSplStart(JNIEnv* env, jclass jclazz) {
+  const unsigned short *spl_start;
+  size_t len;
+
+  // There will be len + 1 elements in the buffer when len > 0.
+  len = im_get_spl_start_pos(spl_start);
+
+  jintArray arr = (*env).NewIntArray(len + 2);
+  jint *arr_body = (*env).GetIntArrayElements(arr, 0);
+  assert(NULL != arr_body);
+  arr_body[0] = len; // element 0 is used to store the length of buffer.
+  for (size_t i = 0; i <= len; i++)
+    arr_body[i + 1] = spl_start[i];
+  return arr;
+}
+
+JNIEXPORT jstring JNICALL nativeImGetChoice(JNIEnv *env, jclass clazz,
+                                            jint candidateId) {
+  jstring retstr;
+  if(im_get_candidate(candidateId, retbuf, RET_BUF_LEN)) {
+    retstr = (*env).NewString(retbuf, utf16_strlen(retbuf));
+    return retstr;
+  } else {
+    retstr = (*env).NewString((unsigned short*)retbuf, 0);
+    return retstr;
+  }
+}
+
+JNIEXPORT jint JNICALL nativeImChoose(JNIEnv *env, jclass clazz,
+                                      jint choice_id) {
+  return im_choose(choice_id);
+}
+
+JNIEXPORT jint JNICALL nativeImCancelLastChoice(JNIEnv *env, jclass clazz) {
+  return im_cancel_last_choice();
+}
+
+JNIEXPORT jint JNICALL nativeImGetFixedLen(JNIEnv *env, jclass clazz) {
+  return im_get_fixed_len();
+}
+
+JNIEXPORT jboolean JNICALL nativeImCancelInput(JNIEnv *env, jclass clazz) {
+  if (im_cancel_input())
+    return JNI_TRUE;
+
+  return JNI_FALSE;
+}
+
+JNIEXPORT jboolean JNICALL nativeImFlushCache(JNIEnv *env, jclass clazz) {
+  im_flush_cache();
+  return JNI_TRUE;
+}
+
+JNIEXPORT jint JNICALL nativeImGetPredictsNum(JNIEnv *env, jclass clazz,
+                                              jstring fixed_str) {
+  char16 *fixed_ptr = (char16*)(*env).GetStringChars(fixed_str, false);
+  size_t fixed_len = (size_t)(*env).GetStringLength(fixed_str);
+
+  char16 fixed_buf[kMaxPredictSize + 1];
+
+  if (fixed_len > kMaxPredictSize) {
+    fixed_ptr += fixed_len - kMaxPredictSize;
+    fixed_len = kMaxPredictSize;
+  }
+  utf16_strncpy(fixed_buf, fixed_ptr, fixed_len);
+  fixed_buf[fixed_len] = (char16)'\0';
+
+  predict_len = im_get_predicts(fixed_buf, predict_buf);
+
+  return predict_len;
+}
+
+JNIEXPORT jstring JNICALL nativeImGetPredictItem(JNIEnv *env, jclass clazz,
+                                                 jint predict_no) {
+  jstring retstr;
+
+  if (predict_no < 0 || (size_t)predict_no >= predict_len) {
+    retstr = (*env).NewString((unsigned short*)predict_buf[0], 0);
+  } else {
+    retstr = (*env).NewString((unsigned short*)predict_buf[predict_no],
+                              utf16_strlen(predict_buf[predict_no]));
+  }
+  return retstr;
+}
+
+JNIEXPORT jboolean JNICALL nativeSyncBegin(JNIEnv *env, jclass clazz,
+                                           jbyteArray dict_file) {
+  jbyte *file_name = (*env).GetByteArrayElements(dict_file, 0);
+  if (true == sync_worker.begin((const char *)file_name))
+    return JNI_TRUE;
+  return JNI_FALSE;
+}
+
+JNIEXPORT jboolean JNICALL nativeSyncFinish(JNIEnv *env, jclass clazz) {
+  sync_worker.finish();
+  return JNI_TRUE;
+}
+
+JNIEXPORT jint JNICALL nativeSyncGetCapacity(JNIEnv *env, jclass clazz) {
+  return sync_worker.get_capacity();
+}
+
+JNIEXPORT jint JNICALL nativeSyncPutLemmas(JNIEnv *env, jclass clazz,
+                                           jstring tomerge) {
+
+  char16 *ptr = (char16*)(*env).GetStringChars(tomerge, NULL);
+  int len = (size_t)(*env).GetStringLength(tomerge);
+
+  int added = sync_worker.put_lemmas(ptr, len);
+
+  (*env).ReleaseStringChars(tomerge, ptr);
+
+  return added;
+}
+
+JNIEXPORT jstring JNICALL nativeSyncGetLemmas(JNIEnv *env, jclass clazz) {
+
+  int len = sync_worker.get_lemmas(retbuf, RET_BUF_LEN);
+  if (len == 0)
+    return NULL;
+  jstring retstr;
+  retstr = (*env).NewString((unsigned short*)retbuf, len);
+  return retstr;
+}
+
+JNIEXPORT jint JNICALL nativeSyncGetLastCount(JNIEnv *env, jclass clazz) {
+  return sync_worker.get_last_got_count();
+}
+
+JNIEXPORT jint JNICALL nativeSyncGetTotalCount(JNIEnv *env, jclass clazz) {
+  return sync_worker.get_total_count();
+}
+
+JNIEXPORT jboolean JNICALL nativeSyncClearLastGot(JNIEnv *env, jclass clazz) {
+  sync_worker.clear_last_got();
+  return JNI_TRUE;
+}
+
+/**
+ * Table of methods associated with a single class.
+ */
+static JNINativeMethod gMethods[] = {
+    /* name, signature, funcPtr */
+    /* ------Functions for Pinyin-to-hanzi decoding begin--------->> */
+    { "nativeImOpenDecoder", "([B[B)Z",
+            (void*) nativeImOpenDecoder },
+    { "nativeImOpenDecoderFd", "(Ljava/io/FileDescriptor;JJ[B)Z",
+            (void*) nativeImOpenDecoderFd },
+    { "nativeImSetMaxLens", "(II)V",
+            (void*) nativeImSetMaxLens },
+    { "nativeImCloseDecoder", "()Z",
+            (void*) nativeImCloseDecoder },
+    { "nativeImSearch",  "([BI)I",
+            (void*) nativeImSearch },
+    { "nativeImDelSearch",  "(IZZ)I",
+            (void*) nativeImDelSearch },
+    { "nativeImResetSearch",  "()V",
+            (void*) nativeImResetSearch },
+    { "nativeImAddLetter", "(B)I",
+            (void*) nativeImAddLetter },
+    { "nativeImGetPyStr", "(Z)Ljava/lang/String;",
+            (void*) nativeImGetPyStr },
+    { "nativeImGetPyStrLen", "(Z)I",
+            (void*) nativeImGetPyStrLen },
+    { "nativeImGetSplStart", "()[I",
+            (void*) nativeImGetSplStart },
+    { "nativeImGetChoice", "(I)Ljava/lang/String;",
+            (void*) nativeImGetChoice },
+    { "nativeImChoose", "(I)I",
+            (void*) nativeImChoose },
+    { "nativeImCancelLastChoice", "()I",
+            (void*) nativeImCancelLastChoice },
+    { "nativeImGetFixedLen", "()I",
+            (void*) nativeImGetFixedLen },
+    { "nativeImGetPredictsNum", "(Ljava/lang/String;)I",
+            (void*) nativeImGetPredictsNum },
+    { "nativeImGetPredictItem", "(I)Ljava/lang/String;",
+            (void*) nativeImGetPredictItem },
+    { "nativeImCancelInput", "()Z",
+            (void*) nativeImCancelInput },
+    { "nativeImFlushCache", "()Z",
+            (void*) nativeImFlushCache },
+    /* <<----Functions for Pinyin-to-hanzi decoding end------------- */
+
+    /* ------Functions for sync begin----------------------------->> */
+    { "nativeSyncBegin", "([B)Z",
+            (void*) nativeSyncBegin },
+    { "nativeSyncFinish", "()Z",
+            (void*) nativeSyncFinish },
+    { "nativeSyncPutLemmas", "(Ljava/lang/String;)I",
+            (void*) nativeSyncPutLemmas },
+    { "nativeSyncGetLemmas", "()Ljava/lang/String;",
+            (void*) nativeSyncGetLemmas },
+    { "nativeSyncGetLastCount", "()I",
+            (void*) nativeSyncGetLastCount },
+    { "nativeSyncGetTotalCount", "()I",
+            (void*) nativeSyncGetTotalCount },
+    { "nativeSyncClearLastGot", "()Z",
+            (void*) nativeSyncClearLastGot },
+    { "nativeSyncGetCapacity", "()I",
+            (void*) nativeSyncGetCapacity },
+    /* <<----Functions for sync end--------------------------------- */
+};
+
+
+/*
+ * Register several native methods for one class.
+ */
+static int registerNativeMethods(JNIEnv* env, const char* className,
+    JNINativeMethod* gMethods, int numMethods)
+{
+    jclass clazz;
+
+    clazz = (*env).FindClass(className);
+    if (clazz == NULL) {
+        return JNI_FALSE;
+    }
+    if ((*env).RegisterNatives(clazz, gMethods, numMethods) < 0) {
+        return JNI_FALSE;
+    }
+
+    clazz = env->FindClass("java/io/FileDescriptor");
+    LOG_FATAL_IF(clazz == NULL, "Unable to find Java class java.io.FileDescriptor");
+    gFileDescriptorOffsets.mClass = (jclass) env->NewGlobalRef(clazz);
+    gFileDescriptorOffsets.mDescriptor = env->GetFieldID(clazz, "descriptor", "I");
+    LOG_FATAL_IF(gFileDescriptorOffsets.mDescriptor == NULL,
+                 "Unable to find descriptor field in java.io.FileDescriptor");
+
+    return JNI_TRUE;
+}
+
+/*
+ * Register native methods for all classes we know about.
+ */
+static int registerNatives(JNIEnv* env)
+{
+    if (!registerNativeMethods(env,
+           "com/android/inputmethod/pinyin/PinyinDecoderService",
+            gMethods, sizeof(gMethods) / sizeof(gMethods[0])))
+        return JNI_FALSE;
+
+    return JNI_TRUE;
+}
+
+/*
+ * Called by the VM when this native library is loaded.
+ *
+ * Returns the JNI version on success, -1 on failure.
+ */
+JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void* reserved)
+{
+    JNIEnv* env = NULL;
+    jint result = -1;
+
+    if ((*vm).GetEnv((void**) &env, JNI_VERSION_1_4) != JNI_OK) {
+        goto bail;
+    }
+    assert(env != NULL);
+
+    if (!registerNatives(env)) {
+        goto bail;
+    }
+
+    /* success -- return valid version number */
+    result = JNI_VERSION_1_4;
+
+bail:
+    return result;
+}
+
+#ifdef __cplusplus
+}
+#endif
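
Illustration (not part of the commit): nativeImOpenDecoderFd above hands the native decoder a raw file descriptor plus a start offset and length, which only works because the build keeps the .dat dictionary uncompressed (LOCAL_AAPT_FLAGS := -0 .dat in the Android.mk). Below is a minimal C++ sketch of reading such a region with plain POSIX calls; the helper name and the way im_open_decoder_fd actually consumes the region are assumptions for illustration.

// Hypothetical helper: read an uncompressed asset region given a raw fd,
// a start offset and a length (the same triple passed to im_open_decoder_fd).
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

// Returns a malloc'ed copy of the region, or NULL on failure; the caller
// owns (and must free) the returned buffer.
static void* read_dict_region(int fd, off_t start, size_t length) {
  void *buf = malloc(length);
  if (NULL == buf)
    return NULL;
  ssize_t got = pread(fd, buf, length, start);
  if (got < 0 || (size_t)got != length) {
    free(buf);
    return NULL;
  }
  return buf;
}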

PinyinIME/jni/command/Makefile

+CC=gcc
+CFLAGS= -g -Wall -std=c99
+CPP=g++
+CPPFLAGS= -g3 -Wall -lpthread
+
+PINYINIME_DICTBUILDER=pinyinime_dictbuilder
+
+LIBRARY_SRC= \
+	    ../share/dictbuilder.cpp \
+	    ../share/dictlist.cpp \
+	    ../share/dicttrie.cpp \
+	    ../share/lpicache.cpp \
+	    ../share/mystdlib.cpp \
+	    ../share/ngram.cpp \
+	    ../share/searchutility.cpp \
+	    ../share/spellingtable.cpp \
+	    ../share/spellingtrie.cpp \
+	    ../share/splparser.cpp \
+	    ../share/utf16char.cpp \
+	    ../share/utf16reader.cpp \
+
+all: engine
+
+engine: $(PINYINIME_DICTBUILDER)
+
+$(PINYINIME_DICTBUILDER): $(LIBRARY_SRC) pinyinime_dictbuilder.cpp
+	@$(CPP) $(CPPFLAGS) -o $@ $?
+
+
+clean:
+	-rm -rf $(PINYINIME_DICTBUILDER)
+
+.PHONY: clean

PinyinIME/jni/command/pinyinime_dictbuilder.cpp

+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <unistd.h>
+#include "../include/dicttrie.h"
+
+using namespace ime_pinyin;
+
+/**
+ * Build binary dictionary model. Make sure that ___BUILD_MODEL___ is defined
+ * in dictdef.h.
+ */
+int main(int argc, char* argv[]) {
+  DictTrie* dict_trie = new DictTrie();
+  bool success;
+  if (argc >= 3)
+     success = dict_trie->build_dict(argv[1], argv[2]);
+  else
+     success = dict_trie->build_dict("../data/rawdict_utf16_65105_freq.txt",
+                                     "../data/valid_utf16.txt");
+
+  if (success) {
+    printf("Build dictionary successfully.\n");
+  } else {
+    printf("Build dictionary unsuccessfully.\n");
+    return -1;
+  }
+
+  success = dict_trie->save_dict("../../res/raw/dict_pinyin.dat");
+
+  if (success) {
+    printf("Save dictionary successfully.\n");
+  } else {
+    printf("Save dictionary unsuccessfully.\n");
+    return -1;
+  }
+
+  return 0;
+}

PinyinIME/jni/data/rawdict_utf16_65105_freq.txt

Binary file added.

PinyinIME/jni/data/valid_utf16.txt

Binary file added.

PinyinIME/jni/include/atomdictbase.h

+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * This class defines AtomDictBase class which is the base class for all atom
+ * dictionaries. Atom dictionaries are managed by the decoder class
+ * MatrixSearch.
+ *
+ * When the user appends a new character to the Pinyin string, all enabled atom
+ * dictionaries' extend_dict() will be called at least once to get candidates
+ * ended in this step (the information of starting step is also given in the
+ * parameter). Usually, when extend_dict() is called, a MileStoneHandle object
+ * returned by a previous call for an earlier step is given to speed up the
+ * look-up process, and a new MileStoneHandle object will be returned if
+ * the extension is successful.
+ *
+ * A returned MileStoneHandle object should be kept alive until
+ * reset_milestones() is called and this object is notified to be reset.
+ *
+ * Usually, the atom dictionary can use step information to manage its
+ * MileStoneHandle objects, or it can keep the objects in ascending order to
+ * make the reset easier.
+ *
+ * When the decoder loads the dictionary, it will give a starting lemma id for
+ * this atom dictionary to map an inner id to a global id. Global ids should be
+ * used when an atom dictionary talks to any component outside.
+ */
+#ifndef PINYINIME_INCLUDE_ATOMDICTBASE_H__
+#define PINYINIME_INCLUDE_ATOMDICTBASE_H__
+
+#include <stdlib.h>
+#include "./dictdef.h"
+#include "./searchutility.h"
+
+namespace ime_pinyin {
+class AtomDictBase {
+ public:
+  virtual ~AtomDictBase() {}
+
+  /**
+   * Load an atom dictionary from a file.
+   *
+   * @param file_name The file name to load dictionary.
+   * @param start_id The starting id used for this atom dictionary.
+   * @param end_id The end id (included) which can be used for this atom
+   * dictionary. User dictionary will always use the last id space, so it can
+   * ignore this parameter. All other atom dictionaries should check this
+   * parameter.
+   * @return True if succeed.
+   */
+  virtual bool load_dict(const char *file_name, LemmaIdType start_id,
+                         LemmaIdType end_id) = 0;
+
+  /**
+   * Close this atom dictionary.
+   *
+   * @return True if succeed.
+   */
+  virtual bool close_dict() = 0;
+
+  /**
+   * Get the total number of lemmas in this atom dictionary.
+   *
+   * @return The total number of lemmas.
+   */
+  virtual size_t number_of_lemmas() = 0;
+
+  /**
+   * This function is called by the decoder when user deletes a character from
+   * the input string, or begins a new input string.
+   *
+   * Different atom dictionaries may implement this function in different ways.
+   * An atom dictionary can use one of these two parameters (or both) to reset
+   * its corresponding MileStoneHandle objects according to its detailed
+   * implementation.
+   *
+   * For example, if an atom dictionary uses step information to manage its
+   * MileStoneHandle objects, parameter from_step can be used to identify which
+   * objects should be reset; otherwise, if another atom dictionary does not
+   * use the detailed step information and only uses ascending handles
+   * (ordered by step; for the same step, an earlier call gets a smaller
+   * handle), it can easily reset those MileStoneHandles which are larger than
+   * from_handle.
+   *
+   * The decoder always resets the decoding state by step. So when it begins
+   * resetting, it will call reset_milestones() of its atom dictionaries with
+   * the step information, and the MileStoneHandle objects returned by the
+   * earliest call of extend_dict() for that step.
+   *
+   * If an atom dictionary does not implement incremental search, this function
+   * can be totally ignored.
+   *
+   * @param from_step From which step (included) the MileStoneHandle
+   * objects should be reset.
+   * @param from_handle The earliest MileStoneHandle object for step from_step.
+   */
+  virtual void reset_milestones(uint16 from_step,
+                                MileStoneHandle from_handle) = 0;
+
+  /**
+   * Used to extend in this dictionary. The handle returned should keep valid
+   * until reset_milestones() is called.
+   *
+   * @param from_handle The previously returned extended handle without the new
+   * spelling id; it can be used to speed up the extension.
+   * @param dep The parameter used for extending.
+   * @param lpi_items Used to fill in the lemmas matched.
+   * @param lpi_max The length of the buffer.
+   * @param lpi_num Used to return the number of newly added items.
+   * @return The new mile stone for this extending. 0 if fail.
+   */
+  virtual MileStoneHandle extend_dict(MileStoneHandle from_handle,
+                                      const DictExtPara *dep,
+                                      LmaPsbItem *lpi_items,
+                                      size_t lpi_max, size_t *lpi_num) = 0;
+
+  /**
+   * Get lemma items with scores according to a spelling id stream.
+   * This atom dictionary does not need to sort the returned items.
+   *
+   * @param splid_str The spelling id stream buffer.
+   * @param splid_str_len The length of the spelling id stream buffer.
+   * @param lpi_items Used to return matched lemma items with scores.
+   * @param lpi_max The maximum size of the buffer to return result.
+   * @return The number of matched items which have been filled in to lpi_items.
+   */
+  virtual size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len,
+                          LmaPsbItem *lpi_items, size_t lpi_max) = 0;
+
+  /**
+   * Get a lemma string (The Chinese string) by the given lemma id.
+   *
+   * @param id_lemma The lemma id to get the string.
+   * @param str_buf The buffer to return the Chinese string.
+   * @param str_max The maximum size of the buffer.
+   * @return The length of the string, 0 if fail.
+   */
+  virtual uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf,
+                               uint16 str_max) = 0;
+
+  /**
+   * Get the full spelling ids for the given lemma id.
+   * If the given buffer is too short, return 0.
+   *
+   * @param splids Used to return the spelling ids.
+   * @param splids_max The maximum buffer length of splids.
+   * @param arg_valid Used to indicate whether the incoming parameters have
+   * been initialized and are valid. If it is true, the splids and splids_max are valid
+   * and there may be half ids in splids to be updated to full ids. In this
+   * case, splids_max is the number of valid ids in splids.
+   * @return The number of ids in the buffer.
+   */
+  virtual uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
+                                  uint16 splids_max, bool arg_valid) = 0;
+
+  /**
+   * Function used for prediction.
+   * No need to sort the newly added items.
+   *
+   * @param last_hzs The last n Chinese characters (called Hanzi); its length
+   * should be less than or equal to kMaxPredictSize.
+   * @param hzs_len Specifies the length (<= kMaxPredictSize) of the history.
+   * @param npre_items Used to return the result.
+   * @param npre_max The length of the buffer used to return the result.
+   * @param b4_used Number of prediction results (from npre_items[-b4_used])
+   * from other atom dictionaries. An atom dictionary can just ignore it.
+   * @return The number of prediction results from this atom dictionary.
+   */
+  virtual size_t predict(const char16 last_hzs[], uint16 hzs_len,
+                         NPredictItem *npre_items, size_t npre_max,
+                         size_t b4_used) = 0;
+
+  /**
+   * Add a lemma to the dictionary. If the dictionary allows adding new
+   * items and this item does not exist, add it.
+   *
+   * @param lemma_str The Chinese string of the lemma.
+   * @param splids The spelling ids of the lemma.
+   * @param lemma_len The length of the Chinese lemma.
+   * @param count The frequency count for this lemma.
+   */
+  virtual LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[],
+                                uint16 lemma_len, uint16 count) = 0;
+
+  /**
+   * Update a lemma's occurring count.
+   *
+   * @param lemma_id The lemma id to update.
+   * @param delta_count The frequency count to adjust.
+   * @param selected Indicate whether this lemma is selected by user and
+   * submitted to target edit box.
+   * @return The id if succeed, 0 if fail.
+   */
+  virtual LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count,
+                                   bool selected) = 0;
+
+  /**
+   * Get the lemma id for the given lemma.
+   *
+   * @param lemma_str The Chinese string of the lemma.
+   * @param splids The spelling ids of the lemma.
+   * @param lemma_len The length of the lemma.
+   * @return The matched lemma id, or 0 if fail.
+   */
+  virtual LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[],
+                                   uint16 lemma_len) = 0;
+
+  /**
+   * Get the lemma score.
+   *
+   * @param lemma_id The lemma id to get score.
+   * @return The score of the lemma, or 0 if fail.
+   */
+  virtual LmaScoreType get_lemma_score(LemmaIdType lemma_id) = 0;
+
+  /**
+   * Get the lemma score.
+   *
+   * @param lemma_str The Chinese string of the lemma.
+   * @param splids The spelling ids of the lemma.
+   * @param lemma_len The length of the lemma.
+   * @return The score of the lemma, or 0 if fail.
+   */
+  virtual LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[],
+                                uint16 lemma_len) = 0;
+
+  /**
+   * If the dictionary allows it, remove a lemma from it.
+   *
+   * @param lemma_id The id of the lemma to remove.
+   * @return True if succeed.
+   */
+  virtual bool remove_lemma(LemmaIdType lemma_id) = 0;
+
+  /**
+   * Get the total occurring count of this atom dictionary.
+   *
+   * @return The total occurring count of this atom dictionary.
+   */
+  virtual size_t get_total_lemma_count() = 0;
+
+  /**
+   * Set the total occurring count of other atom dictionaries.
+   *
+   * @param count The total occurring count of other atom dictionaries.
+   */
+  virtual void set_total_lemma_count_of_others(size_t count) = 0;
+
+  /**
+   * Notify this atom dictionary to flush the cached data to persistent storage
+   * if necessary.
+   */
+  virtual void flush_cache() = 0;
+};
+}
+
+#endif  // PINYINIME_INCLUDE_ATOMDICTBASE_H__
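
To make the milestone contract described in the header comment above concrete, here is a small hypothetical sketch of a caller in the style of the decoder: one extend_dict() call per input step, remembering the returned handle, and reset_milestones() when the user deletes characters. The step bookkeeping, the handle value used for the first step, and the DictExtPara preparation are assumptions for illustration only; the real driver is the MatrixSearch class.

// Hypothetical caller sketch for the AtomDictBase milestone contract.
#include "atomdictbase.h"

namespace ime_pinyin {

class ExtendDriver {
 public:
  explicit ExtendDriver(AtomDictBase *dict) : dict_(dict), steps_(0) {}

  // Called when a new character extends the Pinyin string. dep is assumed
  // to have been prepared by the search engine for this step.
  size_t on_new_step(const DictExtPara *dep,
                     LmaPsbItem *lpi_items, size_t lpi_max) {
    MileStoneHandle from = steps_ > 0 ? handles_[steps_ - 1] : 0;
    size_t lpi_num = 0;
    MileStoneHandle h = dict_->extend_dict(from, dep, lpi_items,
                                           lpi_max, &lpi_num);
    if (h != 0 && steps_ < kMaxSearchSteps)
      handles_[steps_++] = h;  // Remember the handle for this step.
    return lpi_num;
  }

  // Called when the user deletes back to step from_step: hand the dictionary
  // the earliest handle of that step so it can discard later milestones.
  void on_delete(uint16 from_step) {
    if (from_step < steps_) {
      dict_->reset_milestones(from_step, handles_[from_step]);
      steps_ = from_step;
    }
  }

 private:
  AtomDictBase *dict_;
  MileStoneHandle handles_[kMaxSearchSteps];
  size_t steps_;
};

}  // namespace ime_pinyin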

PinyinIME/jni/include/dictbuilder.h

+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PINYINIME_INCLUDE_DICTBUILDER_H__
+#define PINYINIME_INCLUDE_DICTBUILDER_H__
+
+#include <stdlib.h>
+#include "./utf16char.h"
+#include "./dictdef.h"
+#include "./dictlist.h"
+#include "./spellingtable.h"
+#include "./spellingtrie.h"
+#include "./splparser.h"
+
+namespace ime_pinyin {
+
+#ifdef ___BUILD_MODEL___
+
+#define ___DO_STATISTICS___
+
+class DictTrie;
+
+class DictBuilder {
+ private:
+  // The raw lemma array buffer.
+  LemmaEntry *lemma_arr_;
+  size_t lemma_num_;
+
+  // Used to store all possible single char items.
+  // Two items may have the same Hanzi while their spelling ids are different.
+  SingleCharItem *scis_;
+  size_t scis_num_;
+
+  // In the tree, root's level is -1.
+  // Lemma nodes for root, and level 0
+  LmaNodeLE0 *lma_nodes_le0_;
+
+  // Lemma nodes for layers whose levels are deeper than 0
+  LmaNodeGE1 *lma_nodes_ge1_;
+
+  // Number of used lemma nodes
+  size_t lma_nds_used_num_le0_;
+  size_t lma_nds_used_num_ge1_;
+
+  // Used to store homophonies' ids.
+  LemmaIdType *homo_idx_buf_;
+  // Number of homophonies each of which only contains one Chinese character.
+  size_t homo_idx_num_eq1_;
+  // Number of homophonies each of which contains more than one character.
+  size_t homo_idx_num_gt1_;
+
+  // The items with highest scores.
+  LemmaEntry *top_lmas_;
+  size_t top_lmas_num_;
+
+  SpellingTable *spl_table_;
+  SpellingParser *spl_parser_;
+
+#ifdef ___DO_STATISTICS___
+  size_t max_sonbuf_len_[kMaxLemmaSize];
+  size_t max_homobuf_len_[kMaxLemmaSize];
+
+  size_t total_son_num_[kMaxLemmaSize];
+  size_t total_node_hasson_[kMaxLemmaSize];
+  size_t total_sonbuf_num_[kMaxLemmaSize];
+  size_t total_sonbuf_allnoson_[kMaxLemmaSize];
+  size_t total_node_in_sonbuf_allnoson_[kMaxLemmaSize];
+  size_t total_homo_num_[kMaxLemmaSize];
+
+  size_t sonbufs_num1_;     // Number of son buffers with only 1 son
+  size_t sonbufs_numgt1_;   // Number of son buffers with more than 1 son
+
+  size_t total_lma_node_num_;
+
+  void stat_init();
+  void stat_print();
+#endif
+
+ public:
+
+  DictBuilder();
+  ~DictBuilder();
+
+  // Build dictionary trie from the file fn_raw. File fn_validhzs provides
+  // valid chars. If fn_validhzs is NULL, only chars in GB2312 will be
+  // included.
+  bool build_dict(const char* fn_raw, const char* fn_validhzs,
+                  DictTrie *dict_trie);
+
+ private:
+  // Fill in the buffer with id. The caller guarantees that the parameters are
+  // valid.
+  void id_to_charbuf(unsigned char *buf, LemmaIdType id);
+
+  // Update the offset of sons for a node.
+  void set_son_offset(LmaNodeGE1 *node, size_t offset);
+
+  // Update the offset of homophonies' ids for a node.
+  void set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset);
+
+  // Format a spelling string.
+  void format_spelling_str(char *spl_str);
+
+  // Sort lemma_arr by the Hanzi string, and give each unique item an id.
+  // The lemma list is sorted by Hanzi string so that items starting with a
+  // given prefix string can be found for prediction.
+  // Actually, the single-char items may be in another order, for example,
+  // spelling id order, etc.
+  // Return value is the next unallocated idx available.
+  LemmaIdType sort_lemmas_by_hz();
+
+  // Build the SingleCharItem list, and fill the hanzi_scis_ids in the
+  // lemma buffer lemma_arr_.
+  // This function should be called after the lemma array is ready.
+  // Return the number of unique SingleCharItem elements.
+  size_t build_scis();
+
+  // Construct a subtree using a subset of the spelling array (from
+  // item_start to item_end).
+  // parent is the parent node to update with the necessary information;
+  // parent can be a member of LmaNodeLE0 or LmaNodeGE1.
+  bool construct_subset(void* parent, LemmaEntry* lemma_arr,
+                        size_t item_start, size_t item_end, size_t level);
+
+
+  // Read valid Chinese Hanzis from the given file.
+  // num is used to return number of chars.
+  // The returned buffer is sorted, and the caller needs to free it.
+  char16* read_valid_hanzis(const char *fn_validhzs, size_t *num);
+
+
+  // Read a raw dictionary. max_item is the maximum number of items. If there
+  // are more items in the dictionary, only the first max_item will be read.
+  // Returned value is the number of items successfully read from the file.
+  size_t read_raw_dict(const char* fn_raw, const char *fn_validhzs,
+                       size_t max_item);
+
+  // Try to find if a character is in hzs buffer.
+  bool hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, char16 hz);
+
+  // Try to find if all characters in str are in hzs buffer.
+  bool str_in_hanzis_list(const char16 *hzs, size_t hzs_len,
+                          const char16 *str, size_t str_len);
+
+  // Get the lemmas with the highest scores.
+  void get_top_lemmas();
+
+  // Allocate resource to build dictionary.
+  // lma_num is the number of items to be loaded
+  bool alloc_resource(size_t lma_num);
+
+  // Free resource.
+  void free_resource();
+};
+#endif  // ___BUILD_MODEL___
+}
+
+#endif  // PINYINIME_INCLUDE_DICTBUILDER_H__
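
The comment on sort_lemmas_by_hz() above explains that the lemma array is sorted by Hanzi string so that all items starting with a given prefix can be located for prediction (compare find_pos_startedbyhzs() in dictlist.h below). The following sketch shows that idea as a plain binary search over a sorted array of fixed-width char16 strings; the buffer layout and helper names are hypothetical and only illustrate the prefix lookup, not the project's actual buffers.

// Hypothetical prefix lookup over an array sorted by Hanzi string.
// Each entry is assumed to be a fixed-width char16 string of `width` chars.
#include <stdlib.h>
#include "./utf16char.h"

namespace ime_pinyin {

// Compare only the first prefix_len char16 values of two entries.
static int cmp_prefix(const char16 *a, const char16 *b, size_t prefix_len) {
  for (size_t i = 0; i < prefix_len; i++) {
    if (a[i] != b[i])
      return a[i] < b[i] ? -1 : 1;
  }
  return 0;
}

// Return the index of the first entry whose first prefix_len chars compare
// greater than or equal to the prefix (a lower-bound binary search).
// entries points to num items of width chars each.
static size_t find_first_ge(const char16 *entries, size_t num, size_t width,
                            const char16 *prefix, size_t prefix_len) {
  size_t lo = 0, hi = num;
  while (lo < hi) {
    size_t mid = lo + (hi - lo) / 2;
    if (cmp_prefix(entries + mid * width, prefix, prefix_len) < 0)
      lo = mid + 1;
    else
      hi = mid;
  }
  return lo;  // Candidates sharing this prefix start here, if any.
}

}  // namespace ime_pinyin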

PinyinIME/jni/include/dictdef.h

+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PINYINIME_INCLUDE_DICTDEF_H__
+#define PINYINIME_INCLUDE_DICTDEF_H__
+
+#include <stdlib.h>
+#include "./utf16char.h"
+
+namespace ime_pinyin {
+
+// Enable the following line when building the binary dictionary model.
+// #define ___BUILD_MODEL___
+
+typedef unsigned char      uint8;
+typedef unsigned short     uint16;
+typedef unsigned int       uint32;
+
+typedef signed char        int8;
+typedef short              int16;
+typedef int                int32;
+typedef long long          int64;
+typedef unsigned long long uint64;
+
+const bool kPrintDebug0 = false;
+const bool kPrintDebug1 = false;
+const bool kPrintDebug2 = false;
+
+// The max length of a lemma.
+const size_t kMaxLemmaSize = 8;
+
+// The max length of a Pinyin (spelling).
+const size_t kMaxPinyinSize = 6;
+
+// The number of half spelling ids. For Chinese Pinyin, there are 30 half ids.
+// See SpellingTrie.h for details.
+const size_t kHalfSpellingIdNum = 29;
+
+// The maximum number of full spellings. For Chinese Pinyin, there are only
+// about 410 spellings.
+// If this value is changed to be bigger (needing more bits), please also update
+// other structures like SpellingNode, to make sure that a spelling id can
+// still be stored.
+// The -1 is because 0 is never used.
+const size_t kMaxSpellingNum = 512 - kHalfSpellingIdNum - 1;
+const size_t kMaxSearchSteps = 40;
+
+// One character predicts its following characters.
+const size_t kMaxPredictSize = (kMaxLemmaSize - 1);
+
+// LemmaIdType must always be size_t.
+typedef size_t LemmaIdType;
+const size_t kLemmaIdSize = 3;  // Actually, an id occupies 3 bytes in storage.
+const size_t kLemmaIdComposing = 0xffffff;
+
+typedef uint16 LmaScoreType;
+typedef uint16 KeyScoreType;
+
+// Number of items with highest score are kept for prediction purpose.
+const size_t kTopScoreLemmaNum = 10;
+
+const size_t kMaxPredictNumByGt3 = 1;
+const size_t kMaxPredictNumBy3 = 2;
+const size_t kMaxPredictNumBy2 = 2;
+
+// The last lemma id (included) for the system dictionary. The system
+// dictionary's ids always start from 1.
+const LemmaIdType kSysDictIdEnd = 500000;
+
+// The first lemma id for the user dictionary.
+const LemmaIdType kUserDictIdStart = 500001;
+
+// The last lemma id (included) for the user dictionary.
+const LemmaIdType kUserDictIdEnd = 600000;
+
+typedef struct {
+  uint16 half_splid:5;
+  uint16 full_splid:11;
+} SpellingId, *PSpellingId;
+
+
+/**
+ * We use different node types for different layers
+ * Statistical data of the building result for a testing dictionary:
+ *                              root,   level 0,   level 1,   level 2,   level 3
+ * max son num of one node:     406        280         41          2          -
+ * max homo num of one node:      0         90         23          2          2
+ * total node num of a layer:     1        406      31766      13516        993
+ * total homo num of a layer:     9       5674      44609      12667        995
+ *
+ * The node number for root and level 0 won't be larger than 500.
+ * According to the information above, two kinds of nodes can be used: one for
+ * root and level 0, the other for those layers deeper than 0.
+ *
+ * LE = less than or equal.
+ * A node occupies 16 bytes, so in total less than 16 * 500 = 8K
+ */
+struct LmaNodeLE0 {
+  size_t son_1st_off;
+  size_t homo_idx_buf_off;
+  uint16 spl_idx;
+  uint16 num_of_son;
+  uint16 num_of_homo;
+};
+
+/**
+ * GE = greater than or equal
+ * A node occupies 8 bytes.
+ */
+struct LmaNodeGE1 {
+  uint16 son_1st_off_l;        // Low bits of the son_1st_off
+  uint16 homo_idx_buf_off_l;   // Low bits of the homo_idx_buf_off
+  uint16 spl_idx;
+  unsigned char num_of_son;            // number of son nodes
+  unsigned char num_of_homo;           // number of homo words
+  unsigned char son_1st_off_h;         // high bits of the son_1st_off
+  unsigned char homo_idx_buf_off_h;    // high bits of the homo_idx_buf_off
+};
+
+#ifdef ___BUILD_MODEL___
+struct SingleCharItem {
+  float freq;
+  char16 hz;
+  SpellingId splid;
+};
+
+struct LemmaEntry {
+  LemmaIdType idx_by_py;
+  LemmaIdType idx_by_hz;
+  char16 hanzi_str[kMaxLemmaSize + 1];
+
+  // The SingleCharItem id for each Hanzi.
+  uint16 hanzi_scis_ids[kMaxLemmaSize];
+
+  uint16 spl_idx_arr[kMaxLemmaSize + 1];
+  char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1];
+  unsigned char hz_str_len;
+  float freq;
+};
+#endif  // ___BUILD_MODEL___
+
+}  //  namespace ime_pinyin
+
+#endif  // PINYINIME_INCLUDE_DICTDEF_H__
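
kLemmaIdSize above states that a lemma id occupies 3 bytes in storage (lma_idx_buf_ in dicttrie.h is such a byte buffer). Below is a hedged sketch of one possible 3-byte packing; the actual byte order used by DictBuilder::id_to_charbuf() and DictTrie::get_lemma_id() is not shown in this commit, so treat these helpers as illustrative only.

// Hypothetical helpers for a kLemmaIdSize-byte (3-byte) lemma id layout.
#include "./dictdef.h"

namespace ime_pinyin {

// Pack the low 24 bits of id into buf, least significant byte first.
inline void id_to_bytes(unsigned char *buf, LemmaIdType id) {
  for (size_t i = 0; i < kLemmaIdSize; i++)
    buf[i] = static_cast<unsigned char>((id >> (i * 8)) & 0xff);
}

// Rebuild the id from the same 3-byte layout.
inline LemmaIdType bytes_to_id(const unsigned char *buf) {
  LemmaIdType id = 0;
  for (size_t i = 0; i < kLemmaIdSize; i++)
    id |= static_cast<LemmaIdType>(buf[i]) << (i * 8);
  return id;
}

}  // namespace ime_pinyin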

PinyinIME/jni/include/dictlist.h

+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PINYINIME_INCLUDE_DICTLIST_H__
+#define PINYINIME_INCLUDE_DICTLIST_H__
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "./dictdef.h"
+#include "./searchutility.h"
+#include "./spellingtrie.h"
+#include "./utf16char.h"
+
+namespace ime_pinyin {
+
+class DictList {
+ private:
+  bool initialized_;
+
+  const SpellingTrie *spl_trie_;
+
+  // Number of SingleCharItems. The first is blank, because id 0 is invalid.
+  size_t scis_num_;
+  char16 *scis_hz_;
+  SpellingId *scis_splid_;
+
+  // The large memory block to store the word list.
+  char16 *buf_;
+
+  // Starting position of those words whose lengths are i+1, counted in
+  // char16
+  size_t start_pos_[kMaxLemmaSize + 1];
+
+  size_t start_id_[kMaxLemmaSize + 1];
+
+  int (*cmp_func_[kMaxLemmaSize])(const void *, const void *);
+
+  bool alloc_resource(size_t buf_size, size_t scim_num);
+
+  void free_resource();
+
+#ifdef ___BUILD_MODEL___
+  // Calculate the requested memory, including the start_pos[] buffer.
+  size_t calculate_size(const LemmaEntry *lemma_arr, size_t lemma_num);
+
+  void fill_scis(const SingleCharItem *scis, size_t scis_num);
+
+  // Copy the related content to the inner buffer
+  // It should be called after calculate_size()
+  void fill_list(const LemmaEntry *lemma_arr, size_t lemma_num);
+
+  // Find the starting position for the buffer of those 2-character Chinese words
+  // whose first character is the given Chinese character.
+  char16* find_pos2_startedbyhz(char16 hz_char);
+#endif
+
+  // Find the starting position for the buffer of those words whose lengths are
+  // word_len. The given parameter cmp_func decides how many characters from
+  // the beginning will be used to compare.
+  char16* find_pos_startedbyhzs(const char16 last_hzs[],
+                                size_t word_Len,
+                                int (*cmp_func)(const void *, const void *));
+
+ public:
+
+  DictList();
+  ~DictList();
+
+  bool save_list(FILE *fp);
+  bool load_list(FILE *fp);
+
+#ifdef ___BUILD_MODEL___
+  // Init the list from the LemmaEntry array.
+  // lemma_arr should have been sorted by the hanzi_str, and have been given
+  // ids from 1
+  bool init_list(const SingleCharItem *scis, size_t scis_num,
+                 const LemmaEntry *lemma_arr, size_t lemma_num);
+#endif
+
+  // Get the hanzi string for the given id
+  uint16 get_lemma_str(LemmaIdType id_hz, char16 *str_buf, uint16 str_max);
+
+  void convert_to_hanzis(char16 *str, uint16 str_len);
+
+  void convert_to_scis_ids(char16 *str, uint16 str_len);
+
+  // last_hzs stores the last n Chinese characters of history; its length should
+  // be less than or equal to kMaxPredictSize.
+  // hzs_len specifies the length (<= kMaxPredictSize).
+  // npre_items is used to store the result.
+  // npre_max specifies the buffer length.
+  // b4_used specifies how many items before npre_items have been used.
+  // Returned value is the number of newly added items.
+  size_t predict(const char16 last_hzs[], uint16 hzs_len,
+                 NPredictItem *npre_items, size_t npre_max,
+                 size_t b4_used);
+
+  // If half_splid is a valid half spelling id, return those full spelling
+  // ids which share this half id.
+  uint16 get_splids_for_hanzi(char16 hanzi, uint16 half_splid,
+                              uint16 *splids, uint16 max_splids);
+
+  LemmaIdType get_lemma_id(const char16 *str, uint16 str_len);
+};
+}
+
+#endif  // PINYINIME_INCLUDE_DICTLIST_H__

PinyinIME/jni/include/dicttrie.h

+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PINYINIME_INCLUDE_DICTTRIE_H__
+#define PINYINIME_INCLUDE_DICTTRIE_H__
+
+#include <stdlib.h>
+#include "./atomdictbase.h"
+#include "./dictdef.h"
+#include "./dictlist.h"
+#include "./searchutility.h"
+
+namespace ime_pinyin {
+
+class DictTrie : AtomDictBase {
+ private:
+  typedef struct ParsingMark {
+    size_t node_offset:24;
+    size_t node_num:8;           // Number of nodes with this spelling id given
+                                 // by spl_id. If spl_id is a Shengmu, for nodes
+                                 // in the first layer of DictTrie, it equals to
+                                 // SpellingTrie::shm2full_num(); but for those
+                                 // nodes which are not in the first layer,
+                                 // node_num < SpellingTrie::shm2full_num().
+                                 // For a full spelling id, node_num = 1;
+  };
+
+  // Used to indicate an extended mile stone.
+  // An extended mile stone is used to mark a partial match in the dictionary
+  // trie to speed up further potential extending.
+  // For example, when the user inputs "w", a mile stone is created to mark the
+  // partial match status, so that when user inputs another char 'm', it will be
+  // faster to extend search space based on this mile stone.
+  //
+  // For partial match status of "wm", there can be more than one sub mile
+  // stone, for example, "wm" can be matched to "wanm", "wom", ..., etc, so
+  // there may be more than one parsing mark used to mark these partial matches.
+  // A mile stone records the starting position in the mark list and number of
+  // marks.
+  struct MileStone {
+    uint16 mark_start;
+    uint16 mark_num;
+  };
+
+  DictList* dict_list_;
+
+  const SpellingTrie *spl_trie_;
+
+  LmaNodeLE0* root_;        // Nodes for root and the first layer.
+  LmaNodeGE1* nodes_ge1_;   // Nodes for other layers.
+
+  // A quick index from spelling id to the LmaNodeLE0 node buffer, or
+  // to the root_ buffer.
+  // Index length:
+  // SpellingTrie::get_instance().get_spelling_num() + 1. The last one is used
+  // to get the end.
+  // All Shengmu ids are not indexed because they will be converted into
+  // corresponding full ids.
+  // So, given an id splid, the son is:
+  // root_[splid_le0_index_[splid - kFullSplIdStart]]
+  uint16 *splid_le0_index_;
+
+  size_t lma_node_num_le0_;
+  size_t lma_node_num_ge1_;
+
+  // The first part is for homophonies, and the last top_lmas_num_ items are
+  // lemmas with highest scores.
+  unsigned char *lma_idx_buf_;
+  size_t lma_idx_buf_len_;  // The total size of lma_idx_buf_ in byte.
+  size_t total_lma_num_;    // Total number of lemmas in this dictionary.
+  size_t top_lmas_num_;     // Number of lemma with highest scores.
+
+  // Parsing mark list used to mark the detailed extended statuses.
+  ParsingMark *parsing_marks_;
+  // The position for next available mark.
+  uint16 parsing_marks_pos_;
+
+  // Mile stone list used to mark the extended status.
+  MileStone *mile_stones_;
+  // The position for the next available mile stone. We use positions (except 0)
+  // as handles.
+  MileStoneHandle mile_stones_pos_;
+
+  // Get the offset of sons for a node.
+  inline size_t get_son_offset(const LmaNodeGE1 *node);
+
+  // Get the offset of homophonies' ids for a node.
+  inline size_t get_homo_idx_buf_offset(const LmaNodeGE1 *node);
+
+  // Get the lemma id by the offset.
+  inline LemmaIdType get_lemma_id(size_t id_offset);
+
+  void free_resource(bool free_dict_list);
+
+  bool load_dict(FILE *fp);
+
+  // Given a LmaNodeLE0 node, extract the lemmas specified by it, and fill
+  // them into the lpi_items buffer.
+  // This function is called by the search engine.
+  size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size,
+                         LmaNodeLE0 *node);
+
+  // Given a LmaNodeGE1 node, extract the lemmas specified by it, and fill
+  // them into the lpi_items buffer.
+  // This function is called by inner functions extend_dict0(), extend_dict1()
+  // and extend_dict2().
+  size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size,
+                         size_t homo_buf_off, LmaNodeGE1 *node,
+                         uint16 lma_len);
+
+  // Extend in the trie from level 0.
+  MileStoneHandle extend_dict0(MileStoneHandle from_handle,
+                               const DictExtPara *dep, LmaPsbItem *lpi_items,
+                               size_t lpi_max, size_t *lpi_num);
+
+  // Extend in the trie from level 1.
+  MileStoneHandle extend_dict1(MileStoneHandle from_handle,
+                               const DictExtPara *dep, LmaPsbItem *lpi_items,
+                               size_t lpi_max, size_t *lpi_num);
+
+  // Extend in the trie from level 2.
+  MileStoneHandle extend_dict2(MileStoneHandle from_handle,
+                               const DictExtPara *dep, LmaPsbItem *lpi_items,
+                               size_t lpi_max, size_t *lpi_num);
+
+  // Try to extend the given spelling id buffer, and if the given id_lemma can
+  // be successfully gotten, return true;
+  // The given spelling ids are all valid full ids.
+  bool try_extend(const uint16 *splids, uint16 splid_num, LemmaIdType id_lemma);
+
+#ifdef ___BUILD_MODEL___
+  bool save_dict(FILE *fp);
+#endif  // ___BUILD_MODEL___
+
+  static const int kMaxMileStone = 100;
+  static const int kMaxParsingMark = 600;
+  static const MileStoneHandle kFirstValidMileStoneHandle = 1;
+
+  friend class DictParser;
+  friend class DictBuilder;
+
+ public:
+
+  DictTrie();
+  ~DictTrie();
+
+#ifdef ___BUILD_MODEL___
+  // Construct the tree from the file fn_raw.
+  // fn_validhzs provides the valid hanzi list. If fn_validhzs is
+  // NULL, only chars in GB2312 will be included.
+  bool build_dict(const char *fn_raw, const char *fn_validhzs);
+
+  // Save the binary dictionary
+  // Actually, the SpellingTrie/DictList instance will also be saved.
+  bool save_dict(const char *filename);
+#endif  // ___BUILD_MODEL___
+
+  void convert_to_hanzis(char16 *str, uint16 str_len);
+
+  void convert_to_scis_ids(char16 *str, uint16 str_len);
+
+  // Load a binary dictionary
+  // The SpellingTrie instance/DictList will also be loaded
+  bool load_dict(const char *filename, LemmaIdType start_id,
+                 LemmaIdType end_id);
+  bool load_dict_fd(int sys_fd, long start_offset, long length,
+                    LemmaIdType start_id, LemmaIdType end_id);
+  bool close_dict() {return true;}
+  size_t number_of_lemmas() {return 0;}
+
+  void reset_milestones(uint16 from_step, MileStoneHandle from_handle);
+
+  MileStoneHandle extend_dict(MileStoneHandle from_handle,
+                              const DictExtPara *dep,
+                              LmaPsbItem *lpi_items,
+                              size_t lpi_max, size_t *lpi_num);
+
+  size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len,
+                  LmaPsbItem *lpi_items, size_t lpi_max);
+
+  uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);
+
+  uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
+                          uint16 splids_max, bool arg_valid);
+
+  size_t predict(const char16 *last_hzs, uint16 hzs_len,
+                 NPredictItem *npre_items, size_t npre_max,
+                 size_t b4_used);
+
+  LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[],
+                        uint16 lemma_len, uint16 count) {return 0;}
+
+  LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count,
+                           bool selected) {return 0;}
+
+  LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[],
+                           uint16 lemma_len) {return 0;}
+
+  LmaScoreType get_lemma_score(LemmaIdType lemma_id) {return 0;}
+
+  LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[],
+                        uint16 lemma_len) {return 0;}
+
+  bool remove_lemma(LemmaIdType lemma_id) {return false;}
+
+  size_t get_total_lemma_count() {return 0;}
+  void set_total_lemma_count_of_others(size_t count);
+
+  void flush_cache() {}
+
+  LemmaIdType get_lemma_id(const char16 lemma_str[], uint16 lemma_len);
+
+  // Fill the lemmas with highest scores to the prediction buffer.
+  // his_len is the history length to fill in the prediction buffer.
+  size_t predict_top_lmas(size_t his_len, NPredictItem *npre_items,
+                          size_t npre_max, size_t b4_used);
+};
+}
+
+#endif  // PINYINIME_INCLUDE_DICTTRIE_H__

PinyinIME/jni/include/lpicache.h

+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PINYINIME_ANDPY_INCLUDE_LPICACHE_H__
+#define PINYINIME_ANDPY_INCLUDE_LPICACHE_H__
+
+#include <stdlib.h>
+#include "./searchutility.h"
+#include "./spellingtrie.h"
+
+namespace ime_pinyin {
+
+// Used to cache LmaPsbItem list for half spelling ids.
+class LpiCache {
+ private:
+  static LpiCache *instance_;
+  static const int kMaxLpiCachePerId = 15;
+
+  LmaPsbItem *lpi_cache_;
+  uint16 *lpi_cache_len_;
+
+ public:
+  LpiCache();
+  ~LpiCache();
+
+  static LpiCache& get_instance();
+
+  // Test if the LPI list of the given splid has been cached.
+  // If splid is a full spelling id, it returns false, because we only cache
+  // list for half ids.
+  bool is_cached(uint16 splid);
+
+  // Put an LPI list into the cache. If the length of the list, lpi_num, is
+  // longer than the cache buffer, the list will be truncated, and the function
+  // returns the maximum length of the cache buffer.
+  // Note: splid must be a half id, and lpi_items must not be NULL. The
+  // caller of this function should guarantee this.
+  size_t put_cache(uint16 splid, LmaPsbItem lpi_items[], size_t lpi_num);
+
+  // Get the cached list for the given half id.
+  // Return the length of the cached buffer.
+  // Note: splid must be a half id, and lpi_items must not be NULL. The
+  // caller of this function should guarantee this.
+  size_t get_cache(uint16 splid, LmaPsbItem lpi_items[], size_t lpi_max);
+};
+
+}  // namespace
+
+#endif  // PINYINIME_ANDPY_INCLUDE_LPICACHE_H__
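
A short hypothetical sketch of the cache discipline described above: candidates for a half spelling id are served from the cache when present and written back after a fresh lookup. The compute_lpis callback is an assumption standing in for the real dictionary lookup done by the search code.

// Hypothetical sketch of using LpiCache for half spelling ids.
#include "./lpicache.h"

namespace ime_pinyin {

// compute_lpis is a stand-in for the real dictionary lookup.
typedef size_t (*ComputeLpisFunc)(uint16 splid, LmaPsbItem *items, size_t max);

size_t lpis_for_half_id(uint16 splid, LmaPsbItem *items, size_t max,
                        ComputeLpisFunc compute_lpis) {
  LpiCache &cache = LpiCache::get_instance();
  if (cache.is_cached(splid))
    return cache.get_cache(splid, items, max);

  size_t num = compute_lpis(splid, items, max);
  if (num > 0)
    cache.put_cache(splid, items, num);  // May truncate to the cache capacity.
  return num;
}

}  // namespace ime_pinyin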

PinyinIME/jni/include/matrixsearch.h

+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__
+#define PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__
+
+#include <stdlib.h>
+#include "./atomdictbase.h"
+#include "./dicttrie.h"
+#include "./searchutility.h"
+#include "./spellingtrie.h"
+#include "./splparser.h"
+
+namespace ime_pinyin {
+
+static const size_t kMaxRowNum = kMaxSearchSteps;
+
+typedef struct {
+  // MileStoneHandle objects for the system and user dictionaries.
+  MileStoneHandle dict_handles[2];
+  // From which DMI node. -1 means it's from root.
+  PoolPosType dmi_fr;
+  // The spelling id for the Pinyin string from the previous DMI to this node.
+  // If it is a half id like a Shengmu, the node pointed to by dict_node is the
+  // first node with this Shengmu.
+  uint16 spl_id;
+  // What's the level of the dict node. Level of root is 0, but root is never
+  // recorded by dict_node.
+  unsigned char dict_level:7;
+  // If this node is for composing phrase, this bit is 1.
+  unsigned char c_phrase:1;
+  // Whether the spl_id is parsed with a split character at the end.
+  unsigned char splid_end_split:1;
+  // What's the length of the spelling string for this match, for the whole
+  // word.
+  unsigned char splstr_len:7;
+  // Used to indicate whether all spelling ids from the root are full spelling
+  // ids. This information is useful for keymapping mode (not finished). Because
+  // there are no clear boundaries in this mode, we prefer those results which
+  // have full spelling ids.
+  unsigned char all_full_id:1;
+} DictMatchInfo, *PDictMatchInfo;
+
+typedef struct MatrixNode {
+  LemmaIdType id;
+  float score;
+  MatrixNode *from;
+  // From which DMI node. Used to trace the spelling segmentation.
+  PoolPosType dmi_fr;