Source

gpalign-cpp / src / scripts.cpp

Full commit
//--------------------------------------------------------------------------//
// scripts.cpp
// Lars Yencken <lars.yencken@gmail.com>
// vim: ts=4 sw=4 sts=4 expandtab:
// Sun Oct  7 02:00:17 EST 2007
//--------------------------------------------------------------------------//

#include "scripts.hpp"

//--------------------------------------------------------------------------//

// Classifies a single wide character by the code-point range it falls in.
//
// Inclusive ranges tested:
//   0x0021-0x00ff            -> Script__Ascii
//   0x3041-0x3096            -> Script__Hiragana
//   0x30a1-0x30f6 and 0x30fc -> Script__Katakana
//   0x4e00-0x9fa5            -> Script__Kanji
//   0xff01-0xff5f            -> Script__FullAscii
// Any other value is reported as Script__Unknown.
Script scriptType(wchar_t c)
{
    if (0x0021 <= c && c <= 0x00ff) {
        return Script__Ascii;
    }
    if (0x3041 <= c && c <= 0x3096) {
        return Script__Hiragana;
    }
    // 0x30fc lies outside the contiguous katakana range but is still
    // treated as katakana.
    if (c == 0x30fc || (0x30a1 <= c && c <= 0x30f6)) {
        return Script__Katakana;
    }
    if (0x4e00 <= c && c <= 0x9fa5) {
        return Script__Kanji;
    }
    if (0xff01 <= c && c <= 0xff5f) {
        return Script__FullAscii;
    }
    return Script__Unknown;
}

//--------------------------------------------------------------------------//

// Classifies a string by the script of its first character only.
// An empty string has no first character and yields Script__Unknown.
Script scriptType(const wstring& s)
{
    if (s.empty()) {
        return Script__Unknown;
    }
    return scriptType(s[0]);
}

//--------------------------------------------------------------------------//

// Collects the distinct scripts that occur anywhere in s.
//
// scripts is emptied first, so on return it holds exactly the set of
// scriptType() values of the characters of s (empty for an empty string).
void scriptTypes(const wstring& s, set<Script>& scripts)
{
    scripts.clear();

    const wstring::size_type count = s.size();
    for (wstring::size_type pos = 0; pos < count; ++pos) {
        scripts.insert(scriptType(s[pos]));
    }
}

//--------------------------------------------------------------------------//

void scriptBoundaries(const wstring& s, vector<ScriptSeg>& results)
{
    const int sSize = s.size();
    results.clear();
    if (sSize == 0) {
        return;
    }

    Script lastScript = scriptType(s[0]);

    int i = 1, startSeg = 0;
    Script script;
    for (i = 1; i < sSize; i++) {
        script = scriptType(s[i]);
        if (script != lastScript) {
            results.push_back(make_pair(lastScript, s.substr(startSeg,
                    i - startSeg)));
            startSeg = i;
            lastScript = script;
        }
    }
    results.push_back(make_pair(lastScript, s.substr(startSeg)));
    return;
}

//--------------------------------------------------------------------------//

// Reports whether at least one character of s is classified as script.
// Scanning stops at the first match; an empty string never matches.
bool containsScript(const wstring& s, Script script)
{
    wstring::const_iterator it = s.begin();
    const wstring::const_iterator last = s.end();
    while (it != last && scriptType(*it) != script) {
        ++it;
    }
    return it != last;
}

//--------------------------------------------------------------------------//

// Code-point offset between a hiragana character and its katakana
// counterpart: first katakana (0x30a1) minus first hiragana (0x3041) == 96.
const int _interKanaDistance = 0x30a1 - 0x3041;
 
//--------------------------------------------------------------------------//

// Maps a katakana character to hiragana by subtracting the fixed
// inter-kana offset. Every other character is returned unchanged.
wchar_t toHiragana(wchar_t c)
{
    // 0x30fc is classified as katakana but must not be shifted, so it is
    // passed through along with all non-katakana input.
    if (c == 0x30fc || scriptType(c) != Script__Katakana) {
        return c;
    }
    return c - _interKanaDistance;
}

//--------------------------------------------------------------------------//

wstring toHiragana(const wstring& s) 
{
    wstring result;
    for (wstring::const_iterator i = s.begin(); i != s.end(); i++) {
        result.push_back(toHiragana(*i));
    }

    return result;
}

//--------------------------------------------------------------------------//

// Maps a hiragana character to katakana by adding the fixed inter-kana
// offset. Every other character is returned unchanged.
wchar_t toKatakana(wchar_t c)
{
    if (scriptType(c) != Script__Hiragana) {
        return c;
    }
    return c + _interKanaDistance;
}

//--------------------------------------------------------------------------//

wstring toKatakana(const wstring& s)
{
    wstring result;
    for (wstring::const_iterator i = s.begin(); i != s.end(); i++) {
        result.push_back(toKatakana(*i));
    }
    return result;
}

//--------------------------------------------------------------------------//

// Lookup table of the hiragana characters that isSmallKana() treats as
// "small", stored as a null-terminated wide string.
const wchar_t* _smallKana = L"っゃゅょぁぃぅぇぉ";

// Number of entries in _smallKana, excluding the terminator. Computed once
// at start-up; the cast makes the size_t -> int conversion explicit.
const int _smallKanaLen = static_cast<int>(wcslen(_smallKana));

// Reports whether c is one of the characters listed in _smallKana.
// The input is first converted with toHiragana(), so the katakana forms of
// the listed characters match as well.
bool isSmallKana(wchar_t c)
{
    const wchar_t folded = toHiragana(c);

    // Walk only the _smallKanaLen real entries; the terminating null is
    // deliberately not part of the search range.
    const wchar_t* const tableEnd = _smallKana + _smallKanaLen;
    for (const wchar_t* entry = _smallKana; entry != tableEnd; ++entry) {
        if (*entry == folded) {
            return true;
        }
    }
    return false;
}

//--------------------------------------------------------------------------//

// Counts the characters of s for which isSmallKana() is false.
// Note that this counts every such character, not only kana.
unsigned int bigKanaLen(const wstring& s)
{
    unsigned int count = 0;
    const wstring::size_type total = s.size();
    for (wstring::size_type pos = 0; pos < total; ++pos) {
        if (isSmallKana(s[pos])) {
            continue;
        }
        ++count;
    }
    return count;
}

//--------------------------------------------------------------------------//

// Counts the characters of s that scriptType() classifies as
// Script__Kanji.
unsigned int kanjiLen(const wstring& s)
{
    unsigned int count = 0;
    const wstring::size_type total = s.size();
    for (wstring::size_type pos = 0; pos < total; ++pos) {
        if (scriptType(s[pos]) != Script__Kanji) {
            continue;
        }
        ++count;
    }
    return count;
}

//--------------------------------------------------------------------------//