Source

gpalign-cpp / src / generator.cpp

Full commit
//--------------------------------------------------------------------------//
// generator.cpp
// Lars Yencken <lars.yencken@gmail.com>
// vim: ts=4 sw=4 sts=4 expandtab:
// Sun Oct  7 02:30:34 EST 2007
//--------------------------------------------------------------------------//

#include "generator.hpp"
#include "scripts.hpp"
#include "partialMatch.hpp"
#include "stringOps.hpp"
#include "japanese.hpp"

#include <iostream>

using namespace std;

//--------------------------------------------------------------------------//

typedef vector<Grapheme>  GraphemeList;

/**
 * A wide output stream operator for debugging output.
 */
wostream& operator<<(wostream& output, const GraphemeList& slots)
{
    if (slots.size() == 0) {
        output << L"<no slots>";
        return output;
    }

    for (unsigned int i = 0; i < slots.size(); i++) {
        if (i > 0) {
            output << L"|";
        }
        output << slots[i];
    }
    return output;
}

//--------------------------------------------------------------------------//

/**
 * Extends the basic list of grapheme slots with extra candidates. The basic
 * slots are driven by script boundaries.
 */
void _buildGraphemeSlots(
        const Grapheme&             grapheme,
        vector<GraphemeList>&       graphemeAlignments
    );

/**
 * Attempts to fill each set of grapheme slots with phonemes, in each possible
 * way. When this is not possible, the set of slots is discarded.
 */
void _fillGraphemeSlots(
        const vector<GraphemeList>& graphemeAlignments,
        const Phoneme&              phoneme,
        vector<Alignment>&          alignments
    );

int _countGraphemeVariants(const vector<ScriptSeg>& boundarySlots);

void _buildGraphemeVariants(
        int                         nVariants,
        const vector<ScriptSeg>&    boundarySlots,
        vector<GraphemeList>&       graphemeAlignments
    );

//--------------------------------------------------------------------------//

void potentialAlignments(const Segment& s, vector<Alignment>& alignments)
{
    alignments.clear();

    // Add any variations to these grapheme slots.
    vector<GraphemeList> graphemeAlignments;
    _buildGraphemeSlots(s.g, graphemeAlignments);

    // Align each set of grapheme slots with phonemes. 
    _fillGraphemeSlots(graphemeAlignments, s.p, alignments);

    pruneAlignments(alignments);

    return;
}

//--------------------------------------------------------------------------//

void _buildGraphemeSlots(
        const Grapheme&             grapheme,
        vector<GraphemeList>&       graphemeAlignments
    )
{
    //wcout << grapheme << endl;
    graphemeAlignments.clear();

    // Determine the natural "slots" formed by the script boundaries.
    vector<ScriptSeg> boundarySlots;
    scriptBoundaries(grapheme, boundarySlots);

    // Count the number of optional boundaries.
    const int nVariants = _countGraphemeVariants(boundarySlots);

    if (nVariants == 1) {
        GraphemeList graphemeSlots;
        // One unique alignment.
        for (vector<ScriptSeg>::iterator i = boundarySlots.begin();
                i != boundarySlots.end(); i++) {
            graphemeSlots.push_back(i->second);
        }
        //wcout << L"--> " << join(graphemeSlots, L"|") << endl;
        graphemeAlignments.push_back(graphemeSlots);
    } else {
        // Many possible alignments.
        _buildGraphemeVariants(nVariants, boundarySlots, graphemeAlignments);
    }
    return;
}

//--------------------------------------------------------------------------//

void _buildGraphemeVariants(
        int                         nVariants,
        const vector<ScriptSeg>&    boundarySlots,
        vector<GraphemeList>&       graphemeAlignments
    )
{
    // Initialize each copy separately, and its remainder.
    vector<int> remainders;
    {
        GraphemeList graphemeSlots;
        remainders.reserve(nVariants);
        for (int i = 0; i < nVariants; i++) {
            graphemeAlignments.push_back(graphemeSlots);
            remainders.push_back(i);
        }
    }
    for (vector<ScriptSeg>::const_iterator j = boundarySlots.begin();
            j != boundarySlots.end(); j++) {
        const Script& script = j->first;
        const Grapheme& current = j->second;

        if (script != Script__Kanji || current.size() == 1) {
            // Each variant is the same for this segment.
            for (int i = 0; i < nVariants; i++) {
                graphemeAlignments[i].push_back(current);
            }
            continue;
        }

        // Each is different. Add the first grapheme character.
        for (int i = 0; i < nVariants; i++) {
            graphemeAlignments[i].push_back(current.substr(0, 1));
        }

        const int currSize = current.size();
        for (int gIndex = 1; gIndex < currSize; gIndex++) {
            // New segment or same segment?
            for (int i = 0; i < nVariants; i++) {
                bool sameSegment = (bool) (remainders[i] % 2);
                remainders[i] /= 2;

                GraphemeList& alignment = graphemeAlignments[i];
                if (sameSegment) {
                    alignment.back().push_back(current[gIndex]);
                } else {
                    alignment.push_back(current.substr(gIndex, 1));
                }
            }
        }
    }

    return;
}

//--------------------------------------------------------------------------//

int _countGraphemeVariants(const vector<ScriptSeg>& boundarySlots)
{
    int nVariants = 1;
    for (vector<ScriptSeg>::const_iterator i = boundarySlots.begin();
            i != boundarySlots.end(); i++) {
        if (i->first == Script__Kanji) {
            const int nExtras = i->second.size() - 1;
            for (int j = 0; j < nExtras; j++) {
                nVariants *= 2;
            }
        }
    }

    return nVariants;
}

//--------------------------------------------------------------------------//

void _fillGraphemeSlots(
        const vector<GraphemeList>& graphemeAlignments,
        const Phoneme&              phoneme,
        vector<Alignment>&          alignments
    )
{
    alignments.clear();

    vector<PartialMatch> lastMatches;
    vector<PartialMatch> currentMatches;

    for (vector<GraphemeList>::const_iterator i = graphemeAlignments.begin();
            i != graphemeAlignments.end(); i++) {
        const GraphemeList& graphemeSlots = *i;
        lastMatches.clear();
        lastMatches.push_back(PartialMatch(graphemeSlots.size(), phoneme));

        for (GraphemeList::const_iterator g = graphemeSlots.begin();
                g != graphemeSlots.end(); g++) {
            currentMatches.clear();
            for (vector<PartialMatch>::iterator match = lastMatches.begin();
                    match != lastMatches.end(); match++) {
                match->expand(*g, currentMatches);
            }
            lastMatches = currentMatches;
        }

        Alignment a;
        for (vector<PartialMatch>::iterator match = lastMatches.begin();
                match != lastMatches.end(); match++) {
            a.clear();
            const int nSlots = graphemeSlots.size();
            for (int j = 0; j < nSlots; j++) {
                a.push_back(Segment(graphemeSlots[j], match->m_slots[j]));
            }
            alignments.push_back(a);
        }
    }
    return;
}

//--------------------------------------------------------------------------//