codeeval_challenges / 055_TypeAhead / type_ahead.py

#! /usr/bin/env python

import re
from optparse import OptionParser

DEFAULT_CORPUS = """Mary had a little lamb its fleece was white as snow;
And everywhere that Mary went, the lamb was sure to go. 
It followed her to school one day, which was against the rule;
It made the children laugh and play, to see a lamb at school.
And so the teacher turned it out, but still it lingered near,
And waited patiently about till Mary did appear.
"Why does the lamb love Mary so?" the eager children cry;"Why,
Mary loves the lamb, you know" the teacher did reply."
"""

class NgramModel(object):
    """ This class implements the n-gram algorithm to predict
        which word will most like follow or preceed a sequence of
        words. The algorithm uses a text corpus as training data 
        and predicts the next word based on this corpus. The corpus 
        can only contain alphanumeric characters all other characters 
        are strip prior to train.

        Example:
        >>> import type_ahead
        >>> ngram = type_ahead.NgramModel(type_ahead.DEFAULT_CORPUS)
        >>> ngram.set_length(2)
        >>> ngram.predict('the')
        [('lamb', 0.375), ('teacher', 0.25), ('children', 0.125), ('eager', 0.125), ('rule', 0.125)]
    """

    STRIP_PATTERN = re.compile(r'[\W_]+')

    def __init__(self, corpus, backwards=False):
        """ Construct a new instance of the ngram model. The text 
            corpus is provided in the corpus parameter and expects
            a string objects. By default, the n-gram model predicts
            the word following a specified sequence, to predict 
            the word preceeding the sequence backwards is to be set
            to True.
        """
        super(NgramModel, self).__init__()
        self.corpus = [] 
        self.ngrams = []
        self.backwards = backwards
        self.length = -1

        self.corpus = self._prepare_text(corpus)

    def set_length(self, length):
        """ Set the length of n-grams to be used in the prediction and
            has to be set prior to predicting to initialse the model.
            The parameter length is expected to be an integer value. 
        """
        self.length = length

        for idx in xrange(len(self.corpus)-self.length+1):
            self.ngrams.append(
                self.corpus[idx:idx+self.length]
            )

    def predict(self, typed):
        """ Predicts the next word following the sequence of words in 
            typed. typed has to be a string object and will run on 
            n-grams as initialised using set_length.
            Returns a list of (word, probability) tuples in descending
                order of probability.
        """
        typed = self._prepare_text(typed)

        if self.backwards:
            prediction_idx = 0 
        else:
            prediction_idx = -1

        stats = {} 
        for ngram in self.ngrams:
            ngram = list(ngram)

            word = ngram.pop(prediction_idx)

            if typed == ngram:
                if word not in stats:
                    stats[word] = 0

                stats[word] += 1

        total = sum(stats.values())

        if total == 0:
            return []

        predictions = []
        for key, value in stats.items():
            predictions.append((
                    key, 
                    float(value)/float(total)
            ))

        predictions.sort(cmp=self._compare_word_statistic) 

        return predictions

    @staticmethod
    def _compare_word_statistic(one, other):
        """ Compare two (word, probability) tuples. The tuples are
            compared according to probability. 
            Returns 1 if other's probability > one's probability and 
                -1 when other's probability < one's probability. If 
                probabilities are equal, word's alphabetical order 
                is returned.
        """
        if one[1] < other[1]:
            return 1
        elif one[1] > other[1]:
            return -1
        else:
            return cmp(one[0], other[0])

    @classmethod
    def _prepare_text(cls, text):
        """ Stips all non-alphanumeric characters from the string 
            in text using regular expressions. 
            Returns pure alphanumeric string.
        """
        text = text.strip()
        
        prepared_text = []
        for word in text.split():
            prepared_text.append(
                cls.STRIP_PATTERN.sub('', word)
            )
        
        return prepared_text


def main():
    parser = OptionParser('%s testcase.txt' % __file__)

    parser.add_option(
        '-c', '--corpus-file',
        help="Specify custom text corpus provided in CORPUS_FILE.",
    )
    parser.add_option(
        '-b', '--backwards',
        action="store_true",
        default=False,
        help="Predict the preceeding word in the Ngram."
    )
    options, args = parser.parse_args()

    ## Throw an error when now input file is specified as first 
    ## argument
    if len(args) < 1:
        parser.error("No input file provided.")

    corpus = DEFAULT_CORPUS

    ## check if corpus file was defined as commandline option 
    ## and load text if available
    if options.corpus_file:
        try:
            lines = open(options.corpus_file, 'r').readlines()
        except IOError:
            parser.error("Cannot read from corpus file. Please check path.")
        finally:
            corpus = ''.join(lines)

    ngrams = NgramModel(corpus, options.backwards)

    test_cases = open(args[0], 'r')

    for test in test_cases:
        test = test.strip()

        if test != '':
            length, typed = test.split(',')

            ngrams.set_length(int(length))

            output = []
            for word, prob in ngrams.predict(typed):
                output.append(
                    '%s,%.3f' % (word, prob)
                )

            print ';'.join(output)

    test_cases.close()


if __name__ == "__main__":
    main()
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.