Source

shav / shav.py

#!/usr/bin/env python3

# shav.py, script for converting english text to shavian via cmudict

# for compatibility with 2.6 and 3.x
from __future__ import print_function, unicode_literals
try:
    input = raw_input
except NameError:
    pass

from shav_data import word_sounds_map, sound_shav_map, word_shav_map, \
str_ligature_map, rotation_map, vert_reflection_map, horiz_reflection_map
import sys
import string

def latin_to_shav(text):
    """Transliterate a string of Latin-script English text into Shavian.

    Every maximal run of ASCII letters and apostrophes is treated as one
    word and passed through word_to_shav; every other character (spaces,
    digits, punctuation, newlines, ...) is copied to the output unchanged.
    """
    word_chars = string.ascii_letters + "'"
    pieces = []
    start = None  # index where the current word began; None between words
    for pos, char in enumerate(text):
        if char in word_chars:
            if start is None:
                start = pos
            continue
        # a non-word character ends any word in progress
        if start is not None:
            pieces.append(word_to_shav(text[start:pos]))
            start = None
        pieces.append(char)
    # flush a word that runs up to the end of the text
    if start is not None:
        pieces.append(word_to_shav(text[start:]))
    return "".join(pieces)

def word_to_shav(input_word, ligatures=True):
    """Transliterate a single English word into Shavian.

    Lookup order:
      1. A leading or trailing apostrophe is peeled off, the remainder is
         transliterated, and the apostrophe is re-attached.
      2. A direct entry in word_shav_map is returned as-is (no ligature
         processing is applied to it).
      3. The pronunciation from word_sounds_map is converted sound by
         sound through sound_shav_map.
      4. If the word has no pronunciation, it is retried with apostrophes
         removed, and then as two dictionary words joined by "-".

    input_word -- the word to convert; it is upper-cased for lookups.
    ligatures  -- when true, every letter sequence that is a key of
                  str_ligature_map is replaced by its ligature.  The flag
                  is honoured on every recursive path as well.

    Returns the Shavian spelling, or input_word unchanged when no
    pronunciation can be found or a sound has no Shavian mapping.
    """
    word = input_word.upper()
    if word.startswith("'"):
        return "'" + word_to_shav(input_word[1:], ligatures)
    if word.endswith("'"):
        return word_to_shav(input_word[:-1], ligatures) + "'"
    if word in word_shav_map:
        return word_shav_map[word]

    try:
        sounds = word_sounds_map[word]
    except KeyError:
        # to catch possessive forms whose plural form is in the dictionary
        if "'" in word:
            without_quote = word.replace("'", "")
            guess = word_to_shav(without_quote, ligatures)
            # a failed lookup hands back its input untouched, so any
            # difference means the apostrophe-free form was found
            if guess != without_quote:
                return guess
        # detecting compound words which are not in the dictionary;
        # both halves must be at least two letters long
        for i in range(2, len(word) - 1):
            head, tail = word[:i], word[i:]
            if head in word_sounds_map and tail in word_sounds_map:
                return (word_to_shav(head, ligatures) + "-" +
                        word_to_shav(tail, ligatures))
        return input_word

    # Strip the trailing stress digit from each sound.  AH0 and ER0 keep
    # theirs -- presumably sound_shav_map has dedicated entries for those
    # unstressed forms (verify against shav_data).
    sounds = sounds.split()
    for i, sound in enumerate(sounds):
        if sound not in ("AH0", "ER0") and sound[-1] in '0123456789':
            sounds[i] = sound[:-1]

    shav_letters = []
    for sound in sounds:
        try:
            letter = sound_shav_map[sound]
        except KeyError:
            # an unmapped sound makes the whole word untranslatable
            return input_word
        shav_letters.append(letter)

    shav_word = "".join(shav_letters)
    # postprocessing for ligatures
    if ligatures:
        for pair, ligature in str_ligature_map.items():
            shav_word = shav_word.replace(pair, ligature)

    return shav_word

def letter_pairs(word):
    """Yield consecutive two-item slices of word, left to right.

    The final slice holds a single item when len(word) is odd.
    """
    start = 0
    total = len(word)
    while start < total:
        yield word[start:start + 2]
        start += 2

def smp_list(word):
    """Split a string of supplementary-plane characters into a list.

    Takes a string whose characters all lie outside the Basic Multilingual
    Plane and returns a list with one entry per letter, regardless of how
    the interpreter represents such characters internally.

    A list is returned on every build, so callers can compare the result
    with other lists; previously wide builds handed back the string itself.
    """
    if sys.maxunicode > 0xFFFF:
        # wide build: each letter is already a single code point
        return list(word)
    # narrow build: each letter is stored as a two-unit surrogate pair
    return list(letter_pairs(word))

def shav_letters_only(text):
    """Return text with everything but supplementary-plane characters removed.

    A character is kept when its code point is U+10000 or above (wide
    builds) or when it is a surrogate code unit U+D800..U+DFFF (the halves
    of such a character on narrow builds).  All other Basic Multilingual
    Plane characters are dropped -- including high ones such as U+FEFF or
    U+FFFD, which the previous bare threshold of 55204 let through.
    """
    def is_supplementary(char):
        code = ord(char)
        return code >= 0x10000 or 0xD800 <= code <= 0xDFFF

    return "".join(char for char in text if is_supplementary(char))

def is_vert_reflected_palindrome(word):
    """Tell whether word is a palindrome under vert_reflection_map.

    Only the characters kept by shav_letters_only are considered.  The
    word qualifies when each letter equals the vert_reflection_map image
    of the letter at the mirrored position.  A letter with no entry in
    vert_reflection_map makes the answer False.
    """
    letters = smp_list(shav_letters_only(word))
    for letter, opposite in zip(letters, reversed(letters)):
        try:
            expected = vert_reflection_map[opposite]
        except KeyError:
            return False
        if letter != expected:
            return False
    return True

def is_rotated_palindrome(word):
    """Tell whether word is a palindrome under rotation_map.

    Only the characters kept by shav_letters_only are considered.  The
    word qualifies when each letter equals the rotation_map image of the
    letter at the mirrored position.  A letter with no entry in
    rotation_map makes the answer False.
    """
    letters = smp_list(shav_letters_only(word))
    for letter, opposite in zip(letters, reversed(letters)):
        try:
            expected = rotation_map[opposite]
        except KeyError:
            return False
        if letter != expected:
            return False
    return True

def is_palindrome(word):
    """Tell whether the Shavian letters of word read the same both ways.

    Only the characters kept by shav_letters_only are considered.
    """
    # smp_list may hand back a str on wide builds; a str never compares
    # equal to a list, so coerce to a list before comparing
    letters = list(smp_list(shav_letters_only(word)))
    return letters == letters[::-1]

if __name__ == '__main__':
    # Without any pronunciation data nothing can be transliterated, so
    # exit with a failure status.  No message is printed here.
    if not word_sounds_map:
        sys.exit(1)
    if sys.argv[1:]: # transliterate files from command line
        for filename in sys.argv[1:]:
            # 'with' closes the file even if transliteration or printing
            # raises; iterating the file avoids loading it all at once
            with open(filename) as f:
                for line in f:
                    # lines keep their own newline, so suppress print's
                    print(latin_to_shav(line), end="")
    else: # transliterate text from stdin
        try:
            while 1:
                line = input()
                print(latin_to_shav(line))
        except EOFError:
            # end of input is the normal way to finish
            sys.exit(0)