Commits

Ivan Vučica  committed a691580

Code dump

  • Participants

Comments (0)

Files changed (3)

+Copyright (c) 2013, Ivan Vučica
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+Katakanizer
+===========
+
+Small script to convert Croatian writing (which is phonetic) into 
+similar-sounding katakana.
+
+Primarily written for playing with text-to-speech engines, which traditionally
+have very poor support for Croatian (but there are many engines that support
+Japanese very well). Since it's easier to convert from one phonetic writing
+system to another, Japanese was an obvious target language.
+
+Initially I converted to hiragana, since it doesn't really matter which
+script one converts to when it comes to these engines. But, katakana is what
+foreign words are supposed to be written in, so the hiraganized string is
+converted into katakana.
+
+Spaces are intentionally not converted into middle dots (・), since that breaks
+at least Google's TTS engine (it inserts too many pauses).
+
+Again, it mostly works for the Croatian writing system. It may or may not work
+even for Croatian, much less any other writing system. This is a toy project.
+
+(c) 2013 Ivan Vučica. See LICENSE.md for license information.
+<?php error_reporting(E_ALL); header("Content-type: text/html; charset=utf-8"); ?>
+
+<?php /* Please see LICENSE.md for license details. */ ?>
+
+<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8"></head><body>
+
+<form action="?" method="post">
+<?php
+/*
+ * Text to show again in the textarea after a submit.
+ *
+ * The value comes straight from the request, so it is checked explicitly
+ * (instead of silencing the missing-index notice with "@") and HTML-escaped
+ * below; otherwise a submitted "</textarea><script>..." would be injected
+ * into the page.
+ *
+ * NOTE(review): stripslashes() is kept from the original, presumably for
+ * servers running with magic_quotes_gpc; where magic quotes do not exist it
+ * also removes literal backslashes typed by the user -- confirm whether it is
+ * still wanted.
+ */
+$submitted = (isset($_POST["text"]) && is_string($_POST["text"])) ? stripslashes($_POST["text"]) : "";
+?>
+<textarea name="text"><?=htmlspecialchars($submitted, ENT_QUOTES, "UTF-8")?></textarea>
+<input type="submit">
+</form>
+<?php
+
+/*
+ * Transliteration table: Croatian (Latin) letter sequences => hiragana.
+ *
+ * Keys are lower-case. repl() applies the entries grouped by key length,
+ * longest first, so "dža" or "nja" win over "d"/"ž"/"n"/"ja". The hiragana
+ * produced here is turned into katakana at the very end by hira2kata().
+ *
+ * Convention used throughout: a consonant that is not followed by a vowel
+ * falls back to its "-u" syllable (except "m" and "n", which become "ん").
+ *
+ * Entries are kept in their original insertion order because repl() walks
+ * the table in that order.
+ */
+$map = array();
+
+// Plain vowels. ("a" is hiragana like every other entry; the final katakana
+// conversion yields the same "ア".)
+$map["a"] = "あ";
+$map["i"] = "い";
+$map["u"] = "う";
+$map["e"] = "え";
+$map["o"] = "お";
+
+$map["ka"] = "か";
+$map["ki"] = "き";
+$map["ku"] = "く";
+$map["ke"] = "け";
+$map["ko"] = "こ";
+$map["k"] = $map["ku"];
+
+$map["ga"] = "が";
+$map["gi"] = "ぎ";
+$map["gu"] = "ぐ";
+$map["ge"] = "げ";
+$map["go"] = "ご";
+$map["g"] = $map["gu"];
+
+$map["sa"] = "さ";
+$map["si"] = "し";
+$map["su"] = "す";
+$map["se"] = "せ";
+$map["so"] = "そ";
+$map["s"] = $map["su"];
+
+$map["za"] = "ざ";
+$map["zi"] = "じ";
+$map["zu"] = "ず";
+$map["ze"] = "ぜ";
+$map["zo"] = "ぞ";
+$map["z"] = $map["zu"];
+
+$map["ša"] = "しゃ";
+$map["ši"] = "し";
+$map["šu"] = "しゅ";
+$map["še"] = "しぇ";
+$map["šo"] = "しょ";
+$map["š"] = $map["šu"];
+
+$map["ža"] = "じゃ";
+$map["ži"] = "じ";
+$map["žu"] = "じゅ";
+$map["že"] = "じぇ";
+$map["žo"] = "じょ";
+$map["ž"] = $map["žu"];
+
+$map["ha"] = "は";
+$map["hi"] = "ひ";
+$map["hu"] = "ふ";
+$map["he"] = "へ";
+$map["ho"] = "ほ";
+$map["h"] = $map["hu"];
+
+$map["ba"] = "ば";
+$map["bi"] = "び";
+$map["bu"] = "ぶ";
+$map["be"] = "べ";
+$map["bo"] = "ぼ";
+$map["b"] = $map["bu"];
+
+$map["pa"] = "ぱ";
+$map["pi"] = "ぴ";
+$map["pu"] = "ぷ";
+$map["pe"] = "ぺ";
+$map["po"] = "ぽ";
+$map["p"] = $map["pu"];
+
+$map["ta"] = "た";
+$map["ti"] = "ち";
+$map["tu"] = "つ";
+$map["te"] = "て";
+$map["to"] = "と";
+$map["t"] = $map["tu"];
+
+$map["da"] = "だ";
+$map["di"] = "ぢ";
+$map["du"] = "づ";
+$map["de"] = "で";
+$map["do"] = "ど";
+$map["d"] = $map["du"];
+
+$map["ca"] = "つぁ";
+$map["ci"] = "つぃ";
+$map["cu"] = "つ";
+$map["ce"] = "つぇ";
+$map["co"] = "つぉ";
+$map["c"] = $map["cu"];
+
+$map["ča"] = "ちゃ";
+$map["či"] = "ち";
+$map["ču"] = "ちゅ";
+$map["če"] = "ちぇ";
+$map["čo"] = "ちょ";
+$map["č"] = $map["ču"];
+
+// "ć" is mapped to the same syllables as "č".
+$map["ća"] = "ちゃ";
+$map["ći"] = "ち";
+$map["ću"] = "ちゅ";
+$map["će"] = "ちぇ";
+$map["ćo"] = "ちょ";
+$map["ć"] = $map["ću"];
+
+$map["đa"] = "じゃ";
+$map["đi"] = "じ";
+$map["đu"] = "じゅ";
+$map["đe"] = "じぇ";
+$map["đo"] = "じょ";
+$map["đ"] = $map["đu"];
+
+// The digraph "dž" is mapped to the same syllables as "đ".
+$map["dža"] = "じゃ";
+$map["dži"] = "じ";
+$map["džu"] = "じゅ";
+$map["dže"] = "じぇ";
+$map["džo"] = "じょ";
+$map["dž"] = $map["džu"]; // alias its own row (was $map["đu"], same value)
+
+$map["ma"] = "ま";
+$map["mi"] = "み";
+$map["mu"] = "む";
+$map["me"] = "め";
+$map["mo"] = "も";
+$map["m"] = "ん";
+
+$map["na"] = "な";
+$map["ni"] = "に";
+$map["nu"] = "ぬ";
+$map["ne"] = "ね";
+$map["no"] = "の";
+$map["n"] = "ん";
+
+$map["nja"] = "にゃ";
+$map["nji"] = "にぃ";
+$map["nju"] = "にゅ";
+$map["nje"] = "にぇ";
+$map["njo"] = "にょ";
+$map["nj"] = $map["nju"];
+
+$map["ra"] = "ら";
+$map["ri"] = "り";
+$map["ru"] = "る";
+$map["re"] = "れ";
+$map["ro"] = "ろ";
+$map["r"] = $map["ru"];
+
+// "l" shares the r-row.
+$map["la"] = "ら";
+$map["li"] = "り";
+$map["lu"] = "る";
+$map["le"] = "れ";
+$map["lo"] = "ろ";
+$map["l"] = $map["lu"];
+
+$map["ja"] = "や";
+$map["ji"] = "い";
+$map["ju"] = "ゆ";
+$map["je"] = "いぇ";
+$map["jo"] = "よ";
+$map["j"] = $map["ju"];
+
+$map["va"] = "わ"; // chosen instead of "ゔぁ"
+$map["vi"] = "ゔぃ";
+$map["vu"] = "ゔ";
+$map["ve"] = "ゔぇ";
+$map["vo"] = "ゔぉ";
+$map["v"] = $map["vu"];
+
+// "f" is listed as a consonant in dupl() but previously had no entries here,
+// so it was left in the output as a Latin letter.
+$map["fa"] = "ふぁ";
+$map["fi"] = "ふぃ";
+$map["fu"] = "ふ";
+$map["fe"] = "ふぇ";
+$map["fo"] = "ふぉ";
+$map["f"] = $map["fu"];
+
+// Punctuation. Spaces are deliberately kept as spaces rather than "・".
+$map[" "] = " ";
+$map["."] = "。";
+$map[","] = "、";
+$map["?"] = "?";
+$map["!"] = "!";
+
+// Overrides: multi-character rewrites that still produce Latin text, which
+// the shorter passes then convert.
+$map["od t"] = "ott";
+$map["tebe "] = "tebe";
+
+
+/**
+ * Breaks up doubled consonants by inserting a "u" between them, e.g.
+ * "otto" becomes "otuto".
+ *
+ * (The original source carries a commented-out alternative that would have
+ * emitted "っ" followed by the consonant instead.)
+ *
+ * @param string $text Text to process.
+ * @return string The text with no consonant from the list left doubled.
+ */
+function dupl($text)
+{
+	$consonants = array("b", "c", "č", "d", "đ", "dž", "f", "g", "h", "j", "k", "l", "m", "n", "p", "r", "s", "š", "t", "v", "z", "ž");
+	foreach($consonants as $consonant)
+	{
+		$double = $consonant . $consonant;
+		$new = $consonant . "u" . $consonant;
+		// One str_replace() pass never re-scans what it has just written, so a
+		// run of three ("nnn") would come out as "nunn". Repeat until no pair
+		// is left; every pass strictly reduces the number of adjacent pairs.
+		while(strpos($text, $double) !== false)
+		{
+			$text = str_replace($double, $new, $text);
+		}
+	}
+	return $text;
+}
+
+/**
+ * Runs one replacement pass over $text using only the entries of $map whose
+ * key is exactly $len characters long (counted as UTF-8 characters, not
+ * bytes), then hands the result to dupl().
+ *
+ * @param array  $map  Search string => replacement string.
+ * @param string $text Text to transform.
+ * @param int    $len  Key length, in characters, handled by this pass.
+ * @return string Transformed text.
+ */
+function repl($map, $text, $len)
+{
+	foreach($map as $from => $to)
+	{
+		// Entries of any other length belong to a different pass.
+		if(iconv_strlen($from, "utf-8") != $len)
+		{
+			continue;
+		}
+		$text = str_replace($from, $to, $text);
+	}
+	return dupl($text);
+}
+
+
+/**
+ * Converts the hiragana in $text to katakana via mb_convert_kana().
+ *
+ * @param string $text UTF-8 text.
+ * @return string The converted text.
+ */
+function hira2kata($text)
+{
+	// Conversion options passed to mb_convert_kana(); "C" is the
+	// hiragana-to-katakana switch.
+	$mode = "KVC";
+	$converted = mb_convert_kana($text, $mode, "utf-8");
+	return $converted;
+}
+
+
+/*
+ * Request handling: when text was submitted, transliterate it and print the
+ * result.
+ */
+if(isset($_POST["text"]) && is_string($_POST["text"]))
+{
+	// The map only has lower-case keys, so fold case first; otherwise
+	// capitalised words would be left as Latin letters.
+	$text = mb_strtolower($_POST["text"], "UTF-8");
+
+	$text = dupl($text);
+
+	// Longest keys must be applied first (e.g. "dža" before "d"). Derive the
+	// starting length from the map so that a longer override is not silently
+	// skipped.
+	$maxlen = 0;
+	foreach(array_keys($map) as $key)
+	{
+		$maxlen = max($maxlen, (int)iconv_strlen((string)$key, "utf-8"));
+	}
+	for($len = $maxlen; $len >= 1; $len--)
+	{
+		$text = repl($map, $text, $len);
+	}
+
+	$text = hira2kata($text);
+
+	// Anything the map did not convert is still raw user input, so escape it
+	// before writing it into the page.
+	echo "<span lang=\"ja\">";
+	echo htmlspecialchars($text, ENT_QUOTES, "UTF-8");
+	echo "</span>";
+}