Commits

Lars Yencken  committed 3fe1bd6

Adds devoicing and degemination support to KanjidicGenerator.

  • Participants
  • Parent commits b3304a5

Comments (0)

Files changed (6)

File src/gpaligner/AlignmentGenerator.scala

     for {
       segLists <- Combinatorics.segmentCombinations(graphemeString.toCharArray.toList).toStream
     } yield segLists.map(_.mkString)
-  }
+  }.filter(gs => !gs.exists(!isValidGrapheme(_)))
 
   def phonemeSegmentations(graphemes: List[String], phonemes: String): Stream[List[Segment]] = {
     val nSegs = graphemes.length
   def pruneBatch(as: Stream[Alignment]) = as
 }
 
+/**
+ * Stricter linguistic constraints for Japanese, including:
+ *
+ * - one script per grapheme segment
+ * - contiguous hiragana and katakana segments
+ * - identical grapheme and phoneme strings for hiragana and katakana segments
+ * - limited ratio of kanji to reading in a segment
+ */
 class JapaneseGenerator extends AlignmentGenerator {
   override def isValidGrapheme(grapheme: String): Boolean = {
     Japanese.countScripts(grapheme) == 1
   }
 
-  override def isValidPhoneme(phoneme: String): Boolean = {
-    !phoneme.startsWith(Japanese.nKana.toString) && !Japanese.smallKanaSet.contains(phoneme.charAt(0))
-  }
+  override def isValidPhoneme(phoneme: String): Boolean = true
 
   override def isValidSegment(segment: Segment): Boolean = {
     (Japanese.scriptType(segment.grapheme) match {
         case ScriptType.Hiragana => segment.grapheme == segment.phoneme
         case ScriptType.Katakana => segment.grapheme == segment.phoneme
-        case ScriptType.Kanji => segment.phoneme.length <= 4 * segment.grapheme.length
+        case ScriptType.Kanji => segment.phoneme.length <= 4 * segment.grapheme.length && !Japanese.isSmallKana(segment.phoneme(0))
         case _ => true
       })
   }
+
+  override def pruneBatch(as: Stream[Alignment]) = {
+    as.filter(a => {
+        if (as.size > 1) {
+          val scripts = a.map(s => Japanese.scriptType(s.grapheme))
+          !scripts.zip(scripts.tail).exists(p => (p._1 != ScriptType.Kanji) && p._1 == p._2)
+        } else {
+          true
+        }
+      })
+  }
 }
 
 object ReadingStatus extends Enumeration {
   type ReadingStatus = Value
-  val Unknown, Mixed, Validated = Value
+  val Unknown = Value("Unknown")
+  val Mixed = Value("Mixed")
+  val Validated = Value("Validated")
   
   def combine(lhs: Value, rhs: Value): Value = {
     lhs match {
 /**
  * An extension to the Japanese generator which prunes alignments in batches.
  */
-class KanjidicGenerator(filename: String) extends JapaneseGenerator {
+class KanjidicGenerator(filename: String = "resources/kanji.readings") extends JapaneseGenerator {
   val readingMap: Map[String, Set[String]] = parseReadings(filename)
 
-  override def pruneBatch(as: Stream[Alignment]): Stream[Alignment] = {
+  override def pruneBatch(asRaw: Stream[Alignment]): Stream[Alignment] = {
+    val as = super.pruneBatch(asRaw)
     if (as.isEmpty) {
       as
     } else {
     }
   }
 
-  private def triageAlignment(a: Alignment): ReadingStatus = {
-    a.map(triageSegment(_)).reduceLeft(combine(_, _))
+  def triageAlignment(a: Alignment): ReadingStatus = {
+    val kanjiSegments = a.filter(s => Japanese.scriptType(s.grapheme) == ScriptType.Kanji)
+    if (kanjiSegments.size > 0) {
+      kanjiSegments.map(triageSegment(_)).reduceLeft(combine(_, _))
+    } else {
+      Validated
+    }
   }
 
-  private def triageSegment(s: Segment): ReadingStatus = {
-    val readings = readingMap.get(s.grapheme)
-    if (readings.isDefined && readings.get.contains(Japanese.toHiragana(s.phoneme))) {
-      Validated
+  def triageSegment(s: Segment): ReadingStatus = {
+    if (readingMap contains s.grapheme) {
+      val readingSet = readingMap.get(s.grapheme).get
+      var candidates = List(s.phoneme)
+      if (Japanese.isVoiced(s.phoneme)) {
+        candidates = Japanese.devoice(s.phoneme) :: candidates
+      }
+      if (candidates.exists(Japanese.isGeminate(_))) {
+        candidates = candidates.flatMap(Japanese.degeminate(_))
+      }
+      if (candidates.exists(readingSet.contains(_))) {
+        Validated
+      } else {
+        Unknown
+      }
     } else {
       Unknown
     }
     }
     result
   }
+}
+
+object ShowAlignments {
+  def main(args: Array[String]): Unit = {
+    if (args.size != 2) {
+      println("Usage: ShowAlignments graphemes phonemes")
+      System.exit(1)
+    }
+
+    val generator = new KanjidicGenerator
+    generator.generate(GPPair(args(0), args(1))).foreach(a => println("%s -- %s".format(a.toString, generator.triageAlignment(a).toString)))
+  }
 }

File src/gpaligner/GPAligner.scala

     var cs = List[AlignmentCloud]()
     var nOverconstrained = 0
     var nResolved = 0
-    val generator = new KanjidicGenerator("resources/kanji.readings")
+    val generator = new KanjidicGenerator
     for (gpPair <- input) {
       val cloud = buildCloud(gpPair, generator)
       cloud.status match {

File src/gpaligner/Japanese.scala

 
 object Japanese {
     val nKana = '\u3093'
-    val smallKanaSet = Set('\u3083', '\u3085', '\u3087', '\u3043', '\u3045', '\u3049', '\u3041', '\u3047')
+    val smallKanaSet = Set('\u3083', '\u3085', '\u3087', '\u3043', '\u3045', '\u3049', '\u3041', '\u3047', '\u3063')
 
     def toHiragana(c: Char): Char = UnicodeBlock.of(c) match {
       case UnicodeBlock.KATAKANA => (c.toInt - 96).toChar
       case _ => c
     }
 
+    def isSmallKana(c: Char): Boolean = smallKanaSet contains c
+
     def toHiragana(s: String): String = s.map(toHiragana)
 
     def scriptType(c: Char): ScriptType.Value = UnicodeBlock.of(c) match {
         set
     }
 
+    val devoiceMap = Map("がだざびぴじばぱ".zip("かたさひひしはは") : _*)
+
+    def isVoiced(s: String): Boolean = devoiceMap contains s(0)
+
+    def devoice(s: String): String = {
+      require(s.length > 0)
+      val c = s(0)
+      devoiceMap.getOrElse(c, c).toString + s.slice(1, s.length)
+    }
+
+    def isGeminate(s: String): Boolean = s.endsWith("っ")
+
+    def degeminate(s: String): List[String] = {
+      if (s.endsWith("っ")) {
+        s :: (for (c <- "きつちくいり") yield { s.slice(0, s.length - 1) + c.toString }).toList
+      } else {
+        List(s)
+      }
+    }
+
     def countScripts(s: String): Int = scriptTypes(s).size
 }

File src/gpaligner/Types.scala

 
 case class GPContext(left: Option[Segment], pivot: Segment, right: Option[Segment])
 
+object CoreTypes {
+  def parseAlignment(s: String): Alignment = {
+    val parts = s.split(' ')
+    require(parts.length == 2)
+    Alignment(parts(0).split('|').zip(parts(1).split('|')).map(p => Segment(p._1, p._2)).toList)
+  }
+}
+
 case class Alignment(val segments: List[Segment]) extends Iterable[Segment] {
   def iterator: Iterator[Segment] = segments.iterator
 
     case x :: y :: xs => Stream.cons(GPContext(Some(head), x, Some(y)), addContext(x, y :: xs))
   }
 
-  override def toString = "Alignment(%s:%s)".format(segments.map(_.grapheme).mkString("|"), segments.map(_.phoneme).mkString("|"))
+  override def toString = "Alignment(%s %s)".format(segments.map(_.grapheme).mkString("|"), segments.map(_.phoneme).mkString("|"))
 }
 
 case class ScoredAlignment(score: Double, alignment: Alignment) extends Ordered[ScoredAlignment] {

File test/gpaligner/AlignmentGeneratorTest.scala

 import org.junit.Assert._
 
 class AlignmentGeneratorTest {
-    @Before
-    def setUp: Unit = {
-    }
+  @Before
+  def setUp: Unit = {
+  }
 
-    @After
-    def tearDown: Unit = {
-    }
+  @After
+  def tearDown: Unit = {
+  }
 
-    @Test
-    def testNKana = {
-        val input = GPPair("神符", "しんぷ")
+  @Test
+  def testNKana = {
+    val input = GPPair("神符", "しんぷ")
 
-        val alignments0 = (new AlignmentGenerator).generate(input)
-        alignments0.foreach(a => println(a))
-        assert(alignments0 == List(
-                Alignment(List(Segment("神符", "しんぷ"))),
-                Alignment(List(Segment("神", "し"), Segment("符", "んぷ"))),
-                Alignment(List(Segment("神", "しん"), Segment("符", "ぷ")))
-            ))
+    val alignments0 = (new AlignmentGenerator).generate(input)
+    assert(alignments0 == List(
+        Alignment(List(Segment("神符", "しんぷ"))),
+        Alignment(List(Segment("神", "し"), Segment("符", "んぷ"))),
+        Alignment(List(Segment("神", "しん"), Segment("符", "ぷ")))
+      ))
 
-        val alignments1 = (new JapaneseGenerator).generate(input)
-        assert(alignments1 == List(
-                Alignment(List(Segment("神符", "しんぷ"))),
-                Alignment(List(Segment("神", "しん"), Segment("符", "ぷ")))
-            ))
-    }
+    val alignments1 = (new JapaneseGenerator).generate(input)
+    assert(alignments1 == List(
+        Alignment(List(Segment("神符", "しんぷ"))),
+        Alignment(List(Segment("神", "しん"), Segment("符", "ぷ")))
+      ))
+  }
 
-    @Test
-    def testSmallKana = {
-        val input = GPPair("州際", "しゅうさい")
-        val alignments = (new JapaneseGenerator).generate(input)
-        assert(alignments == List(
-                Alignment(List(Segment("州際", "しゅうさい"))),
-                Alignment(List(Segment("州", "しゅ"), Segment("際", "うさい"))),
-                Alignment(List(Segment("州", "しゅう"), Segment("際", "さい"))),
-                Alignment(List(Segment("州", "しゅうさ"), Segment("際", "い")))
-            ))
-    }
+  @Test
+  def testSmallKana = {
+    val input = GPPair("州際", "しゅうさい")
+    val alignments = (new JapaneseGenerator).generate(input)
+    assert(alignments == List(
+        Alignment(List(Segment("州際", "しゅうさい"))),
+        Alignment(List(Segment("州", "しゅ"), Segment("際", "うさい"))),
+        Alignment(List(Segment("州", "しゅう"), Segment("際", "さい"))),
+        Alignment(List(Segment("州", "しゅうさ"), Segment("際", "い")))
+      ))
+  }
+
+  @Test
+  def testScriptBoundaries = {
+    val input = GPPair("締め殺す", "しめころす")
+    val alignments = (new JapaneseGenerator).generate(input)
+    assert(alignments.size == 1)
+    assert(alignments == List(CoreTypes.parseAlignment("締|め|殺|す し|め|ころ|す")))
+  }
+
+  @Test
+  def testKanjidicMatches = {
+    val input = GPPair("紫綬褒章", "しじゅほうしょう")
+    val alignments = (new KanjidicGenerator).generate(input)
+    assert(alignments.length == 1)
+  }
 }

File test/gpaligner/JapaneseTest.scala

         assert(Japanese.countScripts("引き時") == 2)
     }
 
+    @Test
+    def isSmallKana = {
+      assert(Japanese.isSmallKana('っ'))
+    }
+
 }