Source

edict / edict-morphology.el

Full commit
jareth 7231f95 

stephent 60fcf76 
jareth 7231f95 
stephent 60fcf76 
jareth 7231f95 


stephent 60fcf76 


jareth 7231f95 


































stephent 60fcf76 
jareth 7231f95 



stephent 05760bd 
jareth 7231f95 
stephent 60fcf76 


jareth 7231f95 



























































































youngs 4caba7d 
jareth 7231f95 


































































































































































































































































































































































stephent 05760bd 

jareth 7231f95 
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
;;; edict-morphology.el --- morphology rewrite engine for edict.el

;; Copyright (C) 1992 Bob Kerns <rwk@crl.dec.com>
;; Copyright (C) 1991, 1992 Per Hammarlund (perham@nada.kth.se)
;; Copyright (C) 1998, 2002 Free Software Foundation, Inc.

;; Author:      Per Hammarlund <perham@nada.kth.se>
;; Keywords:    mule, edict, dictionary
;; Version:     0.9.9
;; Adapted-by:  Stephen J. Turnbull <stephen@xemacs.org> for XEmacs
;; Maintainer:  Stephen J. Turnbull <stephen@xemacs.org>

;;   This file is part of XEmacs.

;;   XEmacs is free software; you can redistribute it and/or modify it
;;   under the terms of the GNU General Public License as published by
;;   the Free Software Foundation; either version 2, or (at your
;;   option) any later version.

;;   XEmacs is distributed in the hope that it will be useful, but
;;   WITHOUT ANY WARRANTY; without even the implied warranty of
;;   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;;   General Public License for more details.
;; 
;;   You should have received a copy of the GNU General Public License
;;   along with XEmacs; if not, write to the Free Software Foundation,
;;   Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

;;; Commentary:

;; Some code that looks for translations of english and japanese using the
;; EDICTJ Public Domain japanese/english dictionary.

;; Written by Per Hammarlund <perham@nada.kth.se>
;; Morphology and private dictionary handling/editing by Bob Kerns
;; <rwk@crl.dec.com>
;; Helpful remarks from Ken-Ichi Handa <handa@etl.go.jp>.
;; The EDICTJ PD dictionary is maintained by Jim Breen
;; <jwb@monu6.cc.monash.edu.au>

;; Morphology rewrite engine

;;; To do:

;;; Changelog:

;; 1998-03-27  Stephen Turnbull  <stephen@xemacs.org>
;;        (created):  broken out from monolithic edict.el

;;; Code:

(require 'cl)				; for defstruct

;; Set this to true for debugging.
(defvar *edict-expand-string-trace* nil)  

;;; Constants:

;; The edict-category-* variables are used to emulate the character
;; categories for regexps that are (partially) documented, but not
;; implemented, in XEmacs/Mule (20.5).  It should be possible to use
;; ranges for this, in the sense that defined legal characters in a
;; given range are guaranteed to be of the appropriate category.  The
;; reason is that each Mule-defined character set will occupy such a
;; range by virtue of the leading-byte implementation (see `Info |
;; Internals | MULE Character Sets and Encodings | Internal Mule
;; Encodings | Internal Character Encoding' for the easily understood
;; character type representation; the Bufbyte representation is a
;; simple transformation format of varying width).  Then JIS, at
;; least, carefully arranges the categories of characters into
;; non-overlapping ranges (ranges > 96 code points (94 for JIS) are
;; necessarily non-contiguous, see the info section cited above---this
;; is why undefined or illegal characters cannot be ruled out).  Mule
;; itself is happy to insert undefined characters---try eval'ing
;; (insert (int-char (+ (char-int ?$Bt$(B) 3)))---but balks at illegal
;; ones---(insert (int-char (- (char-int ?$B0!(B) 1))).  However, there
;; are two holes in JIS X 0208 (between the yomi-ordered Level 1 kanji
;; and the radical-ordered Level 2, and at the end of the character
;; set) and these undefined characters can be inserted, eg by the LISP
;; code above.

;; Regexp ranges cannot be depended on; they work for all JIS-derived
;; encodings (including EUC, ISO-2022-JP, and SJIS), but won't for
;; UCS/Unicode.  Watch out for (eg) Big-5 in trying to generalize this
;; code.

;; The values for the ranges are taken from Ken Lunde, Understanding
;; Japanese Information Processing, (C) 1993 O'Reilly & Associates,
;; Sebastopol, CA, and from $B6S8+!&9b66!&8MB<!&H>ED!&7,M}!&8~@n!&5HED!"(B
;; $B%^%k%A%j%s%,%k4D6-$N<B8=!"%W%l%s%F%#%9%[!<%k=PHG(B.

(eval-and-compile
  (defconst edict-category-c "[$B0!(B-$Bt&(B]"
    "JIS X 0208-1990 kanji character category.
NB: The last two characters of the range will be invisible for most fonts
(based on JIS X 0208-1983).")

  (defconst edict-category-C "[$(D0!(B-$(Dmc(B]"
    "JIS X 0212-1990 kanji character category.
Computed as kuten 1601 - kuten 7767, leading byte 0x94, but not verified
(I don't have the fonts on hand.)  #### Also, this may not be the official
definition of character category \cC in FSF Emacs 20.
Cannot be combined with JIS X 0208-1983/1990 (leading byte 0x92) since
Korean KSC 5601-1987 is interpolated (leading byte 0x93).")

  (defconst edict-category-h "[$B$!(B-$B$s(B]"
    "JIS X 0208-1990 hiragana character category.")

  (defconst edict-category-H "[$B~~(B]"
    "Unknown (half-width hiragana character?) category.
Currently implemented as the undefined character kuten 9494 in JIS X 0208.")

  (defconst edict-category-k "[$B%!(B-$B%t(B]"
    "JIS X 0208-1990 katakana character category.
Does not include the `small ka' and `small ke' characters (which would not
be appropriate in yomi).")

  (defconst edict-category-K "[(I&(B-(I_(B]"
    "JIS X 0201-1976 \"half-width\" katakana character category.")

  (defconst edict-category-kana
    (concat edict-category-h "\\|" edict-category-H
	    "\\|" edict-category-k "\\|" edict-category-K)
    "Japanese kana (half- or full-width, kata- or hira-) character category.")

  (defconst edict-category-Japanese-word-constituent
    (concat edict-category-kana "\\|" edict-category-c "\\|" edict-category-C)
    "Japanese word constituent character category.")

  (defconst edict-dictionary-entry-start-regexp
    (concat "^\\(\\(" edict-category-Japanese-word-constituent "\\)+\\)[ \t]")
    "Matches start of an EDICT entry.")

  (defconst edict-yomi-regexp
    (concat "^\\(" edict-category-kana "\\)+$")
    "Matches a pronunciation from an EDICT entry.")

  (defconst edict-yomi-part-regexp
    (concat "\\[^\\(" edict-category-kana "\\)+$")
    "Matches a `yomi part' (includes delimiters) from an EDICT entry.")
  )

;; The syntax of the rules is:
;; (define-edict-rule name <pattern> <conversion-function> <conversion-data>).
;; 
;;  <pattern> is a regular expression, with the parts to be substituted
;;  being denoted by \\(<subpattern>\\).
;; 
;;  <conversion function> is a function responsible for determining
;;  the replacements.  The current choices are
;;  edict-subst-modified-affix and edict-subst-affix.  These
;;  functions are called just after doing match-string, so the regexp
;;  variables are set up.  They are applied to the string, and
;;  <conversion-data>.  These functions are responsible for
;;  determining and performing the substitutions to be made, and
;;  returning a list of possiblities.
;; 
;;  edict-subst-affix is the simpler case.  It takes as conversion
;;  data one string for each subpattern in the pattern.  This string
;;  will be used in place of the original.
;; 
;;  edict-subst-modified-affix takes as conversion data, an
;;  alternating list of functions and lists of additional arguments
;;  for those functions.  Each function is applied to the substring
;;  being replaced and its additional arguments.  Likely functions to
;;  use include edict-modify-verb, edict-ignore, and edict-subst.

;; Table of morphological rules.
(defvar *edict-syntax-types* nil)

;; defstruct's defsetfs should expand into this; sigh.
;; Maybe this is fixed, comment them out.
;(eval-when (eval load compile)
(defstruct edict-syntax-type
  name
  rules)
;)

(defun get-edict-syntax-type (name)
  (if (symbolp name)
      (catch 'found-it
	(dolist (s *edict-syntax-types*)
	  (when (eq (edict-syntax-type-name s) name)
	    (throw 'found-it s)))
	(let ((new (make-edict-syntax-type :name name :rules ())))
	  (push new *edict-syntax-types*)
	  new))
    name))

;(eval-when (eval load compile)
(defstruct edict-rule
  name
  pattern				;Pattern which it must match
  filter				;Syntactic filter on previous form
  function				;Function to transform the input
  additional-args			;Arguments to transform function
  from-syntax-types			;Syntaxes for which this is valid
  to-syntax-types)			;Syntaxes to consider after this rule.
;)

;; Delete all occurrances of a rule from the rule base.
(defun edict-delete-rule (name)
  (dolist (s *edict-syntax-types*)
    (let ((old (edict-get-rule-from-syntax-type name s)))
      (when old
	(setf (edict-syntax-type-rules s)
	      (delq old (edict-syntax-type-rules s)))))))

;(defun edict-decircularize-rules ()
;  (interactive)
;  (dolist (s *edict-syntax-types*)
;    (dolist (r (edict-syntax-type-rules s))
;      (setf (edict-rule-from-syntax-types r)
;	    (mapcar (function (lambda (type)
;				(if (symbolp type)
;				    type
;				  (edict-syntax-type-name type))))
;		    (edict-rule-from-syntax-types r)))
;      (setf (edict-rule-to-syntax-types r)
;	    (mapcar (function (lambda (type)
;				(if (symbolp type)
;				    type
;				  (edict-syntax-type-name type))))
;		    (edict-rule-to-syntax-types r))))))
;
;(defun edict-circularize-rules ()
;  (interactive)
;  (dolist (s *edict-syntax-types*)
;    (dolist (r (edict-syntax-type-rules s))
;      (setf (edict-rule-from-syntax-types r)
;	    (mapcar (function (lambda (type)
;				(if (symbolp type)
;				    (get-edict-syntax-type type)
;				  type)))
;		    (edict-rule-from-syntax-types r)))
;      (setf (edict-rule-to-syntax-types r)
;	    (mapcar (function (lambda (type)
;				(if (symbolp type)
;				    (get-edict-syntax-type type)
;				  type)))
;		    (edict-rule-to-syntax-types r))))))

(defun edict-add-rule (name rule)
  (edict-delete-rule name)
  (dolist (s (edict-rule-from-syntax-types rule))
    (push rule (edict-syntax-type-rules s))))

(defun edict-get-rule-from-syntax-type (name syntax-type)
  (catch 'edict-get-rule
    (dolist (rule (edict-syntax-type-rules syntax-type))
      (if (eq name (edict-rule-name rule))
	  (throw 'edict-get-rule rule)))))

(defmacro define-edict-rule (name pattern fromto function &rest additional-args)
  ;; First, some type-checking.
  (let ((filter nil)
	(from nil)
	(to nil)
	;; SJT: this needs to be a string.  If it already is, that's ok.
	(pattern (eval pattern)))
    (when (stringp fromto)
      (setq filter fromto
	    fromto nil))
    (when (null fromto)
      (setq fromto '($BF|K\8l(B $BF|K\8l(B)))
    (setq from (first fromto)
	  to (second fromto))
    (unless (listp from)
      (setq from (list from)))
    (unless (listp to)
      (setq to (list to)))
    (unless (string-match "^\\^\\|\\$$" pattern)
      (error "Rule %s: pattern must start with ^ or end with $: \"%s\""
	     name pattern))
    (when filter
      (unless (stringp filter)
	(error "Rule %s: filter must be a regexp"
	       name)))
    (` (define-edict-rule-internal '(, name) '(, pattern) '(, filter)
	 '(, from) '(, to)
	 (function (, function)) (quote ((,@ additional-args)))))))

(defun define-edict-rule-internal (name pattern filter 
					from-syntax-types to-syntax-types
					function additional-args)
  (unless (string-match "^\\^\\|\\$$" pattern)
    (error "Rule %s: pattern must start with ^ or end with $: \"%s\""
	   name pattern))
  (when filter
    (unless (stringp filter)
      (error "Rule %s: filter must be a regexp"
	     name)))
  (let ((from-types nil)
	(to-types nil))
    (dolist (f from-syntax-types)
      (push (get-edict-syntax-type f) from-types))
    (dolist (to to-syntax-types)
      (push (get-edict-syntax-type to) to-types))
    (edict-add-rule name 
		    (make-edict-rule :name name
				     :pattern pattern
				     :filter filter
				     :from-syntax-types from-types
				     :to-syntax-types to-types
				     :function function
				     :additional-args additional-args))
    name))

;; #### This is bogus; the function does not match what the
;; #### description above the rules says it satisfies.  In particular, 
;; #### it is supposed to take only strings as arguments.
(defun edict-subst-affix (string &rest affixes)
  (let ((i 1)
	(prev -1)
	(result ""))
    (dolist (x affixes)
      (let ((pos (match-beginning i)))
	;; #### aren't the behaviors of edict-identity and
	;;      edict-ignore reversed?
	;; #### Hmm ... maybe this code never triggers.
	(cond ((eq x 'edict-identity))
	      ((eq x 'edict-ignore)
	       (setq result (concat result
				    (substring string
					       (max prev 0)
					       (match-beginning i)))
		     prev (match-end i)))
	      ((and (symbolp x) (fboundp x))
	       (setq result
		     (concat result
			     (substring string
					(max prev 0)
					(match-beginning i))
			     (funcall x (substring string
						   (match-beginning i)
						   (match-end i))))))
	      ((not (stringp x))
	       (error "%s is not a string or function name in edict-subst-affix"
		      x))
	      ((and pos (>= pos prev))
	       (setq result (concat result
				    (substring string
					       (max prev 0)
					       (match-beginning i))
				    x))
	       (setq prev (match-end i))))
	(incf i)))
    (concat result (substring string (max prev 0)))))

;; Takes a series of alternating pairs of substitution functions
;; and arguments for those substitution functions.  This can be
;; used to algorithmically replace certain parts (typically involving
;; changing an $B$$9T(B to $B$&9T(B final character.

(defun edict-subst-modified-affix (string &rest affixes)
  (let ((fun nil)
	(args nil)
	(i 1)
	(prev -1)
	(result ""))
    (while affixes
      (setq fun (car affixes)
	    args (car (cdr affixes))
	    affixes (cdr (cdr affixes)))
      (let ((pos (match-beginning i)))
	;; #### aren't the behaviors of edict-identity and
	;;      edict-ignore reversed?
	;; #### Hmm ... maybe this code never triggers.
	(cond ((eq fun 'edict-identity))
	      ((eq fun 'edict-ignore)
	       (setq result (concat result
				    (substring string
					       (max prev 0)
					       (match-beginning i)))
		     prev (match-end i)))
	      ((not (or (stringp fun) (and (symbolp fun) (fboundp fun))))
	       (error "%s is not a string or function name in %s"
		      fun 
		      'edict-subst-modified-affix))
	      ((and pos (>= pos prev))
	       (setq result (concat result
				    (substring string (max prev 0) pos)
				    (apply fun (substring string 
							  (match-beginning i)
							  (match-end i))
					   args)))
	       (setq prev (max prev (match-end i)))))
	(incf i)))
    (concat result (substring string (max prev 0)))))

;; Ignore this piece
(defun edict-ignore (affix) "")

;; Keep this piece
(defun edict-identity (affix) affix)

;; Substitute for this piece
(defun edict-subst (affix data)
  data)

;; More or less a guon table, for converting doshi suffixes.
(defvar *edict-doshi-suffix*
  '(["$B$o(B" "$B$$(B" "$B$&(B" "$B$((B" "$B$*(B"];; u -> wa; kau->kawanai
    ["$B$+(B" "$B$-(B" "$B$/(B" "$B$1(B" "$B$3(B"]
    ["$B$,(B" "$B$.(B" "$B$0(B" "$B$2(B" "$B$4(B"]
    ["$B$5(B" "$B$7(B" "$B$9(B" "$B$;(B" "$B$=(B"]
    ["$B$6(B" "$B$8(B" "$B$:(B" "$B$<(B" "$B$>(B"]
    ["$B$?(B" "$B$A(B" "$B$D(B" "$B$F(B" "$B$H(B"]
    ["$B$@(B" "$B$B(B" "$B$E(B" "$B$G(B" "$B$I(B"]
    ["$B$J(B" "$B$K(B" "$B$L(B" "$B$M(B" "$B$N(B"]
    ["$B$O(B" "$B$R(B" "$B$U(B" "$B$X(B" "$B$[(B"]
    ["$B$P(B" "$B$S(B" "$B$V(B" "$B$Y(B" "$B$\(B"]
    ["$B$Q(B" "$B$T(B" "$B$W(B" "$B$Z(B" "$B$](B"]
    ["$B$^(B" "$B$_(B" "$B$`(B" "$B$a(B" "$B$b(B"]
    ["$B$i(B" "$B$j(B" "$B$k(B" "$B$l(B" "$B$m(B"]))

(defun edict-modify-verb (suffix from to)
  (catch 'exit
    (dolist (b *edict-doshi-suffix*)
      (if (equal suffix (aref b from))
	  (throw 'exit (aref b to))))
    (throw 'skip-rule nil)))

;; This returns a list of the results of applying all rules whose
;; patterns match, to all levels of recursion.
(defun edict-expand-string (string &optional others previous syntax)
  (let* ((result nil)
	 (syntax (or syntax '$BF|K\8l(B))
	 (stype (get-edict-syntax-type syntax)))
    (dolist (rule (edict-syntax-type-rules stype))
      (when (string-match (edict-rule-pattern rule) string)
	(catch 'skip-rule
	  (unless (and previous
		       (edict-rule-filter rule)
		       (edict-filter-rule rule previous))
	    (let ((temp (apply (edict-rule-function rule) string
			       (edict-rule-additional-args rule))))
	      (unless (or (equal temp string)
			  (member temp others)
			  (member temp result))
		(when *edict-expand-string-trace*
		  (read-string (format "%s: %s -> %s -:" 
				       (edict-rule-name rule)
				       string temp)))
		(setq result
		      (union (edict-expand-string-recurse
				    temp (cons string (append result others))
				    string rule)
				   result))))))))
    (if (member string result)
	result
      (cons string result))))

(defun edict-expand-string-recurse (string others previous rule)
  (edict-expand-string-syntaxes string others previous 
				(edict-rule-to-syntax-types rule)))

(defun edict-expand-string-syntaxes (string others previous syntaxes)
  (let ((result nil))
    (dolist (syntax syntaxes)
      (setq result
	    (union (edict-expand-string string
					      (append result others)
					      previous
					      syntax)
			 result)))
    result))


;; Returns T if the rule should not be run, because of the past
;; history of expansions.  I.e. if something started out with $B$/(Bon
;; the end, and we've made it into an adjective, we should disable
;; any expansions based on it being a the conjunctive/stem form of a
;; verb.  This is done purely based on the most immediately preceding
;; expansion, because that is what determined the sense of the word.

(defun edict-filter-rule (rule previous)
  (let ((filter (edict-rule-filter rule)))
    (cond ((null filter) nil)
	  ((null previous) nil)
	  ((stringp filter)
	   (string-match filter previous))
;; #### This code is not functional yet, let those cases signal errors.
;	  ((symbolp filter)
;	   (funcall filter frob))
;	  ((consp filter)
;	   (apply (car filter) frob (cdr filter)))
	  (t (error "Bogus filter in rule %s: %s"
		    (edict-rule-name rule)
		    filter)))))

;(defun edict-find (elt list)
;  (catch 'edict-find
;    (dolist (test list)
;      (when (equal elt test)
;	(throw 'edict-find test)))))

;(defun edict-union (set1 set2)
;  (let ((result set2))
;    (dolist (frob set1)
;      (unless (member frob set2)
;	(setq result (cons frob result))))
;    result))

(provide 'edict-morphology)

;;; edict-morphology.el ends here