Source

latin-unity / latin-unity-utils.el

stephent 61436fd 





stephent 0ab2a59 

stephent 61436fd 


































stephent 0ab2a59 
stephent 61436fd 


stephent 39c64e6 
stephent 3ca4871 
stephent 0ab2a59 

stephent 3d4894c 
stephent 61436fd 
stephent 3d4894c 











stephent 61436fd 

stephent 3d4894c 
stephent 61436fd 






stephent ad0e7d9 
stephent 61436fd 
















stephent 3ca4871 
stephent 61436fd 












stephent 3ca4871 


stephent 61436fd 
stephent 3ca4871 



stephent 3d4894c 
stephent 3ca4871 




























stephent 3d4894c 
stephent 3ca4871 



























stephent 61436fd 
stephent 3d4894c 
stephent 61436fd 













stephent 3ca4871 




























stephent 61436fd 





























stephent 3d4894c 
stephent 61436fd 





stephent 3d4894c 




stephent 61436fd 
stephent 3d4894c 


stephent 61436fd 
stephent 3d4894c 
















stephent 61436fd 
stephent 3d4894c 




















stephent 61436fd 
stephent 3d4894c 













stephent 61436fd 
;;; latin-unity-utils.el --- Utility functions for preparing latin-unity data

;; Copyright (C) 2002 Free Software Foundation, Inc.

;; Author: Stephen J. Turnbull
;; Keywords: mule, charsets
;; Created: 2002 January 26
;; Last-modified: 2002 March 23

;; This file is part of XEmacs.

;; XEmacs is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.

;; XEmacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with XEmacs; see the file COPYING.  If not, write to the
;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
;; Boston, MA 02111-1307, USA.

;;; Commentary:

;; Mule bogusly considers the various ISO-8859 extended character sets
;; as disjoint, when ISO 8859 itself clearly considers them to be subsets
;; of a larger character set.

;; This library provides functions which for creating databases of
;; equivalence classes of characters.

;; It is NOT REQUIRED for the use of latin-unity.el; only for creating
;; the data it uses (provided in latin-unity-tables.el).

;; This is a developer-only library; _do not_ autoload anything in it.

;;; Code:

(provide 'latin-unity-utils)
(provide 'latin-unity-tables)	; Not a lie.

;;; Requires
(require 'cl)
(load "cl-macs" nil t)			; howcum no #'provide?
;; Get the charsets, among other things
(require 'latin-unity-vars)

(defvar latin-unity-utils-found-unicode-support t)

(condition-case nil
    (if (fboundp 'character-to-unicode)
	;; #### untested
	(fset 'char-to-ucs 'character-to-unicode)
      ;; the following libraries are from Mule-UCS.
      ;; this dependency can be eliminated by providing char-to-ucs.
      (require 'mule-ucs-unicode "unicode")
      (require 'un-define))
  (file-error (setq latin-unity-utils-found-unicode-support nil)))

;; Table of character set support for each Unicode code point
;; Hard-coded tables are from etc/unicode/unicode.org in XEmacs 21.5.

;; Populate the equivalence table
(when latin-unity-utils-found-unicode-support
(let* ((u+index (1+ (length latin-unity-character-sets))) ; alias
       (zero (make-vector (1+ u+index) nil))              ; useful constant
       ;; temporary holding tank for equivs: list of Mule characters
       ;; equivalent to the Unicode code point
       (unitable (make-vector (1+ #x20AC) nil)))

  ;; 
  ;; ASCII is spatial, Mule treats C0, SPC, and DEL as ASCII, but
  ;; (= (charset-property 'ascii 'chars) 94) :-(
  (loop for i from #x00 to #x7F do
    (let* ((ch (make-char 'ascii i))	; multibyte dirty
	   (ucs (char-to-ucs ch)))
      (if ucs
	  (aset unitable ucs (cons ch (aref unitable ucs)))
	;; Unfortunately it seems that Mule-UCS doesn't know Latin-9....
	;; It also is smart enough to know that there are holes in Latin-3.
	(message "Mule-UCS doesn't know about %s" (split-char ch)))))

  ;; Other character sets
  ;; Control-1 is spatial, but handled below
  ;; NB: JIS Roman defaults to differing from ASCII
  (mapc (lambda (cs)
	  (let (lo hi)
	    ;; cond because Morioka added a lot of extra sizes
	    ;; not relevant to our Latin character sets
	    (message "Processing charset %s ..." cs)
	    (cond ((= (charset-property cs 'chars) 94)
		   (setq lo #x21 hi #x7E))
		  ((= (charset-property cs 'chars) 96)
		   (setq lo #x20 hi #x7F))
		  (t (message "Odd size character set (%s)!" cs)
		     (setq lo #x20 hi #x7F)))
	    (loop for i from lo to hi do
	      (let* ((ch (make-char cs i)) ; multibyte dirty
		     (ucs (char-to-ucs ch)))
		(if ucs
		    (aset unitable ucs (cons ch (aref unitable ucs)))
		  (message "Mule-UCS doesn't know about %s"
			   (split-char ch)))))))
	(set-difference (copy-sequence latin-unity-character-sets)
			'(ascii latin-iso8859-13 latin-iso8859-14
			  latin-iso8859-15 latin-iso8859-16)))

  ;; Latin-7, -8, -9, and -10 are spatial, Mule-UCS doesn't handle them
  ;; correctly (maybe because they're not built in?)
  ;; Latin-7 (ISO 8859/13)
  (when (find-coding-system 'iso-8859-13)
    (message "Processing charset %s ..." 'latin-iso8859-13)
    (mapc (lambda (pair)
	    (let ((ucs (cdr pair))
		  (ch (make-char 'latin-iso8859-13 (car pair))))
	      (aset unitable ucs (cons ch (aref unitable ucs)))))
	  '((#xA0 . #x00A0) (#xA1 . #x201D) (#xA2 . #x00A2) (#xA3 . #x00A3)
	    (#xA4 . #x00A4) (#xA5 . #x201E) (#xA6 . #x00A6) (#xA7 . #x00A7)
	    (#xA8 . #x00D8) (#xA9 . #x00A9) (#xAA . #x0156) (#xAB . #x00AB)
	    (#xAC . #x00AC) (#xAD . #x00AD) (#xAE . #x00AE) (#xAF . #x00C6)
	    (#xB0 . #x00B0) (#xB1 . #x00B1) (#xB2 . #x00B2) (#xB3 . #x00B3)
	    (#xB4 . #x201C) (#xB5 . #x00B5) (#xB6 . #x00B6) (#xB7 . #x00B7)
	    (#xB8 . #x00F8) (#xB9 . #x00B9) (#xBA . #x0157) (#xBB . #x00BB)
	    (#xBC . #x00BC) (#xBD . #x00BD) (#xBE . #x00BE) (#xBF . #x00E6)
	    (#xC0 . #x0104) (#xC1 . #x012E) (#xC2 . #x0100) (#xC3 . #x0106)
	    (#xC4 . #x00C4) (#xC5 . #x00C5) (#xC6 . #x0118) (#xC7 . #x0112)
	    (#xC8 . #x010C) (#xC9 . #x00C9) (#xCA . #x0179) (#xCB . #x0116)
	    (#xCC . #x0122) (#xCD . #x0136) (#xCE . #x012A) (#xCF . #x013B)
	    (#xD0 . #x0160) (#xD1 . #x0143) (#xD2 . #x0145) (#xD3 . #x00D3)
	    (#xD4 . #x014C) (#xD5 . #x00D5) (#xD6 . #x00D6) (#xD7 . #x00D7)
	    (#xD8 . #x0172) (#xD9 . #x0141) (#xDA . #x015A) (#xDB . #x016A)
	    (#xDC . #x00DC) (#xDD . #x017B) (#xDE . #x017D) (#xDF . #x00DF)
	    (#xE0 . #x0105) (#xE1 . #x012F) (#xE2 . #x0101) (#xE3 . #x0107)
	    (#xE4 . #x00E4) (#xE5 . #x00E5) (#xE6 . #x0119) (#xE7 . #x0113)
	    (#xE8 . #x010D) (#xE9 . #x00E9) (#xEA . #x017A) (#xEB . #x0117)
	    (#xEC . #x0123) (#xED . #x0137) (#xEE . #x012B) (#xEF . #x013C)
	    (#xF0 . #x0161) (#xF1 . #x0144) (#xF2 . #x0146) (#xF3 . #x00F3)
	    (#xF4 . #x014D) (#xF5 . #x00F5) (#xF6 . #x00F6) (#xF7 . #x00F7)
	    (#xF8 . #x0173) (#xF9 . #x0142) (#xFA . #x015B) (#xFB . #x016B)
	    (#xFC . #x00FC) (#xFD . #x017C) (#xFE . #x017E) (#xFF . #x2019))))
  (when (find-coding-system 'iso-8859-14)
    (message "Processing charset %s ..." 'latin-iso8859-14)
    (mapc (lambda (pair)
	    (let ((ucs (cdr pair))
		  (ch (make-char 'latin-iso8859-14 (car pair))))
	      (aset unitable ucs (cons ch (aref unitable ucs)))))
	  '((#xA0 . #x00A0) (#xA1 . #x1E02) (#xA2 . #x1E03) (#xA3 . #x00A3)
	    (#xA4 . #x010A) (#xA5 . #x010B) (#xA6 . #x1E0A) (#xA7 . #x00A7)
	    (#xA8 . #x1E80) (#xA9 . #x00A9) (#xAA . #x1E82) (#xAB . #x1E0B)
	    (#xAC . #x1EF2) (#xAD . #x00AD) (#xAE . #x00AE) (#xAF . #x0178)
	    (#xB0 . #x1E1E) (#xB1 . #x1E1F) (#xB2 . #x0120) (#xB3 . #x0121)
	    (#xB4 . #x1E40) (#xB5 . #x1E41) (#xB6 . #x00B6) (#xB7 . #x1E56)
	    (#xB8 . #x1E81) (#xB9 . #x1E57) (#xBA . #x1E83) (#xBB . #x1E60)
	    (#xBC . #x1EF3) (#xBD . #x1E84) (#xBE . #x1E85) (#xBF . #x1E61)
	    (#xC0 . #x00C0) (#xC1 . #x00C1) (#xC2 . #x00C2) (#xC3 . #x00C3)
	    (#xC4 . #x00C4) (#xC5 . #x00C5) (#xC6 . #x00C6) (#xC7 . #x00C7)
	    (#xC8 . #x00C8) (#xC9 . #x00C9) (#xCA . #x00CA) (#xCB . #x00CB)
	    (#xCC . #x00CC) (#xCD . #x00CD) (#xCE . #x00CE) (#xCF . #x00CF)
	    (#xD0 . #x0174) (#xD1 . #x00D1) (#xD2 . #x00D2) (#xD3 . #x00D3)
	    (#xD4 . #x00D4) (#xD5 . #x00D5) (#xD6 . #x00D6) (#xD7 . #x1E6A)
	    (#xD8 . #x00D8) (#xD9 . #x00D9) (#xDA . #x00DA) (#xDB . #x00DB)
	    (#xDC . #x00DC) (#xDD . #x00DD) (#xDE . #x0176) (#xDF . #x00DF)
	    (#xE0 . #x00E0) (#xE1 . #x00E1) (#xE2 . #x00E2) (#xE3 . #x00E3)
	    (#xE4 . #x00E4) (#xE5 . #x00E5) (#xE6 . #x00E6) (#xE7 . #x00E7)
	    (#xE8 . #x00E8) (#xE9 . #x00E9) (#xEA . #x00EA) (#xEB . #x00EB)
	    (#xEC . #x00EC) (#xED . #x00ED) (#xEE . #x00EE) (#xEF . #x00EF)
	    (#xF0 . #x0175) (#xF1 . #x00F1) (#xF2 . #x00F2) (#xF3 . #x00F3)
	    (#xF4 . #x00F4) (#xF5 . #x00F5) (#xF6 . #x00F6) (#xF7 . #x1E6B)
	    (#xF8 . #x00F8) (#xF9 . #x00F9) (#xFA . #x00FA) (#xFB . #x00FB)
	    (#xFC . #x00FC) (#xFD . #x00FD) (#xFE . #x0177) (#xFF . #x00FF))))
  (when (find-coding-system 'iso-8859-15)
    (message "Processing charset %s ..." 'latin-iso8859-15)
    (loop for i from #x20 to #x7F do
      (let* ((ch (make-char 'latin-iso8859-15 i)) ; multibyte dirty
	     (ucs (+ i #x80)))
	(aset unitable ucs (cons ch (aref unitable ucs)))))
    (mapc (lambda (ucs)
	    (let ((ch (make-char 'latin-iso8859-15 ucs)))
	      (aset unitable ucs (delq ch (aref unitable ucs)))))
	  '(#xA4 #xA6 #xA8 #xB4 #xB8 #xBC #xBD #xBE))
    (mapc (lambda (pair)
	    (let ((ucs (car pair))
		  (ch (make-char 'latin-iso8859-15 (cdr pair))))
	      (aset unitable ucs (cons ch (aref unitable ucs)))))
	  '((#x0152 . #xBC) (#x0153 . #xBD) (#x0160 . #xA6) (#x0161 . #xA8)
	    (#x0178 . #xBE) (#x017D . #xB4) (#x017E . #xB8) (#x20AC . #xA4))))
  (when (find-coding-system 'iso-8859-16)
    (mapc (lambda (pair)
	    (let ((ucs (cdr pair))
		  (ch (make-char 'latin-iso8859-16 (car pair))))
	      (aset unitable ucs (cons ch (aref unitable ucs)))))
	  '((#xA0 . #x00A0) (#xA1 . #x0104) (#xA2 . #x0105) (#xA3 . #x0141)
	    (#xA4 . #x20AC) (#xA5 . #x201E) (#xA6 . #x0160) (#xA7 . #x00A7)
	    (#xA8 . #x0161) (#xA9 . #x00A9) (#xAA . #x0218) (#xAB . #x00AB)
	    (#xAC . #x0179) (#xAD . #x00AD) (#xAE . #x017A) (#xAF . #x017B)
	    (#xB0 . #x00B0) (#xB1 . #x00B1) (#xB2 . #x010C) (#xB3 . #x0142)
	    (#xB4 . #x017D) (#xB5 . #x201D) (#xB6 . #x00B6) (#xB7 . #x00B7)
	    (#xB8 . #x017E) (#xB9 . #x010D) (#xBA . #x0219) (#xBB . #x00BB)
	    (#xBC . #x0152) (#xBD . #x0153) (#xBE . #x0178) (#xBF . #x017C)
	    (#xC0 . #x00C0) (#xC1 . #x00C1) (#xC2 . #x00C2) (#xC3 . #x0102)
	    (#xC4 . #x00C4) (#xC5 . #x0106) (#xC6 . #x00C6) (#xC7 . #x00C7)
	    (#xC8 . #x00C8) (#xC9 . #x00C9) (#xCA . #x00CA) (#xCB . #x00CB)
	    (#xCC . #x00CC) (#xCD . #x00CD) (#xCE . #x00CE) (#xCF . #x00CF)
	    (#xD0 . #x0110) (#xD1 . #x0143) (#xD2 . #x00D2) (#xD3 . #x00D3)
	    (#xD4 . #x00D4) (#xD5 . #x0150) (#xD6 . #x00D6) (#xD7 . #x015A)
	    (#xD8 . #x0170) (#xD9 . #x00D9) (#xDA . #x00DA) (#xDB . #x00DB)
	    (#xDC . #x00DC) (#xDD . #x0118) (#xDE . #x021A) (#xDF . #x00DF)
	    (#xE0 . #x00E0) (#xE1 . #x00E1) (#xE2 . #x00E2) (#xE3 . #x0103)
	    (#xE4 . #x00E4) (#xE5 . #x0107) (#xE6 . #x00E6) (#xE7 . #x00E7)
	    (#xE8 . #x00E8) (#xE9 . #x00E9) (#xEA . #x00EA) (#xEB . #x00EB)
	    (#xEC . #x00EC) (#xED . #x00ED) (#xEE . #x00EE) (#xEF . #x00EF)
	    (#xF0 . #x0111) (#xF1 . #x0144) (#xF2 . #x00F2) (#xF3 . #x00F3)
	    (#xF4 . #x00F4) (#xF5 . #x0151) (#xF6 . #x00F6) (#xF7 . #x015B)
	    (#xF8 . #x0171) (#xF9 . #x00F9) (#xFA . #x00FA) (#xFB . #x00FB)
	    (#xFC . #x00FC) (#xFD . #x0119) (#xFE . #x021B) (#xFF . #x00FF))))

  ;; Fill in the equivalences

  ;; Default the whole equivalences table
  (aset zero 0 0)
  (put-char-table t zero latin-unity-equivalences)

  ;; Control 1 code points are spatial
  ;; Warning on these is beyond the scope of this library.
  (put-char-table 'control-1
		  (vector latin-unity-all-flags
			  nil nil nil nil nil nil nil nil nil)
		  latin-unity-equivalences)

  ;; Now map over the unitable to the equivalences char-table
  (mapc (lambda (equivs)
	  (when equivs			; null for all non-Latin characters
	    (dolist (ch1 equivs)
	      (let ((vec (copy-sequence
			  (get-char-table ch1 latin-unity-equivalences)))
		    (ucs (char-to-ucs ch1)))
		(when ucs (aset vec u+index ucs))
		(dolist (ch2 equivs)
		  (let* ((cset (char-charset ch2))
			 (bit (get cset 'latin-unity-flag-bit))
			 (index (get cset 'latin-unity-index)))
		    (aset vec 0 (logior bit (aref vec 0)))
		    (aset vec index ch2)))
		(put-char-table ch1 vec latin-unity-equivalences)))))
	unitable))
)   ; when latin-unity-utils-found-unicode-support

(defun latin-unity-dump-tables ()
  "Create a Lisp library to initialize the equivalences char-table."

  (interactive)

  (if (not latin-unity-utils-found-unicode-support)
      (if (file-readable-p (expand-file-name "latin-unity-tables.el"))
	  (message "Unicode unsupported.  Reusing old latin-unity-tables.el.")
	(error 'file-error
	       "*** Can't find Unicode support or latin-unity-tables.el.***"))

    ;; set up buffer
    (set-buffer (get-buffer-create "latin-unity-tables.el"))
    (erase-buffer)

    ;; insert preface
    (let ((nilvec (make-vector (+ (length latin-unity-character-sets) 2) nil))
	  (creation-date-string (format-time-string "%Y %B %d")))
      (insert ";;; latin-unity-tables.el ---"
	      " initialize latin-unity-equivalences"
	      "\n;; Do not edit --- automatically generated."
	      "\n;; Created: " creation-date-string
	      "\n(provide 'latin-unity-tables)"
	      "\n(defconst latin-unity-equivalences"
	      "\n  (let ((table (make-char-table 'generic)))"
	      "\n    ;; default all non-Latin charsets"
	      (format "\n    (put-char-table t %s table)"
		      (progn (aset nilvec 0 0) nilvec))
	      "\n    ;; Control 1 code points are spatial"
	      "\n    ;; Warning on these is beyond this library's scope."
	      (format "\n    (put-char-table 'control-1 %s table)"
		      (progn (aset nilvec 0 latin-unity-all-flags) nilvec)))

      ;; insert table insertions
      ;; alternate mmc: (format "(apply #'make-char '%s)" (split-char ch))
      (flet ((mmc (ch)
	       (let ((x (split-char ch)))
		 (concat (format "(make-char '%s %d" (first x) (second x))
			 (if (third x) (format " %d)" (third x)) ")")))))
	(map-char-table
	 (lambda (key val)
	   (when (characterp key)
	     (insert (format "\n    (put-char-table %s (vector %s) table)"
			     (mmc key)
			     (mapconcat
			      (lambda (elt)
				(cond ((characterp elt) (mmc elt))
				      ((null elt) "nil")
				      ;; be careful to emit read syntax here!
				      ((integerp elt) (format "#x%X" elt))
				      (t (format "%s" elt))))
			      val
			      " ")))))
	 latin-unity-equivalences))

      ;; insert trailing matter
      (insert "\n    table)"
	      "\n  \"Map a (Latin) Mule character to the set of"
	      " character sets containing it."
	      "\nCreated: " creation-date-string "\")"
	      "\n(put 'latin-unity-equivalences 'creation-date-string \""
	      creation-date-string "\")"
	      "\n;;; end of latin-unity-tables.el"
	      "\n"))

    ;; write the file
    (write-file "latin-unity-tables.el")
    (message "Wrote %s." "latin-unity-tables.el")))


;;; end of latin-unity-utils.el