Synchronized with the code in 21.4

and deleted the code for not-yet-supported features: utf-fragment-on-decoding and utf-translate-cjk.
author: Kenichi Handa <handa@m17n.org> 2002-10-01 06:57:47 +0000
committer: Kenichi Handa <handa@m17n.org> 2002-10-01 06:57:47 +0000
commit: 620d0ea95bd1d06c89b2d6b9318e996f03a09838 (patch)
tree: 420f64129b7e6dbfb89286b3a1fa9e2f5a00454f
parent: 3c18889e3b62cd0f133ba0f392984212a979ef26 (diff)
download: emacs-620d0ea95bd1d06c89b2d6b9318e996f03a09838.tar.gz
1 files changed, 45 insertions, 35 deletions
diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el
index 8d5c50b450b..e6d7434e16e 100644
--- a/lisp/international/utf-8.el
+++ b/lisp/international/utf-8.el
@@ -27,8 +27,8 @@
 
 ;;; Commentary:
 
-;; The coding-system `mule-utf-8' supports encoding/decoding of the
-;; following character sets to and from UTF-8:
+;; The coding-system `mule-utf-8' basically supports encoding/decoding
+;; of the following character sets to and from UTF-8:
 ;;
 ;;   ascii
 ;;   eight-bit-control
@@ -37,18 +37,19 @@
 ;;   mule-unicode-2500-33ff
 ;;   mule-unicode-e000-ffff
 ;;
-;; Characters of other character sets cannot be encoded with
-;; mule-utf-8.
-;;
 ;; On decoding, Unicode characters that do not fit into the above
 ;; character sets are handled as `eight-bit-control' or
 ;; `eight-bit-graphic' characters to retain the information about the
-;; original byte sequence.
+;; original byte sequence, and text properties record the corresponding
+;; Unicode character.
+;;
+;; Fixme: note that reading and writing invalid utf-8 may not be
+;; idempotent -- fixing that needs a new charset to represent the bytes.
 ;;
-;; Fixme: note that reading and writing invalid utf-8, even without
-;; editing it, may alter the text.  Fixing that needs a new charset to
-;; represent the raw bytes in the eight-bit-control range, which are
-;; otherwise valid unicodes.
+;; Characters from other character sets can be encoded with mule-utf-8
+;; by populating the translation-table
+;; `utf-translation-table-for-encode' and registering the translation
+;; with `register-char-codings'.
 
 ;; UTF-8 is defined in RFC 2279.  A sketch of the encoding is:
 
@@ -61,6 +62,14 @@
 
 ;;; Code:
 
+(defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil)
+  "Char table mapping characters to latin-iso8859-1 or mule-unicode-*.
+
+If `unify-8859-on-encoding-mode' is non-nil, this table populates the
+translation-table named `utf-translation-table-for-encode'.")
+
+(define-translation-table 'utf-translation-table-for-encode)
+
 (define-ccl-program ccl-decode-mule-utf-8
   ;;
   ;;        charset         | bytes in utf-8 | bytes in emacs
@@ -259,9 +268,10 @@
 
   "CCL program to decode UTF-8.
 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
-mule-unicode-*.  Encodings of un-representable Unicode characters are
-decoded asis into eight-bit-control and eight-bit-graphic
-characters.")
+mule-unicode-*, but see also `utf-fragmentation-table' and
+`ucs-mule-cjk-to-unicode'.
+Encodings of un-representable Unicode characters are decoded asis into
+eight-bit-control and eight-bit-graphic characters.")
 
 (define-ccl-program ccl-encode-mule-utf-8
   `(1
@@ -269,7 +279,8 @@ characters.")
      (loop
       (if (r5 < 0)
 	  ((r1 = -1)
-	   (read-multibyte-character r0 r1))
+	   (read-multibyte-character r0 r1)
+	   (translate-character utf-translation-table-for-encode r0 r1))
 	(;; We have already done read-multibyte-character.
 	 (r0 = r5)
 	 (r1 = r6)
@@ -376,30 +387,26 @@ characters.")
 	  ((write #xc2)
 	   (write r1)))))
 
-  "CCL program to encode into UTF-8.
-Only characters from the charsets ascii, eight-bit-control,
-eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
-Others are encoded as U+FFFD.")
+  "CCL program to encode into UTF-8.")
 
 (make-coding-system
  'mule-utf-8 4 ?u
  "UTF-8 encoding for Emacs-supported Unicode characters.
-The supported Emacs character sets are:
-   ascii
-   eight-bit-control
-   eight-bit-graphic
-   latin-iso8859-1
-   mule-unicode-0100-24ff
-   mule-unicode-2500-33ff
-   mule-unicode-e000-ffff
-
-Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
-are decoded into sequences of eight-bit-control and eight-bit-graphic
-characters to preserve their byte sequences.  The byte sequence is
-preserved on i/o for valid utf-8, but not necessarily for invalid
-utf-8.
-
-Emacs characters not from the above charsets are encoded into U+FFFD."
+It supports Unicode characters of these ranges:
+    U+0000..U+33FF, U+E000..U+FFFF.
+They correspond to these Emacs character sets:
+    ascii, latin-iso8859-1, mule-unicode-0100-24ff,
+    mule-unicode-2500-33ff, mule-unicode-e000-ffff
+
+On decoding (e.g. reading a file), Unicode characters not in the above
+ranges are decoded into sequences of eight-bit-control and
+eight-bit-graphic characters to preserve their byte sequences.  The
+byte sequence is preserved on i/o for valid utf-8, but not necessarily
+for invalid utf-8.
+
+On encoding (e.g. writing a file), Emacs characters not belonging to
+any of the character sets listed above are encoded into the UTF-8 byte
+sequence representing U+FFFD (REPLACEMENT CHARACTER)."
 
  '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
  '((safe-charsets
@@ -412,7 +419,10 @@ Emacs characters not from the above charsets are encoded into U+FFFD."
     mule-unicode-e000-ffff)
    (mime-charset . utf-8)
    (coding-category . coding-category-utf-8)
-   (valid-codes (0 . 255))))
+   (valid-codes (0 . 255))
+   (post-read-conversion . utf-8-post-read-conversion)
+   (dependency unify-8859-on-encoding-mode
+	       unify-8859-on-decoding-mode)))
 
 (define-coding-system-alias 'utf-8 'mule-utf-8)
author	Kenichi Handa <handa@m17n.org>	2002-10-01 06:57:47 +0000
committer	Kenichi Handa <handa@m17n.org>	2002-10-01 06:57:47 +0000
commit	620d0ea95bd1d06c89b2d6b9318e996f03a09838 (patch)
tree	420f64129b7e6dbfb89286b3a1fa9e2f5a00454f
parent	3c18889e3b62cd0f133ba0f392984212a979ef26 (diff)
download	emacs-620d0ea95bd1d06c89b2d6b9318e996f03a09838.tar.gz