summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Kenichi Handa <handa@m17n.org> 2002-10-01 06:57:47 +0000
committer: Kenichi Handa <handa@m17n.org> 2002-10-01 06:57:47 +0000
commit: 620d0ea95bd1d06c89b2d6b9318e996f03a09838 (patch)
tree: 420f64129b7e6dbfb89286b3a1fa9e2f5a00454f
parent: 3c18889e3b62cd0f133ba0f392984212a979ef26 (diff)
download: emacs-620d0ea95bd1d06c89b2d6b9318e996f03a09838.tar.gz
Synchronized with the code in 21.4
and deleted the code for features that are not yet supported: utf-fragment-on-decoding and utf-translate-cjk.
-rw-r--r-- lisp/international/utf-8.el | 80
1 file changed, 45 insertions, 35 deletions
diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el
index 8d5c50b450b..e6d7434e16e 100644
--- a/lisp/international/utf-8.el
+++ b/lisp/international/utf-8.el
@@ -27,8 +27,8 @@
;;; Commentary:
-;; The coding-system `mule-utf-8' supports encoding/decoding of the
-;; following character sets to and from UTF-8:
+;; The coding-system `mule-utf-8' basically supports encoding/decoding
+;; of the following character sets to and from UTF-8:
;;
;; ascii
;; eight-bit-control
@@ -37,18 +37,19 @@
;; mule-unicode-2500-33ff
;; mule-unicode-e000-ffff
;;
-;; Characters of other character sets cannot be encoded with
-;; mule-utf-8.
-;;
;; On decoding, Unicode characters that do not fit into the above
;; character sets are handled as `eight-bit-control' or
;; `eight-bit-graphic' characters to retain the information about the
-;; original byte sequence.
+;; original byte sequence and text properties record the corresponding
+;; unicode.
+;;
+;; Fixme: note that reading and writing invalid utf-8 may not be
+;; idempotent -- to represent the bytes to fix that needs a new charset.
;;
-;; Fixme: note that reading and writing invalid utf-8, even without
-;; editing it, may alter the text. Fixing that needs a new charset to
-;; represent the raw bytes in the eight-bit-control range, which are
-;; otherwise valid unicodes.
+;; Characters from other character sets can be encoded with mule-utf-8
+;; by populating the translation-table
+;; `utf-translation-table-for-encode' and registering the translation
+;; with `register-char-codings'.
;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
@@ -61,6 +62,14 @@
;;; Code:
+(defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil)
+ "Char table mapping characters to latin-iso8859-1 or mule-unicode-*.
+
+If `unify-8859-on-encoding-mode' is non-nil, this table populates the
+translation-table named `utf-translation-table-for-encode'.")
+
+(define-translation-table 'utf-translation-table-for-encode)
+
(define-ccl-program ccl-decode-mule-utf-8
;;
;; charset | bytes in utf-8 | bytes in emacs
@@ -259,9 +268,10 @@
"CCL program to decode UTF-8.
Basic decoding is done into the charsets ascii, latin-iso8859-1 and
-mule-unicode-*. Encodings of un-representable Unicode characters are
-decoded asis into eight-bit-control and eight-bit-graphic
-characters.")
+mule-unicode-*, but see also `utf-fragmentation-table' and
+`ucs-mule-cjk-to-unicode'.
+Encodings of un-representable Unicode characters are decoded asis into
+eight-bit-control and eight-bit-graphic characters.")
(define-ccl-program ccl-encode-mule-utf-8
`(1
@@ -269,7 +279,8 @@ characters.")
(loop
(if (r5 < 0)
((r1 = -1)
- (read-multibyte-character r0 r1))
+ (read-multibyte-character r0 r1)
+ (translate-character utf-translation-table-for-encode r0 r1))
(;; We have already done read-multibyte-character.
(r0 = r5)
(r1 = r6)
@@ -376,30 +387,26 @@ characters.")
((write #xc2)
(write r1)))))
- "CCL program to encode into UTF-8.
-Only characters from the charsets ascii, eight-bit-control,
-eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
-Others are encoded as U+FFFD.")
+ "CCL program to encode into UTF-8.")
(make-coding-system
'mule-utf-8 4 ?u
"UTF-8 encoding for Emacs-supported Unicode characters.
-The supported Emacs character sets are:
- ascii
- eight-bit-control
- eight-bit-graphic
- latin-iso8859-1
- mule-unicode-0100-24ff
- mule-unicode-2500-33ff
- mule-unicode-e000-ffff
-
-Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
-are decoded into sequences of eight-bit-control and eight-bit-graphic
-characters to preserve their byte sequences. The byte sequence is
-preserved on i/o for valid utf-8, but not necessarily for invalid
-utf-8.
-
-Emacs characters not from the above charsets are encoded into U+FFFD."
+It supports Unicode characters of these ranges:
+ U+0000..U+33FF, U+E000..U+FFFF.
+They correspond to these Emacs character sets:
+ ascii, latin-iso8859-1, mule-unicode-0100-24ff,
+ mule-unicode-2500-33ff, mule-unicode-e000-ffff
+
+On decoding (e.g. reading a file), Unicode characters not in the above
+ranges are decoded into sequences of eight-bit-control and
+eight-bit-graphic characters to preserve their byte sequences. The
+byte sequence is preserved on i/o for valid utf-8, but not necessarily
+for invalid utf-8.
+
+On encoding (e.g. writing a file), Emacs characters not belonging to
+any of the character sets listed above are encoded into the UTF-8 byte
+sequence representing U+FFFD (REPLACEMENT CHARACTER)."
'(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
'((safe-charsets
@@ -412,7 +419,10 @@ Emacs characters not from the above charsets are encoded into U+FFFD."
mule-unicode-e000-ffff)
(mime-charset . utf-8)
(coding-category . coding-category-utf-8)
- (valid-codes (0 . 255))))
+ (valid-codes (0 . 255))
+ (post-read-conversion . utf-8-post-read-conversion)
+ (dependency unify-8859-on-encoding-mode
+ unify-8859-on-decoding-mode)))
(define-coding-system-alias 'utf-8 'mule-utf-8)