summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKenichi Handa <handa@m17n.org>2001-03-30 12:18:01 +0000
committerKenichi Handa <handa@m17n.org>2001-03-30 12:18:01 +0000
commitaa15b3e5a60ddd926ae8e10ac99ee4821c3e0062 (patch)
tree26ab5c61866051e731fabebeca470b3418d31907
parenta2217cd80f15ad44753c75789202f3b7b0343c46 (diff)
downloademacs-aa15b3e5a60ddd926ae8e10ac99ee4821c3e0062.tar.gz
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
-rw-r--r--lisp/international/utf-8.el182
1 files changed, 107 insertions, 75 deletions
diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el
index acd9e14706a..d2972fe1539 100644
--- a/lisp/international/utf-8.el
+++ b/lisp/international/utf-8.el
@@ -3,6 +3,7 @@
;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
;; Licensed to the Free Software Foundation.
+;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
;; Keywords: multilingual, Unicode, UTF-8, i18n
;; This file is part of GNU Emacs.
@@ -186,85 +187,116 @@ characters.")
(define-ccl-program ccl-encode-mule-utf-8
`(1
- (loop
- (read-multibyte-character r0 r1)
-
- (if (r0 == ,(charset-id 'ascii))
- (write r1)
-
- (if (r0 == ,(charset-id 'latin-iso8859-1))
- ;; r1 scalar utf-8
- ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
- ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
- ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
- ((r0 = (((r1 & #x40) >> 6) | #xc2))
- (r1 &= #x3f)
- (r1 |= #x80)
- (write r0 r1))
-
- (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
- ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
- ;; #x3f80 == (0011 1111 1000 0000)b
- (r1 &= #x7f)
- (r1 += (r0 + 224)) ; 240 == -32 + #x0100
- ;; now r1 holds scalar value
- (if (r1 < #x0800)
- ;; 2byte encoding
- ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
- ;; #x07c0 == (0000 0111 1100 0000)b
- (r1 &= #x3f)
- (r1 |= #x80)
- (write r0 r1))
- ;; 3byte encoding
- ((r0 = (((r1 & #xf000) >> 12) | #xe0))
- (r2 = ((r1 & #x3f) | #x80))
- (r1 &= #x0fc0)
- (r1 >>= 6)
- (r1 |= #x80)
- (write r0 r1 r2))))
-
- (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
- ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
- (r1 &= #x7f)
- (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
- (r0 = (((r1 & #xf000) >> 12) | #xe0))
- (r2 = ((r1 & #x3f) | #x80))
- (r1 &= #x0fc0)
- (r1 >>= 6)
- (r1 |= #x80)
- (write r0 r1 r2))
-
- (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
- ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
- (r1 &= #x7f)
- (r1 += (r0 + 57312)) ; 57312 == -160 + #xe000
- (r0 = (((r1 & #xf000) >> 12) | #xe0))
+ ((r5 = -1)
+ (loop
+ (if (r5 < 0)
+ ((r1 = -1)
+ (read-multibyte-character r0 r1))
+ (;; We have already done read-multibyte-character.
+ (r0 = r5)
+ (r1 = r6)
+ (r5 = -1)))
+
+ (if (r0 == ,(charset-id 'ascii))
+ (write r1)
+
+ (if (r0 == ,(charset-id 'latin-iso8859-1))
+ ;; r1 scalar utf-8
+ ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
+ ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
+ ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
+ ((r0 = (((r1 & #x40) >> 6) | #xc2))
+ (r1 &= #x3f)
+ (r1 |= #x80)
+ (write r0 r1))
+
+ (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
+ ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
+ ;; #x3f80 == (0011 1111 1000 0000)b
+ (r1 &= #x7f)
+ (r1 += (r0 + 224)) ; 224 == -32 + #x0100
+ ;; now r1 holds scalar value
+ (if (r1 < #x0800)
+ ;; 2byte encoding
+ ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
+ ;; #x07c0 == (0000 0111 1100 0000)b
+ (r1 &= #x3f)
+ (r1 |= #x80)
+ (write r0 r1))
+ ;; 3byte encoding
+ ((r0 = (((r1 & #xf000) >> 12) | #xe0))
(r2 = ((r1 & #x3f) | #x80))
(r1 &= #x0fc0)
(r1 >>= 6)
(r1 |= #x80)
- (write r0 r1 r2))
-
- (if (r0 == ,(charset-id 'eight-bit-control))
- ;; r1 scalar utf-8
- ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
- ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
- ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
- (write r1)
-
- (if (r0 == ,(charset-id 'eight-bit-graphic))
- ;; r1 scalar utf-8
- ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
- ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
- ;; ff 0000 0000 1111 1111 1101 1111 1011 1111
- (write r1)
-
- ;; Unsupported character.
- ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
- ((write #xef)
- (write #xbf)
- (write #xbd)))))))))
- (repeat)))
+ (write r0 r1 r2))))
+
+ (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
+ ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
+ (r1 &= #x7f)
+ (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
+ (r0 = (((r1 & #xf000) >> 12) | #xe0))
+ (r2 = ((r1 & #x3f) | #x80))
+ (r1 &= #x0fc0)
+ (r1 >>= 6)
+ (r1 |= #x80)
+ (write r0 r1 r2))
+
+ (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
+ ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
+ (r1 &= #x7f)
+ (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
+ (r0 = (((r1 & #xf000) >> 12) | #xe0))
+ (r2 = ((r1 & #x3f) | #x80))
+ (r1 &= #x0fc0)
+ (r1 >>= 6)
+ (r1 |= #x80)
+ (write r0 r1 r2))
+
+ (if (r0 == ,(charset-id 'eight-bit-control))
+ ;; r1 scalar utf-8
+ ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
+ ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
+ ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
+ ((write #xc2)
+ (write r1))
+
+ (if (r0 == ,(charset-id 'eight-bit-graphic))
+ ;; r1 scalar utf-8
+ ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
+ ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
+ ;; ff 0000 0000 1111 1111 1100 0011 1011 1111
+ ((write r1)
+ (r1 = -1)
+ (read-multibyte-character r0 r1)
+ (if (r0 != ,(charset-id 'eight-bit-graphic))
+ (if (r0 != ,(charset-id 'eight-bit-control))
+ ((r5 = r0)
+ (r6 = r1))))
+ (if (r5 < 0)
+ ((read-multibyte-character r0 r2)
+ (if (r0 != ,(charset-id 'eight-bit-graphic))
+ (if (r0 != ,(charset-id 'eight-bit-control))
+ ((r5 = r0)
+ (r6 = r2))))
+ (if (r5 < 0)
+ (write r1 r2)
+ (if (r1 < #xa0)
+ (write r1)
+ ((write #xc2)
+ (write r1)))))))
+
+ ;; Unsupported character.
+ ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
+ ((write #xef)
+ (write #xbf)
+ (write #xbd)))))))))
+ (repeat)))
+ (if (r1 >= #xa0)
+ (write r1)
+ (if (r1 >= #x80)
+ ((write #xc2)
+ (write r1)))))
"CCL program to encode into UTF-8.
Only characters from the charsets ascii, eight-bit-control,