summaryrefslogtreecommitdiff
path: root/admin/unidata/unidata-gen.el
diff options
context:
space:
mode:
author Lars Ingebrigtsen <larsi@gnus.org> 2022-01-17 15:47:37 +0100
committer Lars Ingebrigtsen <larsi@gnus.org> 2022-01-17 15:47:50 +0100
commit e2c8091113404971de75a893fb30cac591a82694 (patch)
tree 850cbb3bbffb0fbe9a2e0a6403e418bf8c908fa7 /admin/unidata/unidata-gen.el
parent 39d4e1ca21f3270d4835d5efa8862efc618c4cd9 (diff)
download emacs-e2c8091113404971de75a893fb30cac591a82694.tar.gz
Add support for functions that deal with Unicode scripts
* admin/unidata/Makefile.in (${unidir}/uni-scripts.el): Build uni-scripts.el. * admin/unidata/Scripts.txt: * admin/unidata/ScriptExtensions.txt: * admin/unidata/PropertyValueAliases.txt: New files from Unicode. * admin/unidata/README: Update. * admin/unidata/unidata-gen.el (unidata-gen-charprop): Allow writing other data, too. (unidata-gen-scripts, unidata-gen--read-script-aliases) (unidata-gen--insert-file): New functions to parse the Script* files. * lisp/international/textsec.el: Implement some functions that work on scripts.
Diffstat (limited to 'admin/unidata/unidata-gen.el')
-rw-r--r-- admin/unidata/unidata-gen.el 117
1 files changed, 110 insertions, 7 deletions
diff --git a/admin/unidata/unidata-gen.el b/admin/unidata/unidata-gen.el
index d6b5a476bb0..f0538d70e21 100644
--- a/admin/unidata/unidata-gen.el
+++ b/admin/unidata/unidata-gen.el
@@ -1449,20 +1449,24 @@ Property value is a symbol `o' (Open), `c' (Close), or `n' (None)."
(format ";;; %s ends here\n" basename)))))
(or noninteractive (message "Generating %s...done" file)))
-(defun unidata-gen-charprop (&optional charprop-file)
+(defun unidata-gen-charprop (&optional charprop-file text)
(or charprop-file (setq charprop-file (pop command-line-args-left)))
(with-temp-file charprop-file
(insert ";; Automatically generated by unidata-gen.el."
" -*- lexical-binding: t -*-\n"
";; See the admin/unidata/ directory in the Emacs sources.\n")
- (dolist (elt unidata-file-alist)
- (dolist (proplist (cdr elt))
- (insert (format "(define-char-code-property '%S %S\n %S)\n"
- (unidata-prop-prop proplist) (car elt)
- (unidata-prop-docstring proplist)))))
+ (if text
+ (insert text)
+ (dolist (elt unidata-file-alist)
+ (dolist (proplist (cdr elt))
+ (insert (format "(define-char-code-property '%S %S\n %S)\n"
+ (unidata-prop-prop proplist) (car elt)
+ (unidata-prop-docstring proplist))))))
(or noninteractive (message "Writing %s..." charprop-file))
(insert "\n"
- "(provide 'charprop)\n"
+ (format "(provide '%s)\n"
+ (file-name-sans-extension
+ (file-name-nondirectory charprop-file)))
" \n"
";; Local Variables:\n"
";; coding: utf-8\n"
@@ -1473,6 +1477,105 @@ Property value is a symbol `o' (Open), `c' (Close), or `n' (None)."
(format ";;; %s ends here\n"
(file-name-nondirectory charprop-file)))))
+(defun unidata-gen-scripts (&optional file)
+  "Write FILE, a table of the scripts each character belongs to.
+If FILE is nil, it is popped from `command-line-args-left' (the
+case when running from the Makefile).
+
+The data is read from Scripts.txt and ScriptExtensions.txt via
+`unidata-gen--insert-file'.  Characters are grouped into buckets
+of identical script lists, and the result is written as a
+`textsec--create-script-table' form by `unidata-gen-charprop'.
+
+Signal an error if ScriptExtensions.txt uses a script alias that
+`unidata-gen--read-script-aliases' did not return."
+  ;; Running from Makefile.
+  (unless file
+    (setq file (pop command-line-args-left)))
+  (let ((aliases (unidata-gen--read-script-aliases))
+        (table (make-char-table nil))
+        (segmented (make-hash-table :test #'equal)))
+    ;; First parse the scripts.
+    (with-temp-buffer
+      (unidata-gen--insert-file "Scripts.txt")
+      (while (not (eobp))
+        ;; 1700..1711 ; Tagalog # Lo [18] TAGALOG LETTER A..TAGALOG
+        (when (looking-at "\\([0-9A-F]+\\)\\(?:\\.\\.\\([0-9A-F]+\\)\\)? +; +\\([^ ]+\\) +#")
+          (let ((start (string-to-number (match-string 1) 16))
+                (end (and (match-string 2)
+                          (string-to-number (match-string 2) 16)))
+                (scripts (list (intern (string-replace
+                                        "_" "-"
+                                        (downcase (match-string 3)))))))
+            (set-char-table-range
+             table (if end (cons start end) start) scripts)))
+        (forward-line 1)))
+
+    ;; Then parse the file that lists "other scripts" that characters
+    ;; may appear in, and add those.
+    (with-temp-buffer
+      (unidata-gen--insert-file "ScriptExtensions.txt")
+      (while (not (eobp))
+        ;; 102E0 ; Arab Copt # Mn COPTIC EPACT THOUSANDS MARK
+        (when (looking-at "\\([0-9A-F]+\\)\\(?:\\.\\.\\([0-9A-F]+\\)\\)? +; +\\([^#]+\\)")
+          (let ((start (string-to-number (match-string 1) 16))
+                (end (and (match-string 2)
+                          (string-to-number (match-string 2) 16)))
+                (scripts
+                 (mapcar
+                  (lambda (alias)
+                    (let ((name (gethash alias aliases)))
+                      ;; Fail with a clear message instead of letting
+                      ;; `downcase' choke on nil.
+                      (unless name
+                        (error "Unknown script alias: %s" alias))
+                      (intern (string-replace "_" "-" (downcase name)))))
+                  (split-string (string-trim (match-string 3))))))
+            ;; Append each extra script to every character in the
+            ;; range; `append' builds a fresh list per character.
+            (dolist (script scripts)
+              (dotimes (i (- (1+ (or end start)) start))
+                (set-char-table-range
+                 table (+ i start)
+                 (append (elt table (+ i start)) (list script)))))))
+        (forward-line 1)))
+
+    ;; Then go through the data and collect into buckets based on
+    ;; identical script lists.
+    (map-char-table
+     (lambda (key value)
+       ;; The range cons passed as KEY is apparently reused by
+       ;; `map-char-table' between calls, so copy it.
+       (push (if (consp key)
+                 (cons (car key) (cdr key))
+               key)
+             ;; Keep the first element first, but sort the rest.
+             ;; `remq' can return a tail of VALUE itself and `sort'
+             ;; is destructive, so sort a copy to leave the values
+             ;; stored in TABLE untouched.
+             (gethash (cons (car value)
+                            (sort (copy-sequence (remq (car value) value))
+                                  #'string<))
+                      segmented)))
+     table)
+
+    ;; Finally, order the buckets by their first script and write
+    ;; them out as a single form.
+    (let ((scripts nil))
+      (maphash
+       (lambda (segment chars)
+         (push (cons segment chars) scripts))
+       segmented)
+      (setq scripts (sort scripts (lambda (s1 s2)
+                                    (string< (caar s1) (caar s2)))))
+      (with-temp-buffer
+        (insert "(textsec--create-script-table '(\n")
+        (dolist (script scripts)
+          (insert "(" (prin1-to-string (car script)) "\n")
+          (insert " " (prin1-to-string (cdr script)))
+          (insert ")\n"))
+        (insert "))\n")
+        ;; Write the file.
+        (unidata-gen-charprop file (buffer-string))))))
+
+(defun unidata-gen--read-script-aliases ()
+  "Return a hash table built from the Script section of PropertyValueAliases.txt.
+The file is read with `unidata-gen--insert-file'.  Starting two
+lines below the line beginning with `# Script ', every
+consecutive line matching `sc *;' is split on semicolons; the
+second field becomes a key and the third field its value
+\(presumably the short alias and the long script name).
+
+Signal an error if the Script section can't be found."
+  (with-temp-buffer
+    (unidata-gen--insert-file "PropertyValueAliases.txt")
+    (if (not (re-search-forward "^# Script " nil t))
+        (error "Can't find the Script section"))
+    ;; Move from the section header to the first entry.
+    (forward-line 2)
+    (let ((table (make-hash-table :test #'equal)))
+      (while (looking-at "sc *;")
+        (let* ((line (buffer-substring (point) (line-end-position)))
+               (fields (split-string line ";" nil "[ \t]+")))
+          (puthash (nth 1 fields) (nth 2 fields) table))
+        (forward-line 1))
+      table)))
+
+(defun unidata-gen--insert-file (name)
+  "Insert the contents of the file NAME into the current buffer.
+NAME is looked up under ../admin/unidata/ relative to
+`data-directory'."
+  (let ((relative (concat "../admin/unidata/" name)))
+    (insert-file-contents (expand-file-name relative data-directory))))
+
;;; unidata-gen.el ends here