diff options
author | Lars Ingebrigtsen <larsi@gnus.org> | 2022-01-17 15:47:37 +0100 |
---|---|---|
committer | Lars Ingebrigtsen <larsi@gnus.org> | 2022-01-17 15:47:50 +0100 |
commit | e2c8091113404971de75a893fb30cac591a82694 (patch) | |
tree | 850cbb3bbffb0fbe9a2e0a6403e418bf8c908fa7 /admin/unidata/unidata-gen.el | |
parent | 39d4e1ca21f3270d4835d5efa8862efc618c4cd9 (diff) | |
download | emacs-e2c8091113404971de75a893fb30cac591a82694.tar.gz |
Add support for functions that deal with Unicode scripts
* admin/unidata/Makefile.in (${unidir}/uni-scripts.el): Build
uni-scripts.el.
* admin/unidata/Scripts.txt:
* admin/unidata/ScriptExtensions.txt:
* admin/unidata/PropertyValueAliases.txt: New files from Unicode.
* admin/unidata/README: Update.
* admin/unidata/unidata-gen.el (unidata-gen-charprop): Allow
writing other data, too.
(unidata-gen-scripts, unidata-gen--read-script-aliases)
(unidata-gen--insert-file): New functions to parse the Script* files.
* lisp/international/textsec.el: Implement some functions that
work on scripts.
Diffstat (limited to 'admin/unidata/unidata-gen.el')
-rw-r--r-- | admin/unidata/unidata-gen.el | 117 |
1 files changed, 110 insertions, 7 deletions
```diff
diff --git a/admin/unidata/unidata-gen.el b/admin/unidata/unidata-gen.el
index d6b5a476bb0..f0538d70e21 100644
--- a/admin/unidata/unidata-gen.el
+++ b/admin/unidata/unidata-gen.el
@@ -1449,20 +1449,24 @@ Property value is a symbol `o' (Open), `c' (Close), or `n' (None)."
                    (format ";;; %s ends here\n" basename)))))
     (or noninteractive (message "Generating %s...done" file)))
 
-(defun unidata-gen-charprop (&optional charprop-file)
+(defun unidata-gen-charprop (&optional charprop-file text)
   (or charprop-file (setq charprop-file (pop command-line-args-left)))
   (with-temp-file charprop-file
     (insert ";; Automatically generated by unidata-gen.el."
             " -*- lexical-binding: t -*-\n"
             ";; See the admin/unidata/ directory in the Emacs sources.\n")
-    (dolist (elt unidata-file-alist)
-      (dolist (proplist (cdr elt))
-        (insert (format "(define-char-code-property '%S %S\n %S)\n"
-                        (unidata-prop-prop proplist) (car elt)
-                        (unidata-prop-docstring proplist)))))
+    (if text
+        (insert text)
+      (dolist (elt unidata-file-alist)
+        (dolist (proplist (cdr elt))
+          (insert (format "(define-char-code-property '%S %S\n %S)\n"
+                          (unidata-prop-prop proplist) (car elt)
+                          (unidata-prop-docstring proplist))))))
     (or noninteractive (message "Writing %s..." charprop-file))
     (insert "\n"
-            "(provide 'charprop)\n"
+            (format "(provide '%s)\n"
+                    (file-name-sans-extension
+                     (file-name-nondirectory charprop-file)))
             "\n"
             ";; Local Variables:\n"
             ";; coding: utf-8\n"
@@ -1473,6 +1477,105 @@ Property value is a symbol `o' (Open), `c' (Close), or `n' (None)."
             (format ";;; %s ends here\n"
                     (file-name-nondirectory charprop-file)))))
 
+(defun unidata-gen-scripts (&optional file)
+  ;; Running from Makefile.
+  (unless file
+    (setq file (pop command-line-args-left)))
+  (let ((aliases (unidata-gen--read-script-aliases))
+        (table (make-char-table nil))
+        (segmented (make-hash-table :test #'equal)))
+    ;; First parse the scripts.
+    (with-temp-buffer
+      (unidata-gen--insert-file "Scripts.txt")
+      (while (not (eobp))
+        ;; 1700..1711 ; Tagalog # Lo [18] TAGALOG LETTER A..TAGALOG
+        (when (looking-at "\\([0-9A-F]+\\)\\(?:\\.\\.\\([0-9A-F]+\\)\\)? +; +\\([^ ]+\\) +#")
+          (let ((start (string-to-number (match-string 1) 16))
+                (end (and (match-string 2)
+                          (string-to-number (match-string 2) 16)))
+                (scripts (list (intern (string-replace
+                                        "_" "-"
+                                        (downcase (match-string 3)))))))
+            (set-char-table-range
+             table (if end (cons start end) start) scripts)))
+        (forward-line 1)))
+
+    ;; Then parse the file that lists "other scripts" that characters
+    ;; may appear in, and add those.
+    (with-temp-buffer
+      (unidata-gen--insert-file "ScriptExtensions.txt")
+      (while (not (eobp))
+        ;; 102E0 ; Arab Copt # Mn COPTIC EPACT THOUSANDS MARK
+        (when (looking-at "\\([0-9A-F]+\\)\\(?:\\.\\.\\([0-9A-F]+\\)\\)? +; +\\([^#]+\\)")
+          (let ((start (string-to-number (match-string 1) 16))
+                (end (and (match-string 2)
+                          (string-to-number (match-string 2) 16)))
+                (scripts
+                 (mapcar
+                  (lambda (alias)
+                    (intern (string-replace
+                             "_" "-" (downcase
+                                      (gethash alias aliases)))))
+                  (split-string (string-trim (match-string 3))))))
+            (dolist (script scripts)
+              (dotimes (i (- (1+ (or end start)) start))
+                (set-char-table-range
+                 table (+ i start)
+                 (append (elt table (+ i start)) (list script)))))))
+        (forward-line 1)))
+
+    ;; Then go through the data and collect into buckets based on
+    ;; identical script lists.
+    (map-char-table
+     (lambda (key value)
+       ;; `map-char-table' is reused, so copy it.
+       (push (if (consp key)
+                 (cons (car key) (cdr key))
+               key)
+             ;; Keep the first element first, but sort the rest.
+             (gethash (cons (car value)
+                            (sort (remq (car value) value) #'string<))
+                      segmented)))
+     table)
+
+    ;; Then go through the data and collect into buckets based on
+    (let ((scripts nil))
+      (maphash
+       (lambda (segment chars)
+         (push (cons segment chars) scripts))
+       segmented)
+      (setq scripts (sort scripts (lambda (s1 s2)
+                                    (string< (caar s1) (caar s2)))))
+      (with-temp-buffer
+        (insert "(textsec--create-script-table '(\n")
+        (dolist (script scripts)
+          (insert "(" (prin1-to-string (car script)) "\n")
+          (insert " " (prin1-to-string (cdr script)))
+          (insert ")\n"))
+        (insert "))\n")
+        ;; Write the file.
+        (unidata-gen-charprop file (buffer-string))))))
+
+(defun unidata-gen--read-script-aliases ()
+  (let ((aliases (make-hash-table :test #'equal)))
+    (with-temp-buffer
+      (unidata-gen--insert-file "PropertyValueAliases.txt")
+      (unless (re-search-forward "^# Script " nil t)
+        (error "Can't find the Script section"))
+      (forward-line 2)
+      (while (looking-at "sc *;")
+        (let ((elem (split-string (buffer-substring (point) (line-end-position))
+                                  ";" nil "[ \t]+")))
+          (setf (gethash (nth 1 elem) aliases)
+                (nth 2 elem)))
+        (forward-line 1))
+      aliases)))
+
+(defun unidata-gen--insert-file (name)
+  (insert-file-contents
+   (expand-file-name (concat "../admin/unidata/" name)
+                     data-directory)))
+
 
 
 ;;; unidata-gen.el ends here
```