Support casing characters which map into multiple code points (bug#24603)

Implement unconditional special casing rules defined in Unicode standard. Among other things, they deal with cases when a single code point is replaced by multiple ones because single character does not exist (e.g. ‘ﬁ’ ligature turning into ‘FL’) or is not commonly used (e.g. ß turning into SS). * admin/unidata/SpecialCasing.txt: New data file pulled from Unicode standard distribution. * admin/unidata/README: Mention SpecialCasing.txt. * admin/unidata/unidata-get.el (unidata-gen-table-special-casing, unidata-gen-table-special-casing--do-load): New functions generating ‘special-uppercase’, ‘special-lowercase’ and ‘special-titlecase’ character Unicode properties built from the SpecialCasing.txt Unicode data file. * src/casefiddle.c (struct casing_str_buf): New structure for representing short strings used to handle one-to-many character mappings. (case_character_imlp): New function which can handle one-to-many character mappings. (case_character, case_single_character): Wrappers for the above functions. The former may map one character to multiple (or no) code points while the latter does what the former used to do (i.e. handles one-to-one mappings only). (do_casify_natnum, do_casify_unibyte_string, do_casify_unibyte_region): Use case_single_character. (do_casify_multibyte_string, do_casify_multibyte_region): Support new features of case_character. * (do_casify_region): Updated to reflact do_casify_multibyte_string changes. (casify_word): Handle situation when one character-length of a word can change affecting where end of the word is. (upcase, capitalize, upcase-initials): Update documentation to mention limitations when working on characters. * test/src/casefiddle-tests.el (casefiddle-tests-char-properties): Add test cases for the newly introduced character properties. (casefiddle-tests-casing): Update test cases which are now passing. * test/lisp/char-fold-tests.el (char-fold--ascii-upcase, char-fold--ascii-downcase): New functions which behave like old ‘upcase’ and ‘downcase’. (char-fold--test-match-exactly): Use the new functions. This is needed because otherwise ﬁ and similar characters are turned into their multi- -character representation. * doc/lispref/strings.texi: Describe issue with casing characters versus strings. * doc/lispref/nonascii.texi: Describe the new character properties.
author: Michal Nazarewicz <mina86@mina86.com> 2016-10-05 00:06:01 +0200
committer: Michal Nazarewicz <mina86@mina86.com> 2017-04-06 20:54:58 +0200
commit: b3b9b258c4026baa1cad3f2e617f1a637fc8d205 (patch)
tree: 1520ef9f5a3204784c597fcf2bf7a7c7fc1b8d7c /src/casefiddle.c
parent: 2c87dabd0460cce83d2345b4ddff159969674fef (diff)
download: emacs-b3b9b258c4026baa1cad3f2e617f1a637fc8d205.tar.gz
1 files changed, 205 insertions, 84 deletions
diff --git a/src/casefiddle.c b/src/casefiddle.c
index b1a5f8e236e..10674d963ec 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -1,3 +1,4 @@
+/* -*- coding: utf-8 -*- */
 /* GNU Emacs case conversion functions.
 
 Copyright (C) 1985, 1994, 1997-1999, 2001-2017 Free Software Foundation,
@@ -36,6 +37,9 @@ struct casing_context {
   /* A char-table with title-case character mappings or nil.  Non-nil implies
      flag is CASE_CAPITALIZE or CASE_CAPITALIZE_UP.  */
   Lisp_Object titlecase_char_table;
+  /* The unconditional special-casing Unicode property char tables for upper
+     casing, lower casing and title casing respectively. */
+  Lisp_Object specialcase_char_tables[3];
   /* User-requested action. */
   enum case_action flag;
   /* If true, function operates on a buffer as opposed to a string or character.
@@ -58,6 +62,13 @@ prepare_casing_context (struct casing_context *ctx,
   ctx->inword = flag == CASE_DOWN;
   ctx->titlecase_char_table = (int)flag < (int)CASE_CAPITALIZE ? Qnil :
     uniprop_table (intern_c_string ("titlecase"));
+  ctx->specialcase_char_tables[CASE_UP] = flag == CASE_DOWN ? Qnil :
+    uniprop_table (intern_c_string ("special-uppercase"));
+  ctx->specialcase_char_tables[CASE_DOWN] = flag == CASE_UP ? Qnil :
+    uniprop_table (intern_c_string ("special-lowercase"));
+  ctx->specialcase_char_tables[CASE_CAPITALIZE] =
+    (int)flag < (int)CASE_CAPITALIZE ? Qnil :
+    uniprop_table (intern_c_string ("special-titlecase"));
 
   /* If the case table is flagged as modified, rescan it.  */
   if (NILP (XCHAR_TABLE (BVAR (current_buffer, downcase_table))->extras[1]))
@@ -67,25 +78,98 @@ prepare_casing_context (struct casing_context *ctx,
     SETUP_BUFFER_SYNTAX_TABLE ();	/* For syntax_prefix_flag_p.  */
 }
 
-/* Based on CTX, case character CH accordingly.  Update CTX as necessary.
-   Return cased character. */
+struct casing_str_buf {
+  unsigned char data[MAX_MULTIBYTE_LENGTH > 6 ? MAX_MULTIBYTE_LENGTH : 6];
+  unsigned char len_chars;
+  unsigned char len_bytes;
+};
+
+/* Based on CTX, case character CH.  If BUF is NULL, return cased character.
+   Otherwise, if BUF is non-NULL, save result in it and return whether the
+   character has been changed.
+
+   Since meaning of return value depends on arguments, it’s more convenient to
+   use case_single_character or case_character instead. */
 static int
-case_character (struct casing_context *ctx, int ch)
+case_character_impl (struct casing_str_buf *buf,
+		     struct casing_context *ctx, int ch)
 {
+  enum case_action flag;
   Lisp_Object prop;
+  bool was_inword;
+  int cased;
+
+  /* Update inword state */
+  was_inword = ctx->inword;
+  if ((int) ctx->flag >= (int) CASE_CAPITALIZE)
+    ctx->inword = SYNTAX (ch) == Sword &&
+      (!ctx->inbuffer || was_inword || !syntax_prefix_flag_p (ch));
+
+  /* Normalise flag so its one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */
+  if (!was_inword)
+    flag = ctx->flag == CASE_UP ? CASE_UP : CASE_CAPITALIZE;
+  else if (ctx->flag != CASE_CAPITALIZE_UP)
+    flag = CASE_DOWN;
+  else
+    {
+      cased = ch;
+      goto done;
+    }
+
+  /* Look through the special casing entries. */
+  if (buf && !NILP(ctx->specialcase_char_tables[(int)flag]))
+    {
+      prop = CHAR_TABLE_REF(ctx->specialcase_char_tables[(int)flag], ch);
+      if (STRINGP(prop))
+        {
+          struct Lisp_String *str = XSTRING(prop);
+          if (STRING_BYTES(str) <= sizeof buf->data)
+	    {
+	      buf->len_chars = str->size;
+	      buf->len_bytes = STRING_BYTES(str);
+	      memcpy(buf->data, str->data, buf->len_bytes);
+	      return 1;
+	    }
+        }
+    }
 
-  if (ctx->inword)
-    ch = ctx->flag == CASE_CAPITALIZE_UP ? ch : downcase (ch);
+  /* Handle simple, one-to-one case. */
+  if (flag == CASE_DOWN)
+    cased = downcase (ch);
   else if (!NILP (ctx->titlecase_char_table) &&
 	   CHARACTERP (prop = CHAR_TABLE_REF (ctx->titlecase_char_table, ch)))
-    ch = XFASTINT (prop);
+    cased = XFASTINT (prop);
   else
-    ch = upcase(ch);
+    cased = upcase(ch);
+
+  /* And we’re done. */
+ done:
+  if (!buf)
+    return cased;
+  buf->len_chars = 1;
+  buf->len_bytes = CHAR_STRING (cased, buf->data);
+  return cased != ch;
+}
 
-  if ((int) ctx->flag >= (int) CASE_CAPITALIZE)
-    ctx->inword = SYNTAX (ch) == Sword &&
-      (!ctx->inbuffer || ctx->inword || !syntax_prefix_flag_p (ch));
-  return ch;
+/* Based on CTX, case character CH accordingly.  Update CTX as necessary.
+   Return cased character.
+
+   Special casing rules (such as upcase(ﬁ) = FI) are not handled.  For
+   characters whose casing results in multiple code points, the character is
+   returned unchanged. */
+static inline int
+case_single_character (struct casing_context *ctx, int ch)
+{
+  return case_character_impl (NULL, ctx, ch);
+}
+
+/* Save in BUF result of casing character CH.  Return whether casing changed the
+   character.  This is like case_single_character but also handles one-to-many
+   casing rules. */
+static inline bool
+case_character (struct casing_str_buf *buf, struct casing_context *ctx, int ch)
+{
+  return case_character_impl (buf, ctx, ch);
 }
 
 static Lisp_Object
@@ -112,7 +196,7 @@ do_casify_natnum (struct casing_context *ctx, Lisp_Object obj)
     || !NILP (BVAR (current_buffer, enable_multibyte_characters));
   if (! multibyte)
     MAKE_CHAR_MULTIBYTE (ch);
-  cased = case_character (ctx, ch);
+  cased = case_single_character (ctx, ch);
   if (cased == ch)
     return obj;
 
@@ -125,25 +209,34 @@ do_casify_natnum (struct casing_context *ctx, Lisp_Object obj)
 static Lisp_Object
 do_casify_multibyte_string (struct casing_context *ctx, Lisp_Object obj)
 {
-  ptrdiff_t i, i_byte, size = SCHARS (obj);
-  int len, ch, cased;
+  /* We assume data is the first member of casing_str_buf structure so that if
+     we cast a (char *) into (struct casing_str_buf *) the representation of the
+     character is at the beginning of the buffer.  This is why we don’t need
+     separate struct casing_str_buf object but rather write directly to o. */
+  typedef char static_assertion[offsetof(struct casing_str_buf, data) ? -1 : 1];
+
+  ptrdiff_t size = SCHARS (obj), n;
+  int ch;
   USE_SAFE_ALLOCA;
-  ptrdiff_t o_size;
-  if (INT_MULTIPLY_WRAPV (size, MAX_MULTIBYTE_LENGTH, &o_size))
-    o_size = PTRDIFF_MAX;
-  unsigned char *dst = SAFE_ALLOCA (o_size);
+  if (INT_MULTIPLY_WRAPV (size, MAX_MULTIBYTE_LENGTH, &n) ||
+      INT_ADD_WRAPV (n, sizeof(struct casing_str_buf), &n))
+    n = PTRDIFF_MAX;
+  unsigned char *const dst = SAFE_ALLOCA (n), *const dst_end = dst + n;
   unsigned char *o = dst;
 
-  for (i = i_byte = 0; i < size; i++, i_byte += len)
+  const unsigned char *src = SDATA (obj);
+
+  for (n = 0; size; --size)
     {
-      if (o_size - MAX_MULTIBYTE_LENGTH < o - dst)
+      if (dst_end - o < sizeof(struct casing_str_buf))
 	string_overflow ();
-      ch = STRING_CHAR_AND_LENGTH (SDATA (obj) + i_byte, len);
-      cased = case_character (ctx, ch);
-      o += CHAR_STRING (cased, o);
+      ch = STRING_CHAR_ADVANCE (src);
+      case_character ((void *)o, ctx, ch);
+      n += ((struct casing_str_buf *)o)->len_chars;
+      o += ((struct casing_str_buf *)o)->len_bytes;
     }
-  eassert (o - dst <= o_size);
-  obj = make_multibyte_string ((char *) dst, size, o - dst);
+  eassert (o <= dst_end);
+  obj = make_multibyte_string ((char *) dst, n, o - dst);
   SAFE_FREE ();
   return obj;
 }
@@ -159,7 +252,7 @@ do_casify_unibyte_string (struct casing_context *ctx, Lisp_Object obj)
     {
       ch = SREF (obj, i);
       MAKE_CHAR_MULTIBYTE (ch);
-      cased = case_character (ctx, ch);
+      cased = case_single_character (ctx, ch);
       if (ch == cased)
 	continue;
       MAKE_CHAR_UNIBYTE (cased);
@@ -191,7 +284,9 @@ casify_object (enum case_action flag, Lisp_Object obj)
 DEFUN ("upcase", Fupcase, Supcase, 1, 1, 0,
        doc: /* Convert argument to upper case and return that.
 The argument may be a character or string.  The result has the same type.
-The argument object is not altered--the value is a copy.
+The argument object is not altered--the value is a copy.  If argument
+is a character, characters which map to multiple code points when
+cased, e.g. ﬁ, are returned unchanged.
 See also `capitalize', `downcase' and `upcase-initials'.  */)
   (Lisp_Object obj)
 {
@@ -212,7 +307,9 @@ DEFUN ("capitalize", Fcapitalize, Scapitalize, 1, 1, 0,
 This means that each word's first character is converted to either
 title case or upper case, and the rest to lower case.
 The argument may be a character or string.  The result has the same type.
-The argument object is not altered--the value is a copy.  */)
+The argument object is not altered--the value is a copy.  If argument
+is a character, characters which map to multiple code points when
+cased, e.g. ﬁ, are returned unchanged.  */)
   (Lisp_Object obj)
 {
   return casify_object (CASE_CAPITALIZE, obj);
@@ -225,21 +322,28 @@ DEFUN ("upcase-initials", Fupcase_initials, Supcase_initials, 1, 1, 0,
 This means that each word's first character is converted to either
 title case or upper case, and the rest are left unchanged.
 The argument may be a character or string.  The result has the same type.
-The argument object is not altered--the value is a copy.  */)
+The argument object is not altered--the value is a copy.  If argument
+is a character, characters which map to multiple code points when
+cased, e.g. ﬁ, are returned unchanged.  */)
   (Lisp_Object obj)
 {
   return casify_object (CASE_CAPITALIZE_UP, obj);
 }
 
-/* Based on CTX, case region in a unibyte buffer from POS to *ENDP.  Return
-   first position that has changed and save last position in *ENDP.  If no
-   characters were changed, return -1 and *ENDP is unspecified. */
+/* Based on CTX, case region in a unibyte buffer from *STARTP to *ENDP.
+
+   Save first and last positions that has changed in *STARTP and *ENDP
+   respectively.  If no characters were changed, save -1 to *STARTP and leave
+   *ENDP unspecified.
+
+   Always return 0.  This is so that interface of this function is the same as
+   do_casify_multibyte_region. */
 static ptrdiff_t
 do_casify_unibyte_region (struct casing_context *ctx,
-			  ptrdiff_t pos, ptrdiff_t *endp)
+			  ptrdiff_t *startp, ptrdiff_t *endp)
 {
   ptrdiff_t first = -1, last = -1;  /* Position of first and last changes. */
-  ptrdiff_t end = *endp;
+  ptrdiff_t pos = *startp, end = *endp;
   int ch, cased;
 
   for (; pos < end; ++pos)
@@ -247,11 +351,11 @@ do_casify_unibyte_region (struct casing_context *ctx,
       ch = FETCH_BYTE (pos);
       MAKE_CHAR_MULTIBYTE (ch);
 
-      cased = case_character (ctx, ch);
+      cased = case_single_character (ctx, ch);
       if (cased == ch)
 	continue;
 
-      last = pos;
+      last = pos + 1;
       if (first < 0)
 	first = pos;
 
@@ -259,88 +363,107 @@ do_casify_unibyte_region (struct casing_context *ctx,
       FETCH_BYTE (pos) = cased;
     }
 
-  *endp = last + 1;
-  return first;
+  *startp = first;
+  *endp = last;
+  return 0;
 }
 
-/* Based on CTX, case region in a multibyte buffer from POS to *ENDP.  Return
-   first position that has changed and save last position in *ENDP.  If no
-   characters were changed, return -1 and *ENDP is unspecified. */
+/* Based on CTX, case region in a multibyte buffer from *STARTP to *ENDP.
+
+   Return number of added characters (may be negative if more characters were
+   deleted then inserted), save first and last positions that has changed in
+   *STARTP and *ENDP respectively.  If no characters were changed, return 0,
+   save -1 to *STARTP and leave *ENDP unspecified. */
 static ptrdiff_t
 do_casify_multibyte_region (struct casing_context *ctx,
-                           ptrdiff_t pos, ptrdiff_t *endp)
+			    ptrdiff_t *startp, ptrdiff_t *endp)
 {
   ptrdiff_t first = -1, last = -1;  /* Position of first and last changes. */
-  ptrdiff_t pos_byte = CHAR_TO_BYTE (pos), end = *endp;
-  ptrdiff_t opoint = PT;
+  ptrdiff_t pos = *startp, pos_byte = CHAR_TO_BYTE (pos), size = *endp - pos;
+  ptrdiff_t opoint = PT, added = 0;
+  struct casing_str_buf buf;
   int ch, cased, len;
 
-  while (pos < end)
+  for (; size; --size)
     {
       ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len);
-      cased = case_character (ctx, ch);
-      if (cased != ch)
+      if (!case_character (&buf, ctx, ch))
 	{
-	  last = pos;
-	  if (first < 0)
-	    first = pos;
+	  pos_byte += len;
+	  ++pos;
+	  continue;
+	}
 
-	  if (ASCII_CHAR_P (cased) && ASCII_CHAR_P (ch))
-	    FETCH_BYTE (pos_byte) = cased;
-	  else
-	    {
-	      unsigned char str[MAX_MULTIBYTE_LENGTH];
-	      int totlen = CHAR_STRING (cased, str);
-	      if (len == totlen)
-		memcpy (BYTE_POS_ADDR (pos_byte), str, len);
-	      else
-		/* Replace one character with the other(s), keeping text
-		   properties the same.  */
-		replace_range_2 (pos, pos_byte, pos + 1, pos_byte + len,
-				 (char *) str, 9, totlen, 0);
-	      len = totlen;
-	    }
+      last = pos + buf.len_chars;
+      if (first < 0)
+	first = pos;
+
+      if (buf.len_chars == 1 && buf.len_bytes == len)
+	memcpy (BYTE_POS_ADDR (pos_byte), buf.data, len);
+      else
+	{
+	  /* Replace one character with the other(s), keeping text
+	     properties the same.  */
+	  replace_range_2 (pos, pos_byte, pos + 1, pos_byte + len,
+			   (const char *) buf.data, buf.len_chars,
+			   buf.len_bytes,
+			   0);
+	  added += (ptrdiff_t) buf.len_chars - 1;
+	  if (opoint > pos)
+	    opoint += (ptrdiff_t) buf.len_chars - 1;
 	}
-      pos++;
-      pos_byte += len;
+
+      pos_byte += buf.len_bytes;
+      pos += buf.len_chars;
     }
 
   if (PT != opoint)
     TEMP_SET_PT_BOTH (opoint, CHAR_TO_BYTE (opoint));
 
+  *startp = first;
   *endp = last;
-  return first;
+  return added;
 }
 
-/* flag is CASE_UP, CASE_DOWN or CASE_CAPITALIZE or CASE_CAPITALIZE_UP.
-   b and e specify range of buffer to operate on. */
-static void
+/* flag is CASE_UP, CASE_DOWN or CASE_CAPITALIZE or CASE_CAPITALIZE_UP.  b and
+   e specify range of buffer to operate on.  Return character position of the
+   end of the region after changes.  */
+static ptrdiff_t
 casify_region (enum case_action flag, Lisp_Object b, Lisp_Object e)
 {
+  ptrdiff_t start, end, orig_end, added;
   struct casing_context ctx;
-  ptrdiff_t start, end;
-
-  if (EQ (b, e))
-    /* Not modifying because nothing marked */
-    return;
 
   validate_region (&b, &e);
   start = XFASTINT (b);
   end = XFASTINT (e);
+  if (start == end)
+    /* Not modifying because nothing marked */
+    return end;
   modify_text (start, end);
-  record_change (start, end - start);
   prepare_casing_context (&ctx, flag, true);
 
+  orig_end = end;
+  record_delete (start, make_buffer_string (start, end, true), false);
   if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
-    start = do_casify_unibyte_region (&ctx, start, &end);
+    {
+      record_insert (start, end - start);
+      added = do_casify_unibyte_region (&ctx, &start, &end);
+    }
   else
-    start = do_casify_multibyte_region (&ctx, start, &end);
+    {
+      ptrdiff_t len = end - start, ostart = start;
+      added = do_casify_multibyte_region (&ctx, &start, &end);
+      record_insert (ostart, len + added);
+    }
 
   if (start >= 0)
     {
-      signal_after_change (start, end + 1 - start, end + 1 - start);
-      update_compositions (start, end + 1, CHECK_ALL);
+      signal_after_change (start, end - start - added, end - start);
+      update_compositions (start, end, CHECK_ALL);
     }
+
+  return orig_end + added;
 }
 
 DEFUN ("upcase-region", Fupcase_region, Supcase_region, 2, 3,
@@ -432,9 +555,7 @@ casify_word (enum case_action flag, Lisp_Object arg)
   ptrdiff_t farend = scan_words (PT, XINT (arg));
   if (!farend)
     farend = XINT (arg) <= 0 ? BEGV : ZV;
-  ptrdiff_t newpoint = max (PT, farend);
-  casify_region (flag, make_number (PT), make_number (farend));
-  SET_PT (newpoint);
+  SET_PT (casify_region (flag, make_number (PT), make_number (farend)));
   return Qnil;
 }
author	Michal Nazarewicz <mina86@mina86.com>	2016-10-05 00:06:01 +0200
committer	Michal Nazarewicz <mina86@mina86.com>	2017-04-06 20:54:58 +0200
commit	b3b9b258c4026baa1cad3f2e617f1a637fc8d205 (patch)
tree	1520ef9f5a3204784c597fcf2bf7a7c7fc1b8d7c /src/casefiddle.c
parent	2c87dabd0460cce83d2345b4ddff159969674fef (diff)
download	emacs-b3b9b258c4026baa1cad3f2e617f1a637fc8d205.tar.gz