summaryrefslogtreecommitdiff
path: root/src/lread.c
diff options
context:
space:
mode:
author Mattias Engdegård <mattiase@acm.org> 2023-03-10 17:10:30 +0100
committer Mattias Engdegård <mattiase@acm.org> 2023-03-11 10:21:23 +0100
commit b8e7061232f9a5b06af70031dcc4b48c6575a364 (patch)
tree 357abc8cfd14a58874b58e6a89f50e8dfe4320b4 /src/lread.c
parent c6bfffa9fe1af7f4f806e5533ba5f3c33476cf9a (diff)
download emacs-b8e7061232f9a5b06af70031dcc4b48c6575a364.tar.gz
Remove recursion from character escape handling in reader
This cures a C stack overflow when reading certain long (crafted)
strings (bug#62039) and improves performance of reading escaped
characters in character and string literals.

Reported by Bruno Haible.

* src/lread.c (invalid_escape_syntax_error): New.
(read_escape): Rename to...
(read_char_escape): ...this.  Remove recursion.  Pass read-ahead char
as argument.  Improve code performance and clarity.
(read_char_literal, read_string_literal): Update calls.
* test/src/lread-tests.el (lread-char-modifiers)
(lread-many-modifiers): Add test cases.
Diffstat (limited to 'src/lread.c')
-rw-r--r-- src/lread.c 292
1 files changed, 150 insertions, 142 deletions
diff --git a/src/lread.c b/src/lread.c
index d0dc85f51c8..273120315df 100644
--- a/src/lread.c
+++ b/src/lread.c
@@ -2639,154 +2639,137 @@ character_name_to_code (char const *name, ptrdiff_t name_len,
Unicode 9.0.0 the maximum is 83, so this should be safe. */
enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 };
-/* Read a \-escape sequence, assuming we already read the `\'.
- If the escape sequence forces unibyte, return eight-bit char. */
+static AVOID
+invalid_escape_syntax_error (void)
+{
+ error ("Invalid escape character syntax");
+}
+/* Read a character escape sequence, assuming we just read a backslash
+ and one more character (next_char). */
static int
-read_escape (Lisp_Object readcharfun)
+read_char_escape (Lisp_Object readcharfun, int next_char)
{
- int c = READCHAR;
- /* \u allows up to four hex digits, \U up to eight. Default to the
- behavior for \u, and change this value in the case that \U is seen. */
- int unicode_hex_count = 4;
+ int modifiers = 0;
+ ptrdiff_t ncontrol = 0;
+ int chr;
+
+ again: ;
+ int c = next_char;
+ int unicode_hex_count;
+ int mod;
switch (c)
{
case -1:
end_of_file_error ();
- case 'a':
- return '\007';
- case 'b':
- return '\b';
- case 'd':
- return 0177;
- case 'e':
- return 033;
- case 'f':
- return '\f';
- case 'n':
- return '\n';
- case 'r':
- return '\r';
- case 't':
- return '\t';
- case 'v':
- return '\v';
+ case 'a': chr = '\a'; break;
+ case 'b': chr = '\b'; break;
+ case 'd': chr = 127; break;
+ case 'e': chr = 27; break;
+ case 'f': chr = '\f'; break;
+ case 'n': chr = '\n'; break;
+ case 'r': chr = '\r'; break;
+ case 't': chr = '\t'; break;
+ case 'v': chr = '\v'; break;
case '\n':
/* ?\LF is an error; it's probably a user mistake. */
error ("Invalid escape character syntax");
- case 'M':
- c = READCHAR;
- if (c != '-')
- error ("Invalid escape character syntax");
- c = READCHAR;
- if (c == '\\')
- c = read_escape (readcharfun);
- return c | meta_modifier;
-
- case 'S':
- c = READCHAR;
- if (c != '-')
- error ("Invalid escape character syntax");
- c = READCHAR;
- if (c == '\\')
- c = read_escape (readcharfun);
- return c | shift_modifier;
-
- case 'H':
- c = READCHAR;
- if (c != '-')
- error ("Invalid escape character syntax");
- c = READCHAR;
- if (c == '\\')
- c = read_escape (readcharfun);
- return c | hyper_modifier;
+ /* \M-x etc: set modifier bit and parse the char to which it applies,
+ allowing for chains such as \M-\S-\A-\H-\s-\C-q. */
+ case 'M': mod = meta_modifier; goto mod_key;
+ case 'S': mod = shift_modifier; goto mod_key;
+ case 'H': mod = hyper_modifier; goto mod_key;
+ case 'A': mod = alt_modifier; goto mod_key;
+ case 's': mod = super_modifier; goto mod_key;
- case 'A':
- c = READCHAR;
- if (c != '-')
- error ("Invalid escape character syntax");
- c = READCHAR;
- if (c == '\\')
- c = read_escape (readcharfun);
- return c | alt_modifier;
-
- case 's':
- c = READCHAR;
- if (c != '-')
- {
- UNREAD (c);
- return ' ';
- }
- c = READCHAR;
- if (c == '\\')
- c = read_escape (readcharfun);
- return c | super_modifier;
+ mod_key:
+ {
+ int c1 = READCHAR;
+ if (c1 != '-')
+ {
+ if (c == 's')
+ {
+ /* \s not followed by a hyphen is SPC. */
+ UNREAD (c1);
+ chr = ' ';
+ break;
+ }
+ else
+ /* \M, \S, \H, \A not followed by a hyphen is an error. */
+ invalid_escape_syntax_error ();
+ }
+ modifiers |= mod;
+ c1 = READCHAR;
+ if (c1 == '\\')
+ {
+ next_char = READCHAR;
+ goto again;
+ }
+ chr = c1;
+ break;
+ }
+ /* Control modifiers (\C-x or \^x) are messy and not actually idempotent.
+ For example, ?\C-\C-a = ?\C-\001 = 0x4000001.
+ Keep a count of them and apply them separately. */
case 'C':
- c = READCHAR;
- if (c != '-')
- error ("Invalid escape character syntax");
+ {
+ int c1 = READCHAR;
+ if (c1 != '-')
+ invalid_escape_syntax_error ();
+ }
FALLTHROUGH;
+ /* The prefixes \C- and \^ are equivalent. */
case '^':
- c = READCHAR;
- if (c == '\\')
- c = read_escape (readcharfun);
- if ((c & ~CHAR_MODIFIER_MASK) == '?')
- return 0177 | (c & CHAR_MODIFIER_MASK);
- else if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
- return c | ctrl_modifier;
- /* ASCII control chars are made from letters (both cases),
- as well as the non-letters within 0100...0137. */
- else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
- return (c & (037 | ~0177));
- else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
- return (c & (037 | ~0177));
- else
- return c | ctrl_modifier;
-
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- /* An octal escape, as in ANSI C. */
{
- register int i = c - '0';
- register int count = 0;
- while (++count < 3)
+ ncontrol++;
+ int c1 = READCHAR;
+ if (c1 == '\\')
{
- if ((c = READCHAR) >= '0' && c <= '7')
- {
- i *= 8;
- i += c - '0';
- }
- else
+ next_char = READCHAR;
+ goto again;
+ }
+ chr = c1;
+ break;
+ }
+
+ /* 1-3 octal digits. Values in 0x80..0xff are encoded as raw bytes. */
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ {
+ int i = c - '0';
+ int count = 0;
+ while (count < 2)
+ {
+ int c = READCHAR;
+ if (c < '0' || c > '7')
{
UNREAD (c);
break;
}
+ i = (i << 3) + (c - '0');
+ count++;
}
if (i >= 0x80 && i < 0x100)
i = BYTE8_TO_CHAR (i);
- return i;
+ chr = i;
+ break;
}
+ /* 1 or more hex digits. Values may encode modifiers.
+ Values in 0x80..0xff using 2 hex digits are encoded as raw bytes. */
case 'x':
- /* A hex escape, as in ANSI C. */
{
unsigned int i = 0;
int count = 0;
while (1)
{
- c = READCHAR;
+ int c = READCHAR;
int digit = char_hexdigit (c);
if (digit < 0)
{
@@ -2796,40 +2779,37 @@ read_escape (Lisp_Object readcharfun)
i = (i << 4) + digit;
/* Allow hex escapes as large as ?\xfffffff, because some
packages use them to denote characters with modifiers. */
- if ((CHAR_META | (CHAR_META - 1)) < i)
+ if (i > (CHAR_META | (CHAR_META - 1)))
error ("Hex character out of range: \\x%x...", i);
count += count < 3;
}
+ if (count == 0)
+ invalid_escape_syntax_error ();
if (count < 3 && i >= 0x80)
- return BYTE8_TO_CHAR (i);
- return i;
+ i = BYTE8_TO_CHAR (i);
+ modifiers |= i & CHAR_MODIFIER_MASK;
+ chr = i & ~CHAR_MODIFIER_MASK;
+ break;
}
+ /* 8-digit Unicode hex escape: \UHHHHHHHH */
case 'U':
- /* Post-Unicode-2.0: Up to eight hex chars. */
unicode_hex_count = 8;
- FALLTHROUGH;
- case 'u':
+ goto unicode_hex;
- /* A Unicode escape. We only permit them in strings and characters,
- not arbitrarily in the source code, as in some other languages. */
+ /* 4-digit Unicode hex escape: \uHHHH */
+ case 'u':
+ unicode_hex_count = 4;
+ unicode_hex:
{
unsigned int i = 0;
- int count = 0;
-
- while (++count <= unicode_hex_count)
+ for (int count = 0; count < unicode_hex_count; count++)
{
- c = READCHAR;
+ int c = READCHAR;
if (c < 0)
- {
- if (unicode_hex_count > 4)
- error ("Malformed Unicode escape: \\U%x", i);
- else
- error ("Malformed Unicode escape: \\u%x", i);
- }
- /* `isdigit' and `isalpha' may be locale-specific, which we don't
- want. */
+ error ("Malformed Unicode escape: \\%c%x",
+ unicode_hex_count == 4 ? 'u' : 'U', i);
int digit = char_hexdigit (c);
if (digit < 0)
error ("Non-hex character used for Unicode escape: %c (%d)",
@@ -2838,13 +2818,14 @@ read_escape (Lisp_Object readcharfun)
}
if (i > 0x10FFFF)
error ("Non-Unicode character: 0x%x", i);
- return i;
+ chr = i;
+ break;
}
+ /* Named character: \N{name} */
case 'N':
- /* Named character. */
{
- c = READCHAR;
+ int c = READCHAR;
if (c != '{')
invalid_syntax ("Expected opening brace after \\N", readcharfun);
char name[UNICODE_CHARACTER_NAME_LENGTH_BOUND + 1];
@@ -2852,12 +2833,12 @@ read_escape (Lisp_Object readcharfun)
ptrdiff_t length = 0;
while (true)
{
- c = READCHAR;
+ int c = READCHAR;
if (c < 0)
end_of_file_error ();
if (c == '}')
break;
- if (! (0 < c && c < 0x80))
+ if (c >= 0x80)
{
AUTO_STRING (format,
"Invalid character U+%04X in character name");
@@ -2886,13 +2867,41 @@ read_escape (Lisp_Object readcharfun)
name[length] = '\0';
/* character_name_to_code can invoke read0, recursively.
- This is why read0's buffer is not static. */
- return character_name_to_code (name, length, readcharfun);
+ This is why read0 needs to be re-entrant. */
+ chr = character_name_to_code (name, length, readcharfun);
+ break;
}
default:
- return c;
+ chr = c;
+ break;
}
+ eassert (chr >= 0 && chr < (1 << CHARACTERBITS));
+
+ /* Apply Control modifiers, using the rules:
+ \C-X = ascii_ctrl(nomod(X)) | mods(X) if nomod(X) is one of:
+ A-Z a-z ? @ [ \ ] ^ _
+
+ X | ctrl_modifier otherwise
+
+ where
+ nomod(c) = c without modifiers
+ mods(c) = the modifiers of c
+ ascii_ctrl(c) = 127 if c = '?'
+ c & 0x1f otherwise
+ */
+ while (ncontrol > 0)
+ {
+ if ((chr >= '@' && chr <= '_') || (chr >= 'a' && chr <= 'z'))
+ chr &= 0x1f;
+ else if (chr == '?')
+ chr = 127;
+ else
+ modifiers |= ctrl_modifier;
+ ncontrol--;
+ }
+
+ return chr | modifiers;
}
/* Return the digit that CHARACTER stands for in the given BASE.
@@ -3014,7 +3023,7 @@ read_char_literal (Lisp_Object readcharfun)
}
if (ch == '\\')
- ch = read_escape (readcharfun);
+ ch = read_char_escape (readcharfun, READCHAR);
int modifiers = ch & CHAR_MODIFIER_MASK;
ch &= ~CHAR_MODIFIER_MASK;
@@ -3080,8 +3089,7 @@ read_string_literal (Lisp_Object readcharfun)
/* `\SPC' and `\LF' generate no characters at all. */
continue;
default:
- UNREAD (ch);
- ch = read_escape (readcharfun);
+ ch = read_char_escape (readcharfun, ch);
break;
}