Disallow string data resizing (bug#79784)

Only allow string mutation that is certain not to require string data to
be resized and reallocated: writing bytes into a unibyte string, and
changing ASCII to ASCII in a multibyte string.

This ensures that mutation will never transform a unibyte string to
multibyte, that the size of a string in bytes never changes, and that
the byte offsets of characters remain the same.  Most importantly, it
removes a long-standing obstacle to reform of string representation and
allows for future performance improvements.

* src/data.c (Faset): Disallow resizing string mutation.
* src/fns.c (clear_string_char_byte_cache):
* src/alloc.c (resize_string_data):  Remove.
* test/src/data-tests.el (data-aset-string): New test.
* test/lisp/subr-tests.el (subr--subst-char-in-string):
Skip error cases.
* test/src/alloc-tests.el (aset-nbytes-change):
Remove test that is no longer relevant.
* doc/lispref/strings.texi (Modifying Strings):
* doc/lispref/sequences.texi (Array Functions):
* doc/lispref/text.texi (Substitution):  Update manual.
* etc/NEWS: Announce.
This commit is contained in:
Mattias Engdegård 2024-05-02 18:05:21 +02:00 committed by Mattias Engdegård
parent 230ed1f9b6
commit 308e3ab1db
11 changed files with 67 additions and 105 deletions

View file

@ -1441,8 +1441,8 @@ x
The @var{array} should be mutable. @xref{Mutability}.
If @var{array} is a string and @var{object} is not a character, a
@code{wrong-type-argument} error results. The function converts a
unibyte string to multibyte if necessary to insert a character.
@code{wrong-type-argument} error results. For more information about
string mutation, @pxref{Modifying Strings}.
@end defun
@defun fillarray array object

View file

@ -467,12 +467,10 @@ described in this section. @xref{Mutability}.
The most basic way to alter the contents of an existing string is with
@code{aset} (@pxref{Array Functions}). @w{@code{(aset @var{string}
@var{idx} @var{char})}} stores @var{char} into @var{string} at character
index @var{idx}. It will automatically convert a pure-@acronym{ASCII}
@var{string} to a multibyte string (@pxref{Text Representations}) if
needed, but we recommend to always make sure @var{string} is multibyte
(e.g., by using @code{string-to-multibyte}, @pxref{Converting
Representations}), if @var{char} is a non-@acronym{ASCII} character, not
a raw byte.
index @var{idx}. When @var{string} is a unibyte string (@pxref{Text
Representations}), @var{char} must be a single byte (0--255); when
@var{string} is multibyte, both @var{char} and the previous character at
@var{idx} must be @acronym{ASCII} (0--127).
To clear out a string that contained a password, use
@code{clear-string}:

View file

@ -4641,6 +4641,8 @@ with @var{tochar} in @var{string}. By default, substitution occurs in
a copy of @var{string}, but if the optional argument @var{inplace} is
non-@code{nil}, the function modifies the @var{string} itself. In any
case, the function returns the resulting string.
For restrictions when altering an existing string, @pxref{Modifying Strings}.
@end defun
@deffn Command translate-region start end table

View file

@ -2680,6 +2680,21 @@ enabled for files named "go.work".
* Incompatible Lisp Changes in Emacs 31.1
+++
** String mutation has been restricted further.
'aset' on unibyte strings now requires the new character to be a single
byte (0-255). On multibyte strings the new character and the character
being replaced must both be ASCII (0-127).
These rules ensure that mutation will never transform a unibyte string
to multibyte, and that the size of a string in bytes (as reported by
'string-bytes') never changes. They also allow strings to be
represented more efficiently in the future.
Other functions that use 'aset' to modify string data, such as
'subst-char-in-string' with a non-nil INPLACE argument, will signal an
error if called with arguments that would violate these rules.
** Nested backquotes are not supported any more in Pcase patterns.
---

View file

@ -1815,56 +1815,6 @@ allocate_string_data (struct Lisp_String *s,
tally_consing (needed);
}
/* Reallocate multibyte STRING data when a single character is replaced.
The character is at byte offset CIDX_BYTE in the string.
The character being replaced is CLEN bytes long,
and the character that will replace it is NEW_CLEN bytes long.
The character count of STRING is kept as it is; only its size in
bytes changes, by NEW_CLEN - CLEN.
This function only makes room for the new character; it does not
write it.
Return the address where the caller should store the new character. */
unsigned char *
resize_string_data (Lisp_Object string, ptrdiff_t cidx_byte,
int clen, int new_clen)
{
eassume (STRING_MULTIBYTE (string));
/* Capture the old sdata block, sizes and data pointer before anything
is changed: the reallocating branch below re-reads SDATA (string)
to find the new storage, while DATA must keep referring to the
old bytes so they can be copied from.  */
sdata *old_sdata = SDATA_OF_STRING (XSTRING (string));
ptrdiff_t nchars = SCHARS (string);
ptrdiff_t nbytes = SBYTES (string);
ptrdiff_t new_nbytes = nbytes + (new_clen - clen);
unsigned char *data = SDATA (string);
unsigned char *new_charaddr;
if (sdata_size (nbytes) == sdata_size (new_nbytes))
{
/* No need to reallocate, as the size change falls within the
alignment slop. */
XSTRING (string)->u.s.size_byte = new_nbytes;
#ifdef GC_CHECK_STRING_BYTES
/* Keep the byte count recorded in the sdata block in step with
the string's new size.  */
SDATA_NBYTES (old_sdata) = new_nbytes;
#endif
new_charaddr = data + cidx_byte;
/* Slide the bytes that follow the replaced character to their new
position; memmove because source and destination overlap.
The count is one more than the number of string bytes after the
character -- presumably so that the terminating NUL moves along
with the text (TODO confirm).  */
memmove (new_charaddr + new_clen, new_charaddr + clen,
nbytes - (cidx_byte + (clen - 1)));
}
else
{
allocate_string_data (XSTRING (string), nchars, new_nbytes, false, false);
unsigned char *new_data = SDATA (string);
new_charaddr = new_data + cidx_byte;
/* Copy the text after the replaced character, leaving a gap of
NEW_CLEN bytes for the caller to fill, then the text before it.
Unlike the in-place case, no extra byte is copied here;
presumably allocate_string_data has already terminated the new
data (TODO confirm).  */
memcpy (new_charaddr + new_clen, data + cidx_byte + clen,
nbytes - (cidx_byte + clen));
memcpy (new_data, data, cidx_byte);
/* Mark old string data as free by setting its string back-pointer
to null, and record the size of the data in it. */
SDATA_NBYTES (old_sdata) = nbytes;
old_sdata->string = NULL;
}
/* Byte offsets of the characters after CIDX_BYTE have shifted, so
drop any cached char-index/byte-index association.  */
clear_string_char_byte_cache ();
return new_charaddr;
}
/* Sweep and compact strings. */

View file

@ -2574,7 +2574,10 @@ or a byte-code object. IDX starts at 0. */)
DEFUN ("aset", Faset, Saset, 3, 3, 0,
doc: /* Store into the element of ARRAY at index IDX the value NEWELT.
Return NEWELT. ARRAY may be a vector, a string, a char-table or a
bool-vector. IDX starts at 0. */)
bool-vector. IDX starts at 0.
If ARRAY is a unibyte string, NEWELT must be a single byte (0-255).
If ARRAY is a multibyte string, NEWELT and the previous character at
index IDX must both be ASCII (0-127). */)
(register Lisp_Object array, Lisp_Object idx, Lisp_Object newelt)
{
register EMACS_INT idxval;
@ -2613,42 +2616,24 @@ bool-vector. IDX starts at 0. */)
args_out_of_range (array, idx);
CHECK_CHARACTER (newelt);
int c = XFIXNAT (newelt);
ptrdiff_t idxval_byte;
int prev_bytes;
unsigned char workbuf[MAX_MULTIBYTE_LENGTH], *p0 = workbuf, *p1;
if (STRING_MULTIBYTE (array))
{
idxval_byte = string_char_to_byte (array, idxval);
p1 = SDATA (array) + idxval_byte;
prev_bytes = BYTES_BY_CHAR_HEAD (*p1);
}
else if (SINGLE_BYTE_CHAR_P (c))
{
SSET (array, idxval, c);
return newelt;
if (c > 0x7f)
error ("Attempt to store non-ASCII char into multibyte string");
ptrdiff_t idxval_byte = string_char_to_byte (array, idxval);
unsigned char *p = SDATA (array) + idxval_byte;
if (*p > 0x7f)
error ("Attempt to replace non-ASCII char in multibyte string");
*p = c;
}
else
{
for (ptrdiff_t i = SBYTES (array) - 1; i >= 0; i--)
if (!ASCII_CHAR_P (SREF (array, i)))
args_out_of_range (array, newelt);
/* ARRAY is an ASCII string. Convert it to a multibyte string. */
STRING_SET_MULTIBYTE (array);
idxval_byte = idxval;
p1 = SDATA (array) + idxval_byte;
prev_bytes = 1;
if (c > 0xff)
error ("Attempt to store non-byte value into unibyte string");
SSET (array, idxval, c);
}
int new_bytes = CHAR_STRING (c, p0);
if (prev_bytes != new_bytes)
p1 = resize_string_data (array, idxval_byte, prev_bytes, new_bytes);
do
*p1++ = *p0++;
while (--new_bytes != 0);
}
return newelt;
}

View file

@ -1189,12 +1189,6 @@ static Lisp_Object string_char_byte_cache_string;
static ptrdiff_t string_char_byte_cache_charpos;
static ptrdiff_t string_char_byte_cache_bytepos;
/* Reset the string associated with the char/byte position cache to nil.
Presumably this keeps the cached charpos/bytepos pair from being
reused for a string whose data has since been modified (TODO confirm
against string_char_to_byte).  */
void
clear_string_char_byte_cache (void)
{
string_char_byte_cache_string = Qnil;
}
/* Return the byte index corresponding to CHAR_INDEX in STRING. */
ptrdiff_t

View file

@ -4289,7 +4289,6 @@ extern Lisp_Object nconc2 (Lisp_Object, Lisp_Object);
extern Lisp_Object assq_no_quit (Lisp_Object, Lisp_Object);
extern Lisp_Object assq_no_signal (Lisp_Object, Lisp_Object);
extern Lisp_Object assoc_no_quit (Lisp_Object, Lisp_Object);
extern void clear_string_char_byte_cache (void);
extern ptrdiff_t string_char_to_byte (Lisp_Object, ptrdiff_t);
extern ptrdiff_t string_byte_to_char (Lisp_Object, ptrdiff_t);
extern Lisp_Object string_to_multibyte (Lisp_Object);
@ -4444,7 +4443,6 @@ extern void parse_str_as_multibyte (const unsigned char *, ptrdiff_t,
/* Defined in alloc.c. */
extern intptr_t garbage_collection_inhibited;
unsigned char *resize_string_data (Lisp_Object, ptrdiff_t, int, int);
extern void malloc_warning (const char *);
extern AVOID memory_full (size_t);
extern AVOID buffer_memory_full (ptrdiff_t);

View file

@ -1454,9 +1454,16 @@ final or penultimate step during initialization."))
(dolist (inplace '(nil t))
(dolist (from '(?a #x80 #x3fff80))
(dolist (to '(?o ?☃ #x1313f #xff #x3fffc9))
;; Can't put a non-byte value in a non-ASCII unibyte string.
(unless (and (not mb) (> to #xff)
(not (string-match-p (rx bos (* ascii) eos) str)))
(unless (or
;; Can't put non-byte in a non-ASCII unibyte string.
(and (not mb) (> to #xff)
(not (string-match-p
(rx bos (* ascii) eos) str)))
;; Skip illegal mutation.
(and inplace (not (if mb
(and (<= 0 from 127)
(<= 0 to 127))
(<= 0 to 255)))))
(let* ((in (copy-sequence str))
(ref (if (and (not mb) (> from #xff))
in ; nothing to replace

View file

@ -52,11 +52,4 @@
(dotimes (i 4)
(should (eql (aref x i) (aref y i))))))
;; Bug#39207
(ert-deftest aset-nbytes-change ()
  "Store characters of varying encoded length into a one-char string.
After each `aset' the string must equal a fresh one-char string made
from the character just stored."
  (let ((str (make-string 1 ?a))
        (candidates (list 10003 ?b 128 ?c ?d (max-char) ?e)))
    (dolist (ch candidates)
      (aset str 0 ch)
      (should (equal str (make-string 1 ch))))))
;;; alloc-tests.el ends here

View file

@ -929,4 +929,24 @@ comparing the subr with a much slower Lisp implementation."
((eq subtype 'function) (cl-functionp val))
(t (should-not (cl-typep val subtype))))))))))
(ert-deftest data-aset-string ()
  "Check which `aset' calls on strings succeed and which signal an error."
  ;; Unibyte string: byte values are stored, larger values signal.
  (let ((ustr (copy-sequence "abcdef")))
    (cl-assert (not (multibyte-string-p ustr)))
    (aset ustr 4 ?E)
    (should (equal ustr "abcdEf"))
    (aset ustr 2 255)
    (should (equal ustr "ab\377dEf"))
    ;; Neither of these is a byte value.
    (should-error (aset ustr 3 256))
    (should-error (aset ustr 3 #x3fff80)))
  ;; Multibyte string: only ASCII replacing ASCII is accepted.
  (let ((mstr (copy-sequence "abçdef")))
    (cl-assert (multibyte-string-p mstr))
    (aset mstr 4 ?E)
    (should (equal mstr "abçdEf"))
    ;; The previous character at index 2 is not ASCII.
    (should-error (aset mstr 2 ?c))
    ;; In both of these the new character is not ASCII.
    (should-error (aset mstr 2 #xe9))
    (should-error (aset mstr 3 #x3fff80))))
;;; data-tests.el ends here