mirror of
git://git.sv.gnu.org/emacs.git
synced 2026-02-16 09:14:18 +00:00
Disallow string data resizing (bug#79784)
Only allow string mutation that is certain not to require string data to be resized and reallocated: writing bytes into a unibyte string, and changing ASCII to ASCII in a multibyte string. This ensures that mutation will never transform a unibyte string to multibyte, that the size of a string in bytes never changes, and that the byte offsets of characters remain the same. Most importantly, it removes a long-standing obstacle to reform of string representation and allows for future performance improvements. * src/data.c (Faset): Disallow resizing string mutation. * src/fns.c (clear_string_char_byte_cache): * src/alloc.c (resize_string_data): Remove. * test/src/data-tests.el (data-aset-string): New test. * test/lisp/subr-tests.el (subr--subst-char-in-string): Skip error cases. * test/src/alloc-tests.el (aset-nbytes-change): Remove test that is no longer relevant. * doc/lispref/strings.texi (Modifying Strings): * doc/lispref/sequences.texi (Array Functions): * doc/lispref/text.texi (Substitution): Update manual. * etc/NEWS: Announce.
This commit is contained in:
parent
230ed1f9b6
commit
308e3ab1db
11 changed files with 67 additions and 105 deletions
|
|
@ -1441,8 +1441,8 @@ x
|
|||
The @var{array} should be mutable. @xref{Mutability}.
|
||||
|
||||
If @var{array} is a string and @var{object} is not a character, a
|
||||
@code{wrong-type-argument} error results. The function converts a
|
||||
unibyte string to multibyte if necessary to insert a character.
|
||||
@code{wrong-type-argument} error results. For more information about
|
||||
string mutation, @pxref{Modifying Strings}.
|
||||
@end defun
|
||||
|
||||
@defun fillarray array object
|
||||
|
|
|
|||
|
|
@ -467,12 +467,10 @@ described in this section. @xref{Mutability}.
|
|||
The most basic way to alter the contents of an existing string is with
|
||||
@code{aset} (@pxref{Array Functions}). @w{@code{(aset @var{string}
|
||||
@var{idx} @var{char})}} stores @var{char} into @var{string} at character
|
||||
index @var{idx}. It will automatically convert a pure-@acronym{ASCII}
|
||||
@var{string} to a multibyte string (@pxref{Text Representations}) if
|
||||
needed, but we recommend to always make sure @var{string} is multibyte
|
||||
(e.g., by using @code{string-to-multibyte}, @pxref{Converting
|
||||
Representations}), if @var{char} is a non-@acronym{ASCII} character, not
|
||||
a raw byte.
|
||||
index @var{idx}. When @var{string} is a unibyte string (@pxref{Text
|
||||
Representations}), @var{char} must be a single byte (0--255); when
|
||||
@var{string} is multibyte, both @var{char} and the previous character at
|
||||
@var{idx} must be @acronym{ASCII} (0--127).
|
||||
|
||||
To clear out a string that contained a password, use
|
||||
@code{clear-string}:
|
||||
|
|
|
|||
|
|
@ -4641,6 +4641,8 @@ with @var{tochar} in @var{string}. By default, substitution occurs in
|
|||
a copy of @var{string}, but if the optional argument @var{inplace} is
|
||||
non-@code{nil}, the function modifies the @var{string} itself. In any
|
||||
case, the function returns the resulting string.
|
||||
|
||||
For restrictions when altering an existing string, @pxref{Modifying Strings}.
|
||||
@end defun
|
||||
|
||||
@deffn Command translate-region start end table
|
||||
|
|
|
|||
15
etc/NEWS
15
etc/NEWS
|
|
@ -2680,6 +2680,21 @@ enabled for files named "go.work".
|
|||
|
||||
* Incompatible Lisp Changes in Emacs 31.1
|
||||
|
||||
+++
|
||||
** String mutation has been restricted further.
|
||||
'aset' on unibyte strings now requires the new character to be a single
|
||||
byte (0-255). On multibyte strings the new character and the character
|
||||
being replaced must both be ASCII (0-127).
|
||||
|
||||
These rules ensure that mutation will never transform a unibyte string
|
||||
to multibyte, and that the size of a string in bytes (as reported by
|
||||
'string-bytes') never changes. They also allow strings to be
|
||||
represented more efficiently in the future.
|
||||
|
||||
Other functions that use 'aset' to modify string data, such as
|
||||
'subst-char-in-string' with a non-nil INPLACE argument, will signal an
|
||||
error if called with arguments that would violate these rules.
|
||||
|
||||
** Nested backquotes are not supported any more in Pcase patterns.
|
||||
|
||||
---
|
||||
|
|
|
|||
50
src/alloc.c
50
src/alloc.c
|
|
@ -1815,56 +1815,6 @@ allocate_string_data (struct Lisp_String *s,
|
|||
tally_consing (needed);
|
||||
}
|
||||
|
||||
/* Reallocate multibyte STRING data when a single character is replaced.
|
||||
The character is at byte offset CIDX_BYTE in the string.
|
||||
The character being replaced is CLEN bytes long,
|
||||
and the character that will replace it is NEW_CLEN bytes long.
|
||||
Return the address where the caller should store the new character. */
|
||||
|
||||
unsigned char *
|
||||
resize_string_data (Lisp_Object string, ptrdiff_t cidx_byte,
|
||||
int clen, int new_clen)
|
||||
{
|
||||
eassume (STRING_MULTIBYTE (string));
|
||||
sdata *old_sdata = SDATA_OF_STRING (XSTRING (string));
|
||||
ptrdiff_t nchars = SCHARS (string);
|
||||
ptrdiff_t nbytes = SBYTES (string);
|
||||
ptrdiff_t new_nbytes = nbytes + (new_clen - clen);
|
||||
unsigned char *data = SDATA (string);
|
||||
unsigned char *new_charaddr;
|
||||
|
||||
if (sdata_size (nbytes) == sdata_size (new_nbytes))
|
||||
{
|
||||
/* No need to reallocate, as the size change falls within the
|
||||
alignment slop. */
|
||||
XSTRING (string)->u.s.size_byte = new_nbytes;
|
||||
#ifdef GC_CHECK_STRING_BYTES
|
||||
SDATA_NBYTES (old_sdata) = new_nbytes;
|
||||
#endif
|
||||
new_charaddr = data + cidx_byte;
|
||||
memmove (new_charaddr + new_clen, new_charaddr + clen,
|
||||
nbytes - (cidx_byte + (clen - 1)));
|
||||
}
|
||||
else
|
||||
{
|
||||
allocate_string_data (XSTRING (string), nchars, new_nbytes, false, false);
|
||||
unsigned char *new_data = SDATA (string);
|
||||
new_charaddr = new_data + cidx_byte;
|
||||
memcpy (new_charaddr + new_clen, data + cidx_byte + clen,
|
||||
nbytes - (cidx_byte + clen));
|
||||
memcpy (new_data, data, cidx_byte);
|
||||
|
||||
/* Mark old string data as free by setting its string back-pointer
|
||||
to null, and record the size of the data in it. */
|
||||
SDATA_NBYTES (old_sdata) = nbytes;
|
||||
old_sdata->string = NULL;
|
||||
}
|
||||
|
||||
clear_string_char_byte_cache ();
|
||||
|
||||
return new_charaddr;
|
||||
}
|
||||
|
||||
|
||||
/* Sweep and compact strings. */
|
||||
|
||||
|
|
|
|||
43
src/data.c
43
src/data.c
|
|
@ -2574,7 +2574,10 @@ or a byte-code object. IDX starts at 0. */)
|
|||
DEFUN ("aset", Faset, Saset, 3, 3, 0,
|
||||
doc: /* Store into the element of ARRAY at index IDX the value NEWELT.
|
||||
Return NEWELT. ARRAY may be a vector, a string, a char-table or a
|
||||
bool-vector. IDX starts at 0. */)
|
||||
bool-vector. IDX starts at 0.
|
||||
If ARRAY is a unibyte string, NEWELT must be a single byte (0-255).
|
||||
If ARRAY is a multibyte string, NEWELT and the previous character at
|
||||
index IDX must both be ASCII (0-127). */)
|
||||
(register Lisp_Object array, Lisp_Object idx, Lisp_Object newelt)
|
||||
{
|
||||
register EMACS_INT idxval;
|
||||
|
|
@ -2613,42 +2616,24 @@ bool-vector. IDX starts at 0. */)
|
|||
args_out_of_range (array, idx);
|
||||
CHECK_CHARACTER (newelt);
|
||||
int c = XFIXNAT (newelt);
|
||||
ptrdiff_t idxval_byte;
|
||||
int prev_bytes;
|
||||
unsigned char workbuf[MAX_MULTIBYTE_LENGTH], *p0 = workbuf, *p1;
|
||||
|
||||
if (STRING_MULTIBYTE (array))
|
||||
{
|
||||
idxval_byte = string_char_to_byte (array, idxval);
|
||||
p1 = SDATA (array) + idxval_byte;
|
||||
prev_bytes = BYTES_BY_CHAR_HEAD (*p1);
|
||||
}
|
||||
else if (SINGLE_BYTE_CHAR_P (c))
|
||||
{
|
||||
SSET (array, idxval, c);
|
||||
return newelt;
|
||||
if (c > 0x7f)
|
||||
error ("Attempt to store non-ASCII char into multibyte string");
|
||||
ptrdiff_t idxval_byte = string_char_to_byte (array, idxval);
|
||||
unsigned char *p = SDATA (array) + idxval_byte;
|
||||
if (*p > 0x7f)
|
||||
error ("Attempt to replace non-ASCII char in multibyte string");
|
||||
*p = c;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (ptrdiff_t i = SBYTES (array) - 1; i >= 0; i--)
|
||||
if (!ASCII_CHAR_P (SREF (array, i)))
|
||||
args_out_of_range (array, newelt);
|
||||
/* ARRAY is an ASCII string. Convert it to a multibyte string. */
|
||||
STRING_SET_MULTIBYTE (array);
|
||||
idxval_byte = idxval;
|
||||
p1 = SDATA (array) + idxval_byte;
|
||||
prev_bytes = 1;
|
||||
if (c > 0xff)
|
||||
error ("Attempt to store non-byte value into unibyte string");
|
||||
SSET (array, idxval, c);
|
||||
}
|
||||
|
||||
int new_bytes = CHAR_STRING (c, p0);
|
||||
if (prev_bytes != new_bytes)
|
||||
p1 = resize_string_data (array, idxval_byte, prev_bytes, new_bytes);
|
||||
|
||||
do
|
||||
*p1++ = *p0++;
|
||||
while (--new_bytes != 0);
|
||||
}
|
||||
|
||||
return newelt;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1189,12 +1189,6 @@ static Lisp_Object string_char_byte_cache_string;
|
|||
static ptrdiff_t string_char_byte_cache_charpos;
|
||||
static ptrdiff_t string_char_byte_cache_bytepos;
|
||||
|
||||
void
|
||||
clear_string_char_byte_cache (void)
|
||||
{
|
||||
string_char_byte_cache_string = Qnil;
|
||||
}
|
||||
|
||||
/* Return the byte index corresponding to CHAR_INDEX in STRING. */
|
||||
|
||||
ptrdiff_t
|
||||
|
|
|
|||
|
|
@ -4289,7 +4289,6 @@ extern Lisp_Object nconc2 (Lisp_Object, Lisp_Object);
|
|||
extern Lisp_Object assq_no_quit (Lisp_Object, Lisp_Object);
|
||||
extern Lisp_Object assq_no_signal (Lisp_Object, Lisp_Object);
|
||||
extern Lisp_Object assoc_no_quit (Lisp_Object, Lisp_Object);
|
||||
extern void clear_string_char_byte_cache (void);
|
||||
extern ptrdiff_t string_char_to_byte (Lisp_Object, ptrdiff_t);
|
||||
extern ptrdiff_t string_byte_to_char (Lisp_Object, ptrdiff_t);
|
||||
extern Lisp_Object string_to_multibyte (Lisp_Object);
|
||||
|
|
@ -4444,7 +4443,6 @@ extern void parse_str_as_multibyte (const unsigned char *, ptrdiff_t,
|
|||
|
||||
/* Defined in alloc.c. */
|
||||
extern intptr_t garbage_collection_inhibited;
|
||||
unsigned char *resize_string_data (Lisp_Object, ptrdiff_t, int, int);
|
||||
extern void malloc_warning (const char *);
|
||||
extern AVOID memory_full (size_t);
|
||||
extern AVOID buffer_memory_full (ptrdiff_t);
|
||||
|
|
|
|||
|
|
@ -1454,9 +1454,16 @@ final or penultimate step during initialization."))
|
|||
(dolist (inplace '(nil t))
|
||||
(dolist (from '(?a ?é ?Ω #x80 #x3fff80))
|
||||
(dolist (to '(?o ?á ?ƒ ?☃ #x1313f #xff #x3fffc9))
|
||||
;; Can't put a non-byte value in a non-ASCII unibyte string.
|
||||
(unless (and (not mb) (> to #xff)
|
||||
(not (string-match-p (rx bos (* ascii) eos) str)))
|
||||
(unless (or
|
||||
;; Can't put non-byte in a non-ASCII unibyte string.
|
||||
(and (not mb) (> to #xff)
|
||||
(not (string-match-p
|
||||
(rx bos (* ascii) eos) str)))
|
||||
;; Skip invalid mutation.
|
||||
(and inplace (not (if mb
|
||||
(and (<= 0 from 127)
|
||||
(<= 0 to 127))
|
||||
(<= 0 to 255)))))
|
||||
(let* ((in (copy-sequence str))
|
||||
(ref (if (and (not mb) (> from #xff))
|
||||
in ; nothing to replace
|
||||
|
|
|
|||
|
|
@ -52,11 +52,4 @@
|
|||
(dotimes (i 4)
|
||||
(should (eql (aref x i) (aref y i))))))
|
||||
|
||||
;; Bug#39207
|
||||
(ert-deftest aset-nbytes-change ()
|
||||
(let ((s (make-string 1 ?a)))
|
||||
(dolist (c (list 10003 ?b 128 ?c ?d (max-char) ?e))
|
||||
(aset s 0 c)
|
||||
(should (equal s (make-string 1 c))))))
|
||||
|
||||
;;; alloc-tests.el ends here
|
||||
|
|
|
|||
|
|
@ -929,4 +929,24 @@ comparing the subr with a much slower Lisp implementation."
|
|||
((eq subtype 'function) (cl-functionp val))
|
||||
(t (should-not (cl-typep val subtype))))))))))
|
||||
|
||||
(ert-deftest data-aset-string ()
|
||||
;; unibyte
|
||||
(let ((s (copy-sequence "abcdef")))
|
||||
(cl-assert (not (multibyte-string-p s)))
|
||||
(aset s 4 ?E)
|
||||
(should (equal s "abcdEf"))
|
||||
(aset s 2 255)
|
||||
(should (equal s "ab\377dEf"))
|
||||
(should-error (aset s 3 256)) ; not a byte value
|
||||
(should-error (aset s 3 #x3fff80))) ; not a byte value
|
||||
;; multibyte
|
||||
(let ((s (copy-sequence "abçdef")))
|
||||
(cl-assert (multibyte-string-p s))
|
||||
(aset s 4 ?E)
|
||||
(should (equal s "abçdEf"))
|
||||
(should-error (aset s 2 ?c)) ; previous char not ASCII
|
||||
(should-error (aset s 2 #xe9)) ; new char not ASCII
|
||||
(should-error (aset s 3 #x3fff80))) ; new char not ASCII
|
||||
)
|
||||
|
||||
;;; data-tests.el ends here
|
||||
|
|
|
|||
Loading…
Reference in a new issue