From 308e3ab1dbd9633b843541af55d77c82b725df02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20Engdeg=C3=A5rd?= Date: Thu, 2 May 2024 18:05:21 +0200 Subject: [PATCH] Disallow string data resizing (bug#79784) Only allow string mutation that is certain not to require string data to be resized and reallocated: writing bytes into a unibyte string, and changing ASCII to ASCII in a multibyte string. This ensures that mutation will never transform a unibyte string to multibyte, that the size of a string in bytes never changes, and that the byte offsets of characters remain the same. Most importantly, it removes a long-standing obstacle to reform of string representation and allow for future performance improvements. * src/data.c (Faset): Disallow resizing string mutation. * src/fns.c (clear_string_char_byte_cache): * src/alloc.c (resize_string_data): Remove. * test/src/data-tests.el (data-aset-string): New test. * test/lisp/subr-tests.el (subr--subst-char-in-string): Skip error cases. * test/src/alloc-tests.el (aset-nbytes-change): Remove test that is no longer relevant. * doc/lispref/strings.texi (Modifying Strings): * doc/lispref/sequences.texi (Array Functions): * doc/lispref/text.texi (Substitution): Update manual. * etc/NEWS: Announce. --- doc/lispref/sequences.texi | 4 +-- doc/lispref/strings.texi | 10 +++----- doc/lispref/text.texi | 2 ++ etc/NEWS | 15 ++++++++++++ src/alloc.c | 50 -------------------------------------- src/data.c | 43 +++++++++++--------------------- src/fns.c | 6 ----- src/lisp.h | 2 -- test/lisp/subr-tests.el | 13 +++++++--- test/src/alloc-tests.el | 7 ------ test/src/data-tests.el | 20 +++++++++++++++ 11 files changed, 67 insertions(+), 105 deletions(-) diff --git a/doc/lispref/sequences.texi b/doc/lispref/sequences.texi index 5588d32c5e9..2f7c6876a8f 100644 --- a/doc/lispref/sequences.texi +++ b/doc/lispref/sequences.texi @@ -1441,8 +1441,8 @@ x The @var{array} should be mutable. @xref{Mutability}. If @var{array} is a string and @var{object} is not a character, a -@code{wrong-type-argument} error results. The function converts a -unibyte string to multibyte if necessary to insert a character. +@code{wrong-type-argument} error results. For more information about +string mutation, @pxref{Modifying Strings}. @end defun @defun fillarray array object diff --git a/doc/lispref/strings.texi b/doc/lispref/strings.texi index 93025574893..a3b335b426e 100644 --- a/doc/lispref/strings.texi +++ b/doc/lispref/strings.texi @@ -467,12 +467,10 @@ described in this section. @xref{Mutability}. The most basic way to alter the contents of an existing string is with @code{aset} (@pxref{Array Functions}). @w{@code{(aset @var{string} @var{idx} @var{char})}} stores @var{char} into @var{string} at character -index @var{idx}. It will automatically convert a pure-@acronym{ASCII} -@var{string} to a multibyte string (@pxref{Text Representations}) if -needed, but we recommend to always make sure @var{string} is multibyte -(e.g., by using @code{string-to-multibyte}, @pxref{Converting -Representations}), if @var{char} is a non-@acronym{ASCII} character, not -a raw byte. +index @var{idx}. When @var{string} is a unibyte string (@pxref{Text +Representations}), @var{char} must be a single byte (0--255); when +@var{string} is multibyte, both @var{char} and the previous character at +@var{idx} must be ASCII (0--127). To clear out a string that contained a password, use @code{clear-string}: diff --git a/doc/lispref/text.texi b/doc/lispref/text.texi index 60bf8ecc37b..943d08579ed 100644 --- a/doc/lispref/text.texi +++ b/doc/lispref/text.texi @@ -4641,6 +4641,8 @@ with @var{tochar} in @var{string}. By default, substitution occurs in a copy of @var{string}, but if the optional argument @var{inplace} is non-@code{nil}, the function modifies the @var{string} itself. In any case, the function returns the resulting string. + +For restrictions when altering an existing string, @pxref{Modifying Strings}. @end defun @deffn Command translate-region start end table diff --git a/etc/NEWS b/etc/NEWS index bd2ce33b851..aee83c2f604 100644 --- a/etc/NEWS +++ b/etc/NEWS @@ -2680,6 +2680,21 @@ enabled for files named "go.work". * Incompatible Lisp Changes in Emacs 31.1 ++++ +** String mutation has been restricted further. +'aset' on unibyte strings now requires the new character to be a single +byte (0-255). On multibyte strings the new character and the character +being replaced must both be ASCII (0-127). + +These rules ensure that mutation will never transform a unibyte string +to multibyte, and that the size of a string in bytes (as reported by +'string-bytes') never changes. They also allow strings to be +represented more efficiently in the future. + +Other functions that use 'aset' to modify string data, such as +'subst-char-in-string' with a non-nil INPLACE argument, will signal an +error if called with arguments that would violate these rules. + ** Nested backquotes are not supported any more in Pcase patterns. --- diff --git a/src/alloc.c b/src/alloc.c index 07ca8474bf3..9ace6f01856 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -1815,56 +1815,6 @@ allocate_string_data (struct Lisp_String *s, tally_consing (needed); } -/* Reallocate multibyte STRING data when a single character is replaced. - The character is at byte offset CIDX_BYTE in the string. - The character being replaced is CLEN bytes long, - and the character that will replace it is NEW_CLEN bytes long. - Return the address where the caller should store the new character. */ - -unsigned char * -resize_string_data (Lisp_Object string, ptrdiff_t cidx_byte, - int clen, int new_clen) -{ - eassume (STRING_MULTIBYTE (string)); - sdata *old_sdata = SDATA_OF_STRING (XSTRING (string)); - ptrdiff_t nchars = SCHARS (string); - ptrdiff_t nbytes = SBYTES (string); - ptrdiff_t new_nbytes = nbytes + (new_clen - clen); - unsigned char *data = SDATA (string); - unsigned char *new_charaddr; - - if (sdata_size (nbytes) == sdata_size (new_nbytes)) - { - /* No need to reallocate, as the size change falls within the - alignment slop. */ - XSTRING (string)->u.s.size_byte = new_nbytes; -#ifdef GC_CHECK_STRING_BYTES - SDATA_NBYTES (old_sdata) = new_nbytes; -#endif - new_charaddr = data + cidx_byte; - memmove (new_charaddr + new_clen, new_charaddr + clen, - nbytes - (cidx_byte + (clen - 1))); - } - else - { - allocate_string_data (XSTRING (string), nchars, new_nbytes, false, false); - unsigned char *new_data = SDATA (string); - new_charaddr = new_data + cidx_byte; - memcpy (new_charaddr + new_clen, data + cidx_byte + clen, - nbytes - (cidx_byte + clen)); - memcpy (new_data, data, cidx_byte); - - /* Mark old string data as free by setting its string back-pointer - to null, and record the size of the data in it. */ - SDATA_NBYTES (old_sdata) = nbytes; - old_sdata->string = NULL; - } - - clear_string_char_byte_cache (); - - return new_charaddr; -} - /* Sweep and compact strings. */ diff --git a/src/data.c b/src/data.c index 493a8dd63fc..b8a48203bcf 100644 --- a/src/data.c +++ b/src/data.c @@ -2574,7 +2574,10 @@ or a byte-code object. IDX starts at 0. */) DEFUN ("aset", Faset, Saset, 3, 3, 0, doc: /* Store into the element of ARRAY at index IDX the value NEWELT. Return NEWELT. ARRAY may be a vector, a string, a char-table or a -bool-vector. IDX starts at 0. */) +bool-vector. IDX starts at 0. +If ARRAY is a unibyte string, NEWELT must be a single byte (0-255). +If ARRAY is a multibyte string, NEWELT and the previous character at +index IDX must both be ASCII (0-127). */) (register Lisp_Object array, Lisp_Object idx, Lisp_Object newelt) { register EMACS_INT idxval; @@ -2613,42 +2616,24 @@ bool-vector. IDX starts at 0. */) args_out_of_range (array, idx); CHECK_CHARACTER (newelt); int c = XFIXNAT (newelt); - ptrdiff_t idxval_byte; - int prev_bytes; - unsigned char workbuf[MAX_MULTIBYTE_LENGTH], *p0 = workbuf, *p1; if (STRING_MULTIBYTE (array)) { - idxval_byte = string_char_to_byte (array, idxval); - p1 = SDATA (array) + idxval_byte; - prev_bytes = BYTES_BY_CHAR_HEAD (*p1); - } - else if (SINGLE_BYTE_CHAR_P (c)) - { - SSET (array, idxval, c); - return newelt; + if (c > 0x7f) + error ("Attempt to store non-ASCII char into multibyte string"); + ptrdiff_t idxval_byte = string_char_to_byte (array, idxval); + unsigned char *p = SDATA (array) + idxval_byte; + if (*p > 0x7f) + error ("Attempt to replace non-ASCII char in multibyte string"); + *p = c; } else { - for (ptrdiff_t i = SBYTES (array) - 1; i >= 0; i--) - if (!ASCII_CHAR_P (SREF (array, i))) - args_out_of_range (array, newelt); - /* ARRAY is an ASCII string. Convert it to a multibyte string. */ - STRING_SET_MULTIBYTE (array); - idxval_byte = idxval; - p1 = SDATA (array) + idxval_byte; - prev_bytes = 1; + if (c > 0xff) + error ("Attempt to store non-byte value into unibyte string"); + SSET (array, idxval, c); } - - int new_bytes = CHAR_STRING (c, p0); - if (prev_bytes != new_bytes) - p1 = resize_string_data (array, idxval_byte, prev_bytes, new_bytes); - - do - *p1++ = *p0++; - while (--new_bytes != 0); } - return newelt; } diff --git a/src/fns.c b/src/fns.c index 1cf63384218..5334c9f94a8 100644 --- a/src/fns.c +++ b/src/fns.c @@ -1189,12 +1189,6 @@ static Lisp_Object string_char_byte_cache_string; static ptrdiff_t string_char_byte_cache_charpos; static ptrdiff_t string_char_byte_cache_bytepos; -void -clear_string_char_byte_cache (void) -{ - string_char_byte_cache_string = Qnil; -} - /* Return the byte index corresponding to CHAR_INDEX in STRING. */ ptrdiff_t diff --git a/src/lisp.h b/src/lisp.h index 64b5c227583..fe942c917f0 100644 --- a/src/lisp.h +++ b/src/lisp.h @@ -4289,7 +4289,6 @@ extern Lisp_Object nconc2 (Lisp_Object, Lisp_Object); extern Lisp_Object assq_no_quit (Lisp_Object, Lisp_Object); extern Lisp_Object assq_no_signal (Lisp_Object, Lisp_Object); extern Lisp_Object assoc_no_quit (Lisp_Object, Lisp_Object); -extern void clear_string_char_byte_cache (void); extern ptrdiff_t string_char_to_byte (Lisp_Object, ptrdiff_t); extern ptrdiff_t string_byte_to_char (Lisp_Object, ptrdiff_t); extern Lisp_Object string_to_multibyte (Lisp_Object); @@ -4444,7 +4443,6 @@ extern void parse_str_as_multibyte (const unsigned char *, ptrdiff_t, /* Defined in alloc.c. */ extern intptr_t garbage_collection_inhibited; -unsigned char *resize_string_data (Lisp_Object, ptrdiff_t, int, int); extern void malloc_warning (const char *); extern AVOID memory_full (size_t); extern AVOID buffer_memory_full (ptrdiff_t); diff --git a/test/lisp/subr-tests.el b/test/lisp/subr-tests.el index de2c59b9c25..a4059a7d290 100644 --- a/test/lisp/subr-tests.el +++ b/test/lisp/subr-tests.el @@ -1454,9 +1454,16 @@ final or penultimate step during initialization.")) (dolist (inplace '(nil t)) (dolist (from '(?a ?é ?Ω #x80 #x3fff80)) (dolist (to '(?o ?á ?ƒ ?☃ #x1313f #xff #x3fffc9)) - ;; Can't put a non-byte value in a non-ASCII unibyte string. - (unless (and (not mb) (> to #xff) - (not (string-match-p (rx bos (* ascii) eos) str))) + (unless (or + ;; Can't put non-byte in a non-ASCII unibyte string. + (and (not mb) (> to #xff) + (not (string-match-p + (rx bos (* ascii) eos) str))) + ;; Skip illegal mutation. + (and inplace (not (if mb + (and (<= 0 from 127) + (<= 0 to 127)) + (<= 0 to 255))))) (let* ((in (copy-sequence str)) (ref (if (and (not mb) (> from #xff)) in ; nothing to replace diff --git a/test/src/alloc-tests.el b/test/src/alloc-tests.el index cba69023044..cf7d1ca1cd3 100644 --- a/test/src/alloc-tests.el +++ b/test/src/alloc-tests.el @@ -52,11 +52,4 @@ (dotimes (i 4) (should (eql (aref x i) (aref y i)))))) -;; Bug#39207 -(ert-deftest aset-nbytes-change () - (let ((s (make-string 1 ?a))) - (dolist (c (list 10003 ?b 128 ?c ?d (max-char) ?e)) - (aset s 0 c) - (should (equal s (make-string 1 c)))))) - ;;; alloc-tests.el ends here diff --git a/test/src/data-tests.el b/test/src/data-tests.el index 1eaf1759c17..e93cc3831f9 100644 --- a/test/src/data-tests.el +++ b/test/src/data-tests.el @@ -929,4 +929,24 @@ comparing the subr with a much slower Lisp implementation." ((eq subtype 'function) (cl-functionp val)) (t (should-not (cl-typep val subtype)))))))))) +(ert-deftest data-aset-string () + ;; unibyte + (let ((s (copy-sequence "abcdef"))) + (cl-assert (not (multibyte-string-p s))) + (aset s 4 ?E) + (should (equal s "abcdEf")) + (aset s 2 255) + (should (equal s "ab\377dEf")) + (should-error (aset s 3 256)) ; not a byte value + (should-error (aset s 3 #x3fff80))) ; not a byte value + ;; multibyte + (let ((s (copy-sequence "abçdef"))) + (cl-assert (multibyte-string-p s)) + (aset s 4 ?E) + (should (equal s "abçdEf")) + (should-error (aset s 2 ?c)) ; previous char not ASCII + (should-error (aset s 2 #xe9)) ; new char not ASCII + (should-error (aset s 3 #x3fff80))) ; new char not ASCII + ) + ;;; data-tests.el ends here