From 308e3ab1dbd9633b843541af55d77c82b725df02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mattias=20Engdeg=C3=A5rd?= <mattiase@acm.org>
Date: Thu, 2 May 2024 18:05:21 +0200
Subject: [PATCH] Disallow string data resizing (bug#79784)

Only allow string mutation that is certain not to require string data to
be resized and reallocated: writing bytes into a unibyte string, and
changing ASCII to ASCII in a multibyte string.

This ensures that mutation will never transform a unibyte string to
multibyte, that the size of a string in bytes never changes, and that
the byte offsets of characters remain the same.  Most importantly, it
removes a long-standing obstacle to reform of string representation and
allow for future performance improvements.

* src/data.c (Faset): Disallow resizing string mutation.
* src/fns.c (clear_string_char_byte_cache):
* src/alloc.c (resize_string_data):  Remove.
* test/src/data-tests.el (data-aset-string): New test.
* test/lisp/subr-tests.el (subr--subst-char-in-string):
Skip error cases.
* test/src/alloc-tests.el (aset-nbytes-change):
Remove test that is no longer relevant.
* doc/lispref/strings.texi (Modifying Strings):
* doc/lispref/sequences.texi (Array Functions):
* doc/lispref/text.texi (Substitution):  Update manual.
* etc/NEWS: Announce.
---
 doc/lispref/sequences.texi |  4 +--
 doc/lispref/strings.texi   | 10 +++-----
 doc/lispref/text.texi      |  2 ++
 etc/NEWS                   | 15 ++++++++++++
 src/alloc.c                | 50 --------------------------------------
 src/data.c                 | 43 +++++++++++---------------------
 src/fns.c                  |  6 -----
 src/lisp.h                 |  2 --
 test/lisp/subr-tests.el    | 13 +++++++---
 test/src/alloc-tests.el    |  7 ------
 test/src/data-tests.el     | 20 +++++++++++++++
 11 files changed, 67 insertions(+), 105 deletions(-)

diff --git a/doc/lispref/sequences.texi b/doc/lispref/sequences.texi
index 5588d32c5e9..2f7c6876a8f 100644
--- a/doc/lispref/sequences.texi
+++ b/doc/lispref/sequences.texi
@@ -1441,8 +1441,8 @@ x
 The @var{array} should be mutable.  @xref{Mutability}.
 
 If @var{array} is a string and @var{object} is not a character, a
-@code{wrong-type-argument} error results.  The function converts a
-unibyte string to multibyte if necessary to insert a character.
+@code{wrong-type-argument} error results.  For more information about
+string mutation, @pxref{Modifying Strings}.
 @end defun
 
 @defun fillarray array object
diff --git a/doc/lispref/strings.texi b/doc/lispref/strings.texi
index 93025574893..a3b335b426e 100644
--- a/doc/lispref/strings.texi
+++ b/doc/lispref/strings.texi
@@ -467,12 +467,10 @@ described in this section.  @xref{Mutability}.
   The most basic way to alter the contents of an existing string is with
 @code{aset} (@pxref{Array Functions}).  @w{@code{(aset @var{string}
 @var{idx} @var{char})}} stores @var{char} into @var{string} at character
-index @var{idx}.  It will automatically convert a pure-@acronym{ASCII}
-@var{string} to a multibyte string (@pxref{Text Representations}) if
-needed, but we recommend to always make sure @var{string} is multibyte
-(e.g., by using @code{string-to-multibyte}, @pxref{Converting
-Representations}), if @var{char} is a non-@acronym{ASCII} character, not
-a raw byte.
+index @var{idx}.  When @var{string} is a unibyte string (@pxref{Text
+Representations}), @var{char} must be a single byte (0--255); when
+@var{string} is multibyte, both @var{char} and the previous character at
+@var{idx} must be ASCII (0--127).
 
   To clear out a string that contained a password, use
 @code{clear-string}:
diff --git a/doc/lispref/text.texi b/doc/lispref/text.texi
index 60bf8ecc37b..943d08579ed 100644
--- a/doc/lispref/text.texi
+++ b/doc/lispref/text.texi
@@ -4641,6 +4641,8 @@ with @var{tochar} in @var{string}.  By default, substitution occurs in
 a copy of @var{string}, but if the optional argument @var{inplace} is
 non-@code{nil}, the function modifies the @var{string} itself.  In any
 case, the function returns the resulting string.
+
+For restrictions when altering an existing string, @pxref{Modifying Strings}.
 @end defun
 
 @deffn Command translate-region start end table
diff --git a/etc/NEWS b/etc/NEWS
index bd2ce33b851..aee83c2f604 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -2680,6 +2680,21 @@ enabled for files named "go.work".
 
 * Incompatible Lisp Changes in Emacs 31.1
 
++++
+** String mutation has been restricted further.
+'aset' on unibyte strings now requires the new character to be a single
+byte (0-255).  On multibyte strings the new character and the character
+being replaced must both be ASCII (0-127).
+
+These rules ensure that mutation will never transform a unibyte string
+to multibyte, and that the size of a string in bytes (as reported by
+'string-bytes') never changes.  They also allow strings to be
+represented more efficiently in the future.
+
+Other functions that use 'aset' to modify string data, such as
+'subst-char-in-string' with a non-nil INPLACE argument, will signal an
+error if called with arguments that would violate these rules.
+
 ** Nested backquotes are not supported any more in Pcase patterns.
 
 ---
diff --git a/src/alloc.c b/src/alloc.c
index 07ca8474bf3..9ace6f01856 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -1815,56 +1815,6 @@ allocate_string_data (struct Lisp_String *s,
   tally_consing (needed);
 }
 
-/* Reallocate multibyte STRING data when a single character is replaced.
-   The character is at byte offset CIDX_BYTE in the string.
-   The character being replaced is CLEN bytes long,
-   and the character that will replace it is NEW_CLEN bytes long.
-   Return the address where the caller should store the new character.  */
-
-unsigned char *
-resize_string_data (Lisp_Object string, ptrdiff_t cidx_byte,
-		    int clen, int new_clen)
-{
-  eassume (STRING_MULTIBYTE (string));
-  sdata *old_sdata = SDATA_OF_STRING (XSTRING (string));
-  ptrdiff_t nchars = SCHARS (string);
-  ptrdiff_t nbytes = SBYTES (string);
-  ptrdiff_t new_nbytes = nbytes + (new_clen - clen);
-  unsigned char *data = SDATA (string);
-  unsigned char *new_charaddr;
-
-  if (sdata_size (nbytes) == sdata_size (new_nbytes))
-    {
-      /* No need to reallocate, as the size change falls within the
-	 alignment slop.  */
-      XSTRING (string)->u.s.size_byte = new_nbytes;
-#ifdef GC_CHECK_STRING_BYTES
-      SDATA_NBYTES (old_sdata) = new_nbytes;
-#endif
-      new_charaddr = data + cidx_byte;
-      memmove (new_charaddr + new_clen, new_charaddr + clen,
-	       nbytes - (cidx_byte + (clen - 1)));
-    }
-  else
-    {
-      allocate_string_data (XSTRING (string), nchars, new_nbytes, false, false);
-      unsigned char *new_data = SDATA (string);
-      new_charaddr = new_data + cidx_byte;
-      memcpy (new_charaddr + new_clen, data + cidx_byte + clen,
-	      nbytes - (cidx_byte + clen));
-      memcpy (new_data, data, cidx_byte);
-
-      /* Mark old string data as free by setting its string back-pointer
-	 to null, and record the size of the data in it.  */
-      SDATA_NBYTES (old_sdata) = nbytes;
-      old_sdata->string = NULL;
-    }
-
-  clear_string_char_byte_cache ();
-
-  return new_charaddr;
-}
-
 
 /* Sweep and compact strings.  */
 
diff --git a/src/data.c b/src/data.c
index 493a8dd63fc..b8a48203bcf 100644
--- a/src/data.c
+++ b/src/data.c
@@ -2574,7 +2574,10 @@ or a byte-code object.  IDX starts at 0.  */)
 DEFUN ("aset", Faset, Saset, 3, 3, 0,
        doc: /* Store into the element of ARRAY at index IDX the value NEWELT.
 Return NEWELT.  ARRAY may be a vector, a string, a char-table or a
-bool-vector.  IDX starts at 0.  */)
+bool-vector.  IDX starts at 0.
+If ARRAY is a unibyte string, NEWELT must be a single byte (0-255).
+If ARRAY is a multibyte string, NEWELT and the previous character at
+index IDX must both be ASCII (0-127).  */)
   (register Lisp_Object array, Lisp_Object idx, Lisp_Object newelt)
 {
   register EMACS_INT idxval;
@@ -2613,42 +2616,24 @@ bool-vector.  IDX starts at 0.  */)
 	args_out_of_range (array, idx);
       CHECK_CHARACTER (newelt);
       int c = XFIXNAT (newelt);
-      ptrdiff_t idxval_byte;
-      int prev_bytes;
-      unsigned char workbuf[MAX_MULTIBYTE_LENGTH], *p0 = workbuf, *p1;
 
       if (STRING_MULTIBYTE (array))
 	{
-	  idxval_byte = string_char_to_byte (array, idxval);
-	  p1 = SDATA (array) + idxval_byte;
-	  prev_bytes = BYTES_BY_CHAR_HEAD (*p1);
-	}
-      else if (SINGLE_BYTE_CHAR_P (c))
-	{
-	  SSET (array, idxval, c);
-	  return newelt;
+	  if (c > 0x7f)
+	    error ("Attempt to store non-ASCII char into multibyte string");
+	  ptrdiff_t idxval_byte = string_char_to_byte (array, idxval);
+	  unsigned char *p = SDATA (array) + idxval_byte;
+	  if (*p > 0x7f)
+	    error ("Attempt to replace non-ASCII char in multibyte string");
+	  *p = c;
 	}
       else
 	{
-	  for (ptrdiff_t i = SBYTES (array) - 1; i >= 0; i--)
-	    if (!ASCII_CHAR_P (SREF (array, i)))
-	      args_out_of_range (array, newelt);
-	  /* ARRAY is an ASCII string.  Convert it to a multibyte string.  */
-	  STRING_SET_MULTIBYTE (array);
-	  idxval_byte = idxval;
-	  p1 = SDATA (array) + idxval_byte;
-	  prev_bytes = 1;
+	  if (c > 0xff)
+	    error ("Attempt to store non-byte value into unibyte string");
+	  SSET (array, idxval, c);
 	}
-
-      int new_bytes = CHAR_STRING (c, p0);
-      if (prev_bytes != new_bytes)
-	p1 = resize_string_data (array, idxval_byte, prev_bytes, new_bytes);
-
-      do
-	*p1++ = *p0++;
-      while (--new_bytes != 0);
     }
-
   return newelt;
 }
 
diff --git a/src/fns.c b/src/fns.c
index 1cf63384218..5334c9f94a8 100644
--- a/src/fns.c
+++ b/src/fns.c
@@ -1189,12 +1189,6 @@ static Lisp_Object string_char_byte_cache_string;
 static ptrdiff_t string_char_byte_cache_charpos;
 static ptrdiff_t string_char_byte_cache_bytepos;
 
-void
-clear_string_char_byte_cache (void)
-{
-  string_char_byte_cache_string = Qnil;
-}
-
 /* Return the byte index corresponding to CHAR_INDEX in STRING.  */
 
 ptrdiff_t
diff --git a/src/lisp.h b/src/lisp.h
index 64b5c227583..fe942c917f0 100644
--- a/src/lisp.h
+++ b/src/lisp.h
@@ -4289,7 +4289,6 @@ extern Lisp_Object nconc2 (Lisp_Object, Lisp_Object);
 extern Lisp_Object assq_no_quit (Lisp_Object, Lisp_Object);
 extern Lisp_Object assq_no_signal (Lisp_Object, Lisp_Object);
 extern Lisp_Object assoc_no_quit (Lisp_Object, Lisp_Object);
-extern void clear_string_char_byte_cache (void);
 extern ptrdiff_t string_char_to_byte (Lisp_Object, ptrdiff_t);
 extern ptrdiff_t string_byte_to_char (Lisp_Object, ptrdiff_t);
 extern Lisp_Object string_to_multibyte (Lisp_Object);
@@ -4444,7 +4443,6 @@ extern void parse_str_as_multibyte (const unsigned char *, ptrdiff_t,
 
 /* Defined in alloc.c.  */
 extern intptr_t garbage_collection_inhibited;
-unsigned char *resize_string_data (Lisp_Object, ptrdiff_t, int, int);
 extern void malloc_warning (const char *);
 extern AVOID memory_full (size_t);
 extern AVOID buffer_memory_full (ptrdiff_t);
diff --git a/test/lisp/subr-tests.el b/test/lisp/subr-tests.el
index de2c59b9c25..a4059a7d290 100644
--- a/test/lisp/subr-tests.el
+++ b/test/lisp/subr-tests.el
@@ -1454,9 +1454,16 @@ final or penultimate step during initialization."))
           (dolist (inplace '(nil t))
             (dolist (from '(?a ?é ?Ω #x80 #x3fff80))
               (dolist (to '(?o ?á ?ƒ ?☃ #x1313f #xff #x3fffc9))
-                ;; Can't put a non-byte value in a non-ASCII unibyte string.
-                (unless (and (not mb) (> to #xff)
-                             (not (string-match-p (rx bos (* ascii) eos) str)))
+                (unless (or
+                         ;; Can't put non-byte in a non-ASCII unibyte string.
+                         (and (not mb) (> to #xff)
+                              (not (string-match-p
+                                    (rx bos (* ascii) eos) str)))
+                         ;; Skip illegal mutation.
+                         (and inplace (not (if mb
+                                               (and (<= 0 from 127)
+                                                    (<= 0 to 127))
+                                             (<= 0 to 255)))))
                   (let* ((in (copy-sequence str))
                          (ref (if (and (not mb) (> from #xff))
                                   in    ; nothing to replace
diff --git a/test/src/alloc-tests.el b/test/src/alloc-tests.el
index cba69023044..cf7d1ca1cd3 100644
--- a/test/src/alloc-tests.el
+++ b/test/src/alloc-tests.el
@@ -52,11 +52,4 @@
     (dotimes (i 4)
       (should (eql (aref x i) (aref y i))))))
 
-;; Bug#39207
-(ert-deftest aset-nbytes-change ()
-  (let ((s (make-string 1 ?a)))
-    (dolist (c (list 10003 ?b 128 ?c ?d (max-char) ?e))
-      (aset s 0 c)
-      (should (equal s (make-string 1 c))))))
-
 ;;; alloc-tests.el ends here
diff --git a/test/src/data-tests.el b/test/src/data-tests.el
index 1eaf1759c17..e93cc3831f9 100644
--- a/test/src/data-tests.el
+++ b/test/src/data-tests.el
@@ -929,4 +929,24 @@ comparing the subr with a much slower Lisp implementation."
              ((eq subtype 'function) (cl-functionp val))
              (t (should-not (cl-typep val subtype))))))))))
 
+(ert-deftest data-aset-string ()
+  ;; unibyte
+  (let ((s (copy-sequence "abcdef")))
+    (cl-assert (not (multibyte-string-p s)))
+    (aset s 4 ?E)
+    (should (equal s "abcdEf"))
+    (aset s 2 255)
+    (should (equal s "ab\377dEf"))
+    (should-error (aset s 3 256))       ; not a byte value
+    (should-error (aset s 3 #x3fff80))) ; not a byte value
+  ;; multibyte
+  (let ((s (copy-sequence "abçdef")))
+    (cl-assert (multibyte-string-p s))
+    (aset s 4 ?E)
+    (should (equal s "abçdEf"))
+    (should-error (aset s 2 ?c))        ; previous char not ASCII
+    (should-error (aset s 2 #xe9))      ; new char not ASCII
+    (should-error (aset s 3 #x3fff80))) ; new char not ASCII
+  )
+
 ;;; data-tests.el ends here