diff mbox series

[dunfell,5/7] gcc: Fix -fstack-protector issue on aarch64

Message ID d411ef9f660c443c00eee9bfbbc8c60c3cd0e92d.1695737244.git.steve@sakoman.com
State Accepted, archived
Delegated to: Steve Sakoman
Headers show
Series [dunfell,1/7] gdb: Fix CVE-2023-39128 | expand

Commit Message

Steve Sakoman Sept. 26, 2023, 2:12 p.m. UTC
From: Ross Burton <ross.burton@arm.com>

This series of patches fixes deficiencies in GCC's -fstack-protector
implementation for AArch64 when using dynamically allocated stack space.
This is CVE-2023-4039.  See:

https://developer.arm.com/Arm%20Security%20Center/GCC%20Stack%20Protector%20Vulnerability%20AArch64
https://github.com/metaredteam/external-disclosures/security/advisories/GHSA-x7ch-h5rf-w2mf

for more details.

Signed-off-by: Ross Burton <ross.burton@arm.com>
Signed-off-by: Steve Sakoman <steve@sakoman.com>
---
 meta/recipes-devtools/gcc/gcc-9.5.inc         |    1 +
 .../gcc/gcc-9.5/CVE-2023-4039.patch           | 1506 +++++++++++++++++
 2 files changed, 1507 insertions(+)
 create mode 100644 meta/recipes-devtools/gcc/gcc-9.5/CVE-2023-4039.patch
diff mbox series

Patch

diff --git a/meta/recipes-devtools/gcc/gcc-9.5.inc b/meta/recipes-devtools/gcc/gcc-9.5.inc
index 23bfb1a9db..9bb41bbe24 100644
--- a/meta/recipes-devtools/gcc/gcc-9.5.inc
+++ b/meta/recipes-devtools/gcc/gcc-9.5.inc
@@ -70,6 +70,7 @@  SRC_URI = "\
            file://0038-gentypes-genmodes-Do-not-use-__LINE__-for-maintainin.patch \
            file://0039-process_alt_operands-Don-t-match-user-defined-regs-o.patch \
            file://0002-libstdc-Fix-inconsistent-noexcept-specific-for-valar.patch \
+           file://CVE-2023-4039.patch \
 "
 S = "${TMPDIR}/work-shared/gcc-${PV}-${PR}/gcc-${PV}"
 SRC_URI[sha256sum] = "27769f64ef1d4cd5e2be8682c0c93f9887983e6cfd1a927ce5a0a2915a95cf8f"
diff --git a/meta/recipes-devtools/gcc/gcc-9.5/CVE-2023-4039.patch b/meta/recipes-devtools/gcc/gcc-9.5/CVE-2023-4039.patch
new file mode 100644
index 0000000000..56d229066f
--- /dev/null
+++ b/meta/recipes-devtools/gcc/gcc-9.5/CVE-2023-4039.patch
@@ -0,0 +1,1506 @@ 
+From: Richard Sandiford <richard.sandiford@arm.com>
+Subject: [PATCH 00/19] aarch64: Fix -fstack-protector issue
+Date: Tue, 12 Sep 2023 16:25:10 +0100
+
+This series of patches fixes deficiencies in GCC's -fstack-protector
+implementation for AArch64 when using dynamically allocated stack space.
+This is CVE-2023-4039.  See:
+
+https://developer.arm.com/Arm%20Security%20Center/GCC%20Stack%20Protector%20Vulnerability%20AArch64
+https://github.com/metaredteam/external-disclosures/security/advisories/GHSA-x7ch-h5rf-w2mf
+
+for more details.
+
+The fix is to put the saved registers above the locals area when
+-fstack-protector is used.
+
+The series also fixes a stack-clash problem that I found while working
+on the CVE.  In unpatched sources, the stack-clash problem would only
+trigger for unrealistic numbers of arguments (8K 64-bit arguments, or an
+equivalent).  But it would be a more significant issue with the new
+-fstack-protector frame layout.  It's therefore important that both
+problems are fixed together.
+
+Some reorganisation of the code seemed necessary to fix the problems in a
+cleanish way.  The series is therefore quite long, but only a handful of
+patches should have any effect on code generation.
+
+See the individual patches for a detailed description.
+
+Tested on aarch64-linux-gnu. Pushed to trunk and to all active branches.
+I've also pushed backports to GCC 7+ to vendors/ARM/heads/CVE-2023-4039.
+
+CVE: CVE-2023-4039
+Upstream-Status: Submitted
+Signed-off-by: Ross Burton <ross.burton@arm.com>
+  
+  
+From 78ebdb7b12d5e258b9811bab715734454268fd0c Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Fri, 16 Jun 2023 17:00:51 +0100
+Subject: [PATCH 01/10] aarch64: Explicitly handle frames with no saved
+ registers
+
+If a frame has no saved registers, it can be allocated in one go.
+There is no need to treat the areas below and above the saved
+registers as separate.
+
+And if we allocate the frame in one go, it should be allocated
+as the initial_adjust rather than the final_adjust.  This allows the
+frame size to grow to guard_size - guard_used_by_caller before a stack
+probe is needed.  (A frame with no register saves is necessarily a
+leaf frame.)
+
+This is a no-op as things stand, since a leaf function will have
+no outgoing arguments, and so all the frame will be above where
+the saved registers normally go.
+
+gcc/
+	* config/aarch64/aarch64.c (aarch64_layout_frame): Explicitly
+	allocate the frame in one go if there are no saved registers.
+---
+ gcc/config/aarch64/aarch64.c | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
+index a35dceab9fc..e9dad682738 100644
+--- a/gcc/config/aarch64/aarch64.c
++++ b/gcc/config/aarch64/aarch64.c
+@@ -4771,9 +4771,11 @@ aarch64_layout_frame (void)
+     max_push_offset = 256;
+ 
+   HOST_WIDE_INT const_size, const_fp_offset;
+-  if (cfun->machine->frame.frame_size.is_constant (&const_size)
+-      && const_size < max_push_offset
+-      && known_eq (crtl->outgoing_args_size, 0))
++  if (cfun->machine->frame.saved_regs_size == 0)
++    cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
++  else if (cfun->machine->frame.frame_size.is_constant (&const_size)
++	   && const_size < max_push_offset
++	   && known_eq (crtl->outgoing_args_size, 0))
+     {
+       /* Simple, small frame with no outgoing arguments:
+ 	 stp reg1, reg2, [sp, -frame_size]!
+-- 
+2.34.1
+
+
+From 347487fffa0266d43bf18f1f91878410881f596e Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Fri, 16 Jun 2023 16:55:12 +0100
+Subject: [PATCH 02/10] aarch64: Add bytes_below_hard_fp to frame info
+
+The frame layout code currently hard-codes the assumption that
+the number of bytes below the saved registers is equal to the
+size of the outgoing arguments.  This patch abstracts that
+value into a new field of aarch64_frame.
+
+gcc/
+	* config/aarch64/aarch64.h (aarch64_frame::bytes_below_hard_fp): New
+	field.
+	* config/aarch64/aarch64.c (aarch64_layout_frame): Initialize it,
+	and use it instead of crtl->outgoing_args_size.
+	(aarch64_get_separate_components): Use bytes_below_hard_fp instead
+	of outgoing_args_size.
+	(aarch64_process_components): Likewise.
+---
+ gcc/config/aarch64/aarch64.c | 50 +++++++++++++++++++-----------------
+ gcc/config/aarch64/aarch64.h |  6 ++++-
+ 2 files changed, 32 insertions(+), 24 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
+index e9dad682738..25cf10cc4b9 100644
+--- a/gcc/config/aarch64/aarch64.c
++++ b/gcc/config/aarch64/aarch64.c
+@@ -4684,6 +4684,8 @@ aarch64_layout_frame (void)
+ 	last_fp_reg = regno;
+       }
+ 
++  cfun->machine->frame.bytes_below_hard_fp = crtl->outgoing_args_size;
++
+   if (cfun->machine->frame.emit_frame_chain)
+     {
+       /* FP and LR are placed in the linkage record.  */
+@@ -4751,11 +4753,11 @@ aarch64_layout_frame (void)
+ 			   STACK_BOUNDARY / BITS_PER_UNIT);
+ 
+   /* Both these values are already aligned.  */
+-  gcc_assert (multiple_p (crtl->outgoing_args_size,
++  gcc_assert (multiple_p (cfun->machine->frame.bytes_below_hard_fp,
+ 			  STACK_BOUNDARY / BITS_PER_UNIT));
+   cfun->machine->frame.frame_size
+     = (cfun->machine->frame.hard_fp_offset
+-       + crtl->outgoing_args_size);
++       + cfun->machine->frame.bytes_below_hard_fp);
+ 
+   cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
+ 
+@@ -4775,23 +4777,23 @@ aarch64_layout_frame (void)
+     cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
+   else if (cfun->machine->frame.frame_size.is_constant (&const_size)
+ 	   && const_size < max_push_offset
+-	   && known_eq (crtl->outgoing_args_size, 0))
++	   && known_eq (cfun->machine->frame.bytes_below_hard_fp, 0))
+     {
+-      /* Simple, small frame with no outgoing arguments:
++      /* Simple, small frame with no data below the saved registers.
+ 	 stp reg1, reg2, [sp, -frame_size]!
+ 	 stp reg3, reg4, [sp, 16]  */
+       cfun->machine->frame.callee_adjust = const_size;
+     }
+-  else if (known_lt (crtl->outgoing_args_size
++  else if (known_lt (cfun->machine->frame.bytes_below_hard_fp
+ 		     + cfun->machine->frame.saved_regs_size, 512)
+ 	   && !(cfun->calls_alloca
+ 		&& known_lt (cfun->machine->frame.hard_fp_offset,
+ 			     max_push_offset)))
+     {
+-      /* Frame with small outgoing arguments:
++      /* Frame with small area below the saved registers:
+ 	 sub sp, sp, frame_size
+-	 stp reg1, reg2, [sp, outgoing_args_size]
+-	 stp reg3, reg4, [sp, outgoing_args_size + 16]  */
++	 stp reg1, reg2, [sp, bytes_below_hard_fp]
++	 stp reg3, reg4, [sp, bytes_below_hard_fp + 16]  */
+       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
+       cfun->machine->frame.callee_offset
+ 	= cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
+@@ -4799,22 +4801,23 @@ aarch64_layout_frame (void)
+   else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
+ 	   && const_fp_offset < max_push_offset)
+     {
+-      /* Frame with large outgoing arguments but a small local area:
++      /* Frame with large area below the saved registers, but with a
++	 small area above:
+ 	 stp reg1, reg2, [sp, -hard_fp_offset]!
+ 	 stp reg3, reg4, [sp, 16]
+-	 sub sp, sp, outgoing_args_size  */
++	 sub sp, sp, bytes_below_hard_fp  */
+       cfun->machine->frame.callee_adjust = const_fp_offset;
+       cfun->machine->frame.final_adjust
+ 	= cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
+     }
+   else
+     {
+-      /* Frame with large local area and outgoing arguments using frame pointer:
++      /* General case:
+ 	 sub sp, sp, hard_fp_offset
+ 	 stp x29, x30, [sp, 0]
+ 	 add x29, sp, 0
+ 	 stp reg3, reg4, [sp, 16]
+-	 sub sp, sp, outgoing_args_size  */
++	 sub sp, sp, bytes_below_hard_fp  */
+       cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
+       cfun->machine->frame.final_adjust
+ 	= cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
+@@ -5243,9 +5246,11 @@ aarch64_get_separate_components (void)
+     if (aarch64_register_saved_on_entry (regno))
+       {
+ 	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
++
++	/* Get the offset relative to the register we'll use.  */
+ 	if (!frame_pointer_needed)
+-	  offset += cfun->machine->frame.frame_size
+-		    - cfun->machine->frame.hard_fp_offset;
++	  offset += cfun->machine->frame.bytes_below_hard_fp;
++
+ 	/* Check that we can access the stack slot of the register with one
+ 	   direct load with no adjustments needed.  */
+ 	if (offset_12bit_unsigned_scaled_p (DImode, offset))
+@@ -5367,8 +5372,8 @@ aarch64_process_components (sbitmap components, bool prologue_p)
+       rtx reg = gen_rtx_REG (mode, regno);
+       poly_int64 offset = cfun->machine->frame.reg_offset[regno];
+       if (!frame_pointer_needed)
+-	offset += cfun->machine->frame.frame_size
+-		  - cfun->machine->frame.hard_fp_offset;
++	offset += cfun->machine->frame.bytes_below_hard_fp;
++
+       rtx addr = plus_constant (Pmode, ptr_reg, offset);
+       rtx mem = gen_frame_mem (mode, addr);
+ 
+@@ -5410,8 +5415,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
+       /* REGNO2 can be saved/restored in a pair with REGNO.  */
+       rtx reg2 = gen_rtx_REG (mode, regno2);
+       if (!frame_pointer_needed)
+-	offset2 += cfun->machine->frame.frame_size
+-		  - cfun->machine->frame.hard_fp_offset;
++	offset2 += cfun->machine->frame.bytes_below_hard_fp;
+       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
+       rtx mem2 = gen_frame_mem (mode, addr2);
+       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
+@@ -5478,10 +5482,10 @@ aarch64_stack_clash_protection_alloca_probe_range (void)
+    registers.  If POLY_SIZE is not large enough to require a probe this function
+    will only adjust the stack.  When allocating the stack space
+    FRAME_RELATED_P is then used to indicate if the allocation is frame related.
+-   FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
+-   arguments.  If we are then we ensure that any allocation larger than the ABI
+-   defined buffer needs a probe so that the invariant of having a 1KB buffer is
+-   maintained.
++   FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
++   the saved registers.  If we are then we ensure that any allocation
++   larger than the ABI defined buffer needs a probe so that the
++   invariant of having a 1KB buffer is maintained.
+ 
+    We emit barriers after each stack adjustment to prevent optimizations from
+    breaking the invariant that we never drop the stack more than a page.  This
+@@ -5671,7 +5675,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+   /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
+      be probed.  This maintains the requirement that each page is probed at
+      least once.  For initial probing we probe only if the allocation is
+-     more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
++     more than GUARD_SIZE - buffer, and below the saved registers we probe
+      if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
+      GUARD_SIZE.  This works that for any allocation that is large enough to
+      trigger a probe here, we'll have at least one, and if they're not large
+diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
+index af0bc3f1881..95831637ba7 100644
+--- a/gcc/config/aarch64/aarch64.h
++++ b/gcc/config/aarch64/aarch64.h
+@@ -712,9 +712,13 @@ struct GTY (()) aarch64_frame
+   HOST_WIDE_INT saved_varargs_size;
+ 
+   /* The size of the saved callee-save int/FP registers.  */
+-
+   HOST_WIDE_INT saved_regs_size;
+ 
++  /* The number of bytes between the bottom of the static frame (the bottom
++     of the outgoing arguments) and the hard frame pointer.  This value is
++     always a multiple of STACK_BOUNDARY.  */
++  poly_int64 bytes_below_hard_fp;
++
+   /* Offset from the base of the frame (incomming SP) to the
+      top of the locals area.  This value is always a multiple of
+      STACK_BOUNDARY.  */
+-- 
+2.34.1
+
+
+From 4604c4cd0a6c4c26d6594ec9a0383b4d9197d9df Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 27 Jun 2023 11:25:40 +0100
+Subject: [PATCH 03/10] aarch64: Rename locals_offset to bytes_above_locals
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+locals_offset was described as:
+
+  /* Offset from the base of the frame (incomming SP) to the
+     top of the locals area.  This value is always a multiple of
+     STACK_BOUNDARY.  */
+
+This is implicitly an “upside down” view of the frame: the incoming
+SP is at offset 0, and anything N bytes below the incoming SP is at
+offset N (rather than -N).
+
+However, reg_offset instead uses a “right way up” view; that is,
+it views offsets in address terms.  Something above X is at a
+positive offset from X and something below X is at a negative
+offset from X.
+
+Also, even on FRAME_GROWS_DOWNWARD targets like AArch64,
+target-independent code views offsets in address terms too:
+locals are allocated at negative offsets to virtual_stack_vars.
+
+It seems confusing to have *_offset fields of the same structure
+using different polarities like this.  This patch tries to avoid
+that by renaming locals_offset to bytes_above_locals.
+
+gcc/
+	* config/aarch64/aarch64.h (aarch64_frame::locals_offset): Rename to...
+	(aarch64_frame::bytes_above_locals): ...this.
+	* config/aarch64/aarch64.c (aarch64_layout_frame)
+	(aarch64_initial_elimination_offset): Update accordingly.
+---
+ gcc/config/aarch64/aarch64.c | 9 +++++----
+ gcc/config/aarch64/aarch64.h | 6 +++---
+ 2 files changed, 8 insertions(+), 7 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
+index 25cf10cc4b9..dcaf491af42 100644
+--- a/gcc/config/aarch64/aarch64.c
++++ b/gcc/config/aarch64/aarch64.c
+@@ -4759,7 +4759,8 @@ aarch64_layout_frame (void)
+     = (cfun->machine->frame.hard_fp_offset
+        + cfun->machine->frame.bytes_below_hard_fp);
+ 
+-  cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
++  cfun->machine->frame.bytes_above_locals
++    = cfun->machine->frame.saved_varargs_size;
+ 
+   cfun->machine->frame.initial_adjust = 0;
+   cfun->machine->frame.final_adjust = 0;
+@@ -8566,14 +8567,14 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to)
+ 
+       if (from == FRAME_POINTER_REGNUM)
+ 	return cfun->machine->frame.hard_fp_offset
+-	       - cfun->machine->frame.locals_offset;
++	       - cfun->machine->frame.bytes_above_locals;
+     }
+ 
+   if (to == STACK_POINTER_REGNUM)
+     {
+       if (from == FRAME_POINTER_REGNUM)
+-	  return cfun->machine->frame.frame_size
+-		 - cfun->machine->frame.locals_offset;
++	return cfun->machine->frame.frame_size
++	       - cfun->machine->frame.bytes_above_locals;
+     }
+ 
+   return cfun->machine->frame.frame_size;
+diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
+index 95831637ba7..a079a88b4f4 100644
+--- a/gcc/config/aarch64/aarch64.h
++++ b/gcc/config/aarch64/aarch64.h
+@@ -719,10 +719,10 @@ struct GTY (()) aarch64_frame
+      always a multiple of STACK_BOUNDARY.  */
+   poly_int64 bytes_below_hard_fp;
+ 
+-  /* Offset from the base of the frame (incomming SP) to the
+-     top of the locals area.  This value is always a multiple of
++  /* The number of bytes between the top of the locals area and the top
++     of the frame (the incomming SP).  This value is always a multiple of
+      STACK_BOUNDARY.  */
+-  poly_int64 locals_offset;
++  poly_int64 bytes_above_locals;
+ 
+   /* Offset from the base of the frame (incomming SP) to the
+      hard_frame_pointer.  This value is always a multiple of
+-- 
+2.34.1
+
+
+From 16016465ff28a75f5e0540cbaeb4eb102fdc3230 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 27 Jun 2023 11:28:11 +0100
+Subject: [PATCH 04/10] aarch64: Rename hard_fp_offset to bytes_above_hard_fp
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Similarly to the previous locals_offset patch, hard_fp_offset
+was described as:
+
+  /* Offset from the base of the frame (incomming SP) to the
+     hard_frame_pointer.  This value is always a multiple of
+     STACK_BOUNDARY.  */
+  poly_int64 hard_fp_offset;
+
+which again took an “upside-down” view: higher offsets meant lower
+addresses.  This patch renames the field to bytes_above_hard_fp instead.
+
+gcc/
+	* config/aarch64/aarch64.h (aarch64_frame::hard_fp_offset): Rename
+	to...
+	(aarch64_frame::bytes_above_hard_fp): ...this.
+	* config/aarch64/aarch64.c (aarch64_layout_frame)
+	(aarch64_expand_prologue): Update accordingly.
+	(aarch64_initial_elimination_offset): Likewise.
+---
+ gcc/config/aarch64/aarch64.c | 21 +++++++++++----------
+ gcc/config/aarch64/aarch64.h |  6 +++---
+ 2 files changed, 14 insertions(+), 13 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
+index dcaf491af42..2681e0c2bb9 100644
+--- a/gcc/config/aarch64/aarch64.c
++++ b/gcc/config/aarch64/aarch64.c
+@@ -4747,7 +4747,7 @@ aarch64_layout_frame (void)
+   HOST_WIDE_INT varargs_and_saved_regs_size
+     = offset + cfun->machine->frame.saved_varargs_size;
+ 
+-  cfun->machine->frame.hard_fp_offset
++  cfun->machine->frame.bytes_above_hard_fp
+     = aligned_upper_bound (varargs_and_saved_regs_size
+ 			   + get_frame_size (),
+ 			   STACK_BOUNDARY / BITS_PER_UNIT);
+@@ -4756,7 +4756,7 @@ aarch64_layout_frame (void)
+   gcc_assert (multiple_p (cfun->machine->frame.bytes_below_hard_fp,
+ 			  STACK_BOUNDARY / BITS_PER_UNIT));
+   cfun->machine->frame.frame_size
+-    = (cfun->machine->frame.hard_fp_offset
++    = (cfun->machine->frame.bytes_above_hard_fp
+        + cfun->machine->frame.bytes_below_hard_fp);
+ 
+   cfun->machine->frame.bytes_above_locals
+@@ -4788,7 +4788,7 @@ aarch64_layout_frame (void)
+   else if (known_lt (cfun->machine->frame.bytes_below_hard_fp
+ 		     + cfun->machine->frame.saved_regs_size, 512)
+ 	   && !(cfun->calls_alloca
+-		&& known_lt (cfun->machine->frame.hard_fp_offset,
++		&& known_lt (cfun->machine->frame.bytes_above_hard_fp,
+ 			     max_push_offset)))
+     {
+       /* Frame with small area below the saved registers:
+@@ -4797,14 +4797,14 @@ aarch64_layout_frame (void)
+ 	 stp reg3, reg4, [sp, bytes_below_hard_fp + 16]  */
+       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
+       cfun->machine->frame.callee_offset
+-	= cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
++	= cfun->machine->frame.frame_size - cfun->machine->frame.bytes_above_hard_fp;
+     }
+-  else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
++  else if (cfun->machine->frame.bytes_above_hard_fp.is_constant (&const_fp_offset)
+ 	   && const_fp_offset < max_push_offset)
+     {
+       /* Frame with large area below the saved registers, but with a
+ 	 small area above:
+-	 stp reg1, reg2, [sp, -hard_fp_offset]!
++	 stp reg1, reg2, [sp, -bytes_above_hard_fp]!
+ 	 stp reg3, reg4, [sp, 16]
+ 	 sub sp, sp, bytes_below_hard_fp  */
+       cfun->machine->frame.callee_adjust = const_fp_offset;
+@@ -4814,12 +4814,13 @@ aarch64_layout_frame (void)
+   else
+     {
+       /* General case:
+-	 sub sp, sp, hard_fp_offset
++	 sub sp, sp, bytes_above_hard_fp
+ 	 stp x29, x30, [sp, 0]
+ 	 add x29, sp, 0
+ 	 stp reg3, reg4, [sp, 16]
+ 	 sub sp, sp, bytes_below_hard_fp  */
+-      cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
++      cfun->machine->frame.initial_adjust
++	= cfun->machine->frame.bytes_above_hard_fp;
+       cfun->machine->frame.final_adjust
+ 	= cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
+     }
+@@ -8563,10 +8564,10 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to)
+   if (to == HARD_FRAME_POINTER_REGNUM)
+     {
+       if (from == ARG_POINTER_REGNUM)
+-	return cfun->machine->frame.hard_fp_offset;
++	return cfun->machine->frame.bytes_above_hard_fp;
+ 
+       if (from == FRAME_POINTER_REGNUM)
+-	return cfun->machine->frame.hard_fp_offset
++	return cfun->machine->frame.bytes_above_hard_fp
+ 	       - cfun->machine->frame.bytes_above_locals;
+     }
+ 
+diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
+index a079a88b4f4..eab6da84a02 100644
+--- a/gcc/config/aarch64/aarch64.h
++++ b/gcc/config/aarch64/aarch64.h
+@@ -724,10 +724,10 @@ struct GTY (()) aarch64_frame
+      STACK_BOUNDARY.  */
+   poly_int64 bytes_above_locals;
+ 
+-  /* Offset from the base of the frame (incomming SP) to the
+-     hard_frame_pointer.  This value is always a multiple of
++  /* The number of bytes between the hard_frame_pointer and the top of
++     the frame (the incomming SP).  This value is always a multiple of
+      STACK_BOUNDARY.  */
+-  poly_int64 hard_fp_offset;
++  poly_int64 bytes_above_hard_fp;
+ 
+   /* The size of the frame.  This value is the offset from base of the
+      frame (incomming SP) to the stack_pointer.  This value is always
+-- 
+2.34.1
+
+
+From eb2271eb6bb68ec3c9aa9ae4746ea1ee5f18874a Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Thu, 22 Jun 2023 22:26:30 +0100
+Subject: [PATCH 05/10] aarch64: Tweak frame_size comment
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This patch fixes another case in which a value was described with
+an “upside-down” view.
+
+gcc/
+	* config/aarch64/aarch64.h (aarch64_frame::frame_size): Tweak comment.
+---
+ gcc/config/aarch64/aarch64.h | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
+index eab6da84a02..7c4b65ec55b 100644
+--- a/gcc/config/aarch64/aarch64.h
++++ b/gcc/config/aarch64/aarch64.h
+@@ -729,8 +729,8 @@ struct GTY (()) aarch64_frame
+      STACK_BOUNDARY.  */
+   poly_int64 bytes_above_hard_fp;
+ 
+-  /* The size of the frame.  This value is the offset from base of the
+-     frame (incomming SP) to the stack_pointer.  This value is always
++  /* The size of the frame, i.e. the number of bytes between the bottom
++     of the outgoing arguments and the incoming SP.  This value is always
+      a multiple of STACK_BOUNDARY.  */
+   poly_int64 frame_size;
+ 
+-- 
+2.34.1
+
+
+From cfed3b87e9351edff1568ade4ef666edc9887639 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 15 Aug 2023 19:05:30 +0100
+Subject: [PATCH 06/10] Backport check-function-bodies support
+
+---
+ gcc/testsuite/lib/scanasm.exp | 191 ++++++++++++++++++++++++++++++++++
+ 1 file changed, 191 insertions(+)
+
+diff --git a/gcc/testsuite/lib/scanasm.exp b/gcc/testsuite/lib/scanasm.exp
+index 35ccbc86fc0..c9af27bf47a 100644
+--- a/gcc/testsuite/lib/scanasm.exp
++++ b/gcc/testsuite/lib/scanasm.exp
+@@ -546,3 +546,194 @@ proc scan-lto-assembler { args } {
+     verbose "output_file: $output_file"
+     dg-scan "scan-lto-assembler" 1 $testcase $output_file $args
+ }
++
++# Read assembly file FILENAME and store a mapping from function names
++# to function bodies in array RESULT.  FILENAME has already been uploaded
++# locally where necessary and is known to exist.
++
++proc parse_function_bodies { filename result } {
++    upvar $result up_result
++
++    # Regexp for the start of a function definition (name in \1).
++    set label {^([a-zA-Z_]\S+):$}
++
++    # Regexp for the end of a function definition.
++    set terminator {^\s*\.size}
++
++    # Regexp for lines that aren't interesting.
++    set fluff {^\s*(?:\.|//|@|$)}
++
++    set fd [open $filename r]
++    set in_function 0
++    while { [gets $fd line] >= 0 } {
++	if { [regexp $label $line dummy function_name] } {
++	    set in_function 1
++	    set function_body ""
++	} elseif { $in_function } {
++	    if { [regexp $terminator $line] } {
++		set up_result($function_name) $function_body
++		set in_function 0
++	    } elseif { ![regexp $fluff $line] } {
++		append function_body $line "\n"
++	    }
++	}
++    }
++    close $fd
++}
++
++# FUNCTIONS is an array that maps function names to function bodies.
++# Return true if it contains a definition of function NAME and if
++# that definition matches BODY_REGEXP.
++
++proc check_function_body { functions name body_regexp } {
++    upvar $functions up_functions
++
++    if { ![info exists up_functions($name)] } {
++	return 0
++    }
++    set fn_res [regexp "^$body_regexp\$" $up_functions($name)]
++    if { !$fn_res } {
++      verbose -log "body: $body_regexp"
++      verbose -log "against: $up_functions($name)"
++    }
++    return $fn_res
++}
++
++# Check the implementations of functions against expected output.  Used as:
++#
++# { dg-final { check-function-bodies PREFIX TERMINATOR[ OPTION[ SELECTOR]] } }
++#
++# See sourcebuild.texi for details.
++
++proc check-function-bodies { args } {
++    if { [llength $args] < 2 } {
++	error "too few arguments to check-function-bodies"
++    }
++    if { [llength $args] > 4 } {
++	error "too many arguments to check-function-bodies"
++    }
++
++    if { [llength $args] >= 3 } {
++	set required_flags [lindex $args 2]
++
++	upvar 2 dg-extra-tool-flags extra_tool_flags
++	set flags $extra_tool_flags
++
++	global torture_current_flags
++	if { [info exists torture_current_flags] } {
++	    append flags " " $torture_current_flags
++	}
++	foreach required_flag $required_flags {
++	    switch -- $required_flag {
++		target -
++		xfail {
++		    error "misplaced $required_flag in check-function-bodies"
++		}
++	    }
++	}
++	foreach required_flag $required_flags {
++	    if { ![regexp " $required_flag " $flags] } {
++		return
++	    }
++	}
++    }
++
++    set xfail_all 0
++    if { [llength $args] >= 4 } {
++	switch [dg-process-target [lindex $args 3]] {
++	    "S" { }
++	    "N" { return }
++	    "F" { set xfail_all 1 }
++	    "P" { }
++	}
++    }
++
++    set testcase [testname-for-summary]
++    # The name might include a list of options; extract the file name.
++    set filename [lindex $testcase 0]
++
++    global srcdir
++    set input_filename "$srcdir/$filename"
++    set output_filename "[file rootname [file tail $filename]].s"
++
++    set prefix [lindex $args 0]
++    set prefix_len [string length $prefix]
++    set terminator [lindex $args 1]
++    if { [string equal $terminator ""] } {
++	set terminator "*/"
++    }
++    set terminator_len [string length $terminator]
++
++    set have_bodies 0
++    if { [is_remote host] } {
++	remote_upload host "$filename"
++    }
++    if { [file exists $output_filename] } {
++	parse_function_bodies $output_filename functions
++	set have_bodies 1
++    } else {
++	verbose -log "$testcase: output file does not exist"
++    }
++
++    set count 0
++    set function_regexp ""
++    set label {^(\S+):$}
++
++    set lineno 1
++    set fd [open $input_filename r]
++    set in_function 0
++    while { [gets $fd line] >= 0 } {
++	if { [string equal -length $prefix_len $line $prefix] } {
++	    set line [string trim [string range $line $prefix_len end]]
++	    if { !$in_function } {
++		if { [regexp "^(.*?\\S)\\s+{(.*)}\$" $line dummy \
++			  line selector] } {
++		    set selector [dg-process-target $selector]
++		} else {
++		    set selector "P"
++		}
++		if { ![regexp $label $line dummy function_name] } {
++		    close $fd
++		    error "check-function-bodies: line $lineno does not have a function label"
++		}
++		set in_function 1
++		set function_regexp ""
++	    } elseif { [string equal $line "("] } {
++		append function_regexp "(?:"
++	    } elseif { [string equal $line "|"] } {
++		append function_regexp "|"
++	    } elseif { [string equal $line ")"] } {
++		append function_regexp ")"
++	    } elseif { [string equal $line "..."] } {
++		append function_regexp ".*"
++	    } else {
++		append function_regexp "\t" $line "\n"
++	    }
++	} elseif { [string equal -length $terminator_len $line $terminator] } {
++	    if { ![string equal $selector "N"] } {
++		if { $xfail_all || [string equal $selector "F"] } {
++		    setup_xfail "*-*-*"
++		}
++		set testname "$testcase check-function-bodies $function_name"
++		if { !$have_bodies } {
++		    unresolved $testname
++		} elseif { [check_function_body functions $function_name \
++				$function_regexp] } {
++		    pass $testname
++		} else {
++		    fail $testname
++		}
++	    }
++	    set in_function 0
++	    incr count
++	}
++	incr lineno
++    }
++    close $fd
++    if { $in_function } {
++	error "check-function-bodies: missing \"$terminator\""
++    }
++    if { $count == 0 } {
++	error "check-function-bodies: no matches found"
++    }
++}
+-- 
+2.34.1
+
+
+From 4dd8925d95d3d6d89779b494b5f4cfadcf9fa96e Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 27 Jun 2023 15:11:44 +0100
+Subject: [PATCH 07/10] aarch64: Tweak stack clash boundary condition
+
+The AArch64 ABI says that, when stack clash protection is used,
+there can be a maximum of 1KiB of unprobed space at sp on entry
+to a function.  Therefore, we need to probe when allocating
+>= guard_size - 1KiB of data (>= rather than >).  This is what
+GCC does.
+
+If an allocation is exactly guard_size bytes, it is enough to allocate
+those bytes and probe once at offset 1024.  It isn't possible to use a
+single probe at any other offset: higher would complicate later code,
+by leaving more unprobed space than usual, while lower would risk
+leaving an entire page unprobed.  For simplicity, the code probes all
+allocations at offset 1024.
+
+Some register saves also act as probes.  If we need to allocate
+more space below the last such register save probe, we need to
+probe the allocation if it is > 1KiB.  Again, this allocation is
+then sometimes (but not always) probed at offset 1024.  This sort of
+allocation is currently only used for outgoing arguments, which are
+rarely this big.
+
+However, the code also probed if this final outgoing-arguments
+allocation was == 1KiB, rather than just > 1KiB.  This isn't
+necessary, since the register save then probes at offset 1024
+as required.  Continuing to probe allocations of exactly 1KiB
+would complicate later patches.
+
+gcc/
+	* config/aarch64/aarch64.c (aarch64_allocate_and_probe_stack_space):
+	Don't probe final allocations that are exactly 1KiB in size (after
+	unprobed space above the final allocation has been deducted).
+
+gcc/testsuite/
+	* gcc.target/aarch64/stack-check-prologue-17.c: New test.
+---
+ gcc/config/aarch64/aarch64.c                  |  6 +-
+ .../aarch64/stack-check-prologue-17.c         | 55 +++++++++++++++++++
+ 2 files changed, 60 insertions(+), 1 deletion(-)
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
+
+diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
+index 2681e0c2bb9..4c9e11cd7cf 100644
+--- a/gcc/config/aarch64/aarch64.c
++++ b/gcc/config/aarch64/aarch64.c
+@@ -5506,6 +5506,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+   HOST_WIDE_INT guard_size
+     = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
+   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
++  HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT;
++  gcc_assert (multiple_p (poly_size, byte_sp_alignment));
+   /* When doing the final adjustment for the outgoing argument size we can't
+      assume that LR was saved at position 0.  So subtract it's offset from the
+      ABI safe buffer so that we don't accidentally allow an adjustment that
+@@ -5513,7 +5515,9 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+      probing.  */
+   HOST_WIDE_INT min_probe_threshold
+     = final_adjustment_p
+-      ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
++      ? (guard_used_by_caller
++	 + byte_sp_alignment
++	 - cfun->machine->frame.reg_offset[LR_REGNUM])
+       : guard_size - guard_used_by_caller;
+ 
+   poly_int64 frame_size = cfun->machine->frame.frame_size;
+diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
+new file mode 100644
+index 00000000000..0d8a25d73a2
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
+@@ -0,0 +1,55 @@
++/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12" } */
++/* { dg-final { check-function-bodies "**" "" } } */
++
++void f(int, ...);
++void g();
++
++/*
++** test1:
++**	...
++**	str	x30, \[sp\]
++**	sub	sp, sp, #1024
++**	cbnz	w0, .*
++**	bl	g
++**	...
++*/
++int test1(int z) {
++  __uint128_t x = 0;
++  int y[0x400];
++  if (z)
++    {
++      f(0, 0, 0, 0, 0, 0, 0, &y,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
++    }
++  g();
++  return 1;
++}
++
++/*
++** test2:
++**	...
++**	str	x30, \[sp\]
++**	sub	sp, sp, #1040
++**	str	xzr, \[sp\]
++**	cbnz	w0, .*
++**	bl	g
++**	...
++*/
++int test2(int z) {
++  __uint128_t x = 0;
++  int y[0x400];
++  if (z)
++    {
++      f(0, 0, 0, 0, 0, 0, 0, &y,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x);
++    }
++  g();
++  return 1;
++}
+-- 
+2.34.1
+
+
+From 12517baf6c88447e3bda3a459ac4c29d61f84e6c Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 27 Jun 2023 15:12:55 +0100
+Subject: [PATCH 08/10] aarch64: Put LR save probe in first 16 bytes
+
+-fstack-clash-protection uses the save of LR as a probe for the next
+allocation.  The next allocation could be:
+
+* another part of the static frame, e.g. when allocating SVE save slots
+  or outgoing arguments
+
+* an alloca in the same function
+
+* an allocation made by a callee function
+
+However, when -fomit-frame-pointer is used, the LR save slot is placed
+above the other GPR save slots.  It could therefore be up to 80 bytes
+above the base of the GPR save area (which is also the hard fp address).
+
+aarch64_allocate_and_probe_stack_space took this into account when
+deciding how much subsequent space could be allocated without needing
+a probe.  However, it interacted badly with:
+
+      /* If doing a small final adjustment, we always probe at offset 0.
+	 This is done to avoid issues when LR is not at position 0 or when
+	 the final adjustment is smaller than the probing offset.  */
+      else if (final_adjustment_p && rounded_size == 0)
+	residual_probe_offset = 0;
+
+which forces any allocation that is smaller than the guard page size
+to be probed at offset 0 rather than the usual offset 1024.  It was
+therefore possible to construct cases in which we had:
+
+* a probe using LR at SP + 80 bytes (or some other value >= 16)
+* an allocation of the guard page size - 16 bytes
+* a probe at SP + 0
+
+which allocates guard page size + 64 consecutive unprobed bytes.
+
+This patch requires the LR probe to be in the first 16 bytes of the
+save area when stack clash protection is active.  Doing it
+unconditionally would cause code-quality regressions.
+
+gcc/
+	* config/aarch64/aarch64.c (aarch64_layout_frame): Ensure that
+	the LR save slot is in the first 16 bytes of the register save area.
+	(aarch64_allocate_and_probe_stack_space): Remove workaround for
+	when LR was not in the first 16 bytes.
+
+gcc/testsuite/
+	* gcc.target/aarch64/stack-check-prologue-18.c: New test.
+---
+ gcc/config/aarch64/aarch64.c                  |  50 +++++----
+ .../aarch64/stack-check-prologue-18.c         | 100 ++++++++++++++++++
+ 2 files changed, 127 insertions(+), 23 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
+
+diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
+index 4c9e11cd7cf..1e8467fdd03 100644
+--- a/gcc/config/aarch64/aarch64.c
++++ b/gcc/config/aarch64/aarch64.c
+@@ -4686,15 +4686,31 @@ aarch64_layout_frame (void)
+ 
+   cfun->machine->frame.bytes_below_hard_fp = crtl->outgoing_args_size;
+ 
++#define ALLOCATE_GPR_SLOT(REGNO)					\
++  do									\
++    {									\
++      cfun->machine->frame.reg_offset[REGNO] = offset;			\
++      if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)		\
++	cfun->machine->frame.wb_candidate1 = (REGNO);			\
++      else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)	\
++	cfun->machine->frame.wb_candidate2 = (REGNO);			\
++      offset += UNITS_PER_WORD;						\
++    }									\
++  while (0)
++
+   if (cfun->machine->frame.emit_frame_chain)
+     {
+       /* FP and LR are placed in the linkage record.  */
+-      cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
+-      cfun->machine->frame.wb_candidate1 = R29_REGNUM;
+-      cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
+-      cfun->machine->frame.wb_candidate2 = R30_REGNUM;
+-      offset = 2 * UNITS_PER_WORD;
++      ALLOCATE_GPR_SLOT (R29_REGNUM);
++      ALLOCATE_GPR_SLOT (R30_REGNUM);
+     }
++  else if (flag_stack_clash_protection
++	   && cfun->machine->frame.reg_offset[R30_REGNUM] == SLOT_REQUIRED)
++    /* Put the LR save slot first, since it makes a good choice of probe
++       for stack clash purposes.  The idea is that the link register usually
++       has to be saved before a call anyway, and so we lose little by
++       stopping it from being individually shrink-wrapped.  */
++    ALLOCATE_GPR_SLOT (R30_REGNUM);
+ 
+   /* With stack-clash, LR must be saved in non-leaf functions.  */
+   gcc_assert (crtl->is_leaf
+@@ -4704,14 +4720,9 @@ aarch64_layout_frame (void)
+   /* Now assign stack slots for them.  */
+   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
+     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
+-      {
+-	cfun->machine->frame.reg_offset[regno] = offset;
+-	if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
+-	  cfun->machine->frame.wb_candidate1 = regno;
+-	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
+-	  cfun->machine->frame.wb_candidate2 = regno;
+-	offset += UNITS_PER_WORD;
+-      }
++      ALLOCATE_GPR_SLOT (regno);
++
++#undef ALLOCATE_GPR_SLOT
+ 
+   HOST_WIDE_INT max_int_offset = offset;
+   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
+@@ -5508,16 +5519,9 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
+   HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT;
+   gcc_assert (multiple_p (poly_size, byte_sp_alignment));
+-  /* When doing the final adjustment for the outgoing argument size we can't
+-     assume that LR was saved at position 0.  So subtract it's offset from the
+-     ABI safe buffer so that we don't accidentally allow an adjustment that
+-     would result in an allocation larger than the ABI buffer without
+-     probing.  */
+   HOST_WIDE_INT min_probe_threshold
+     = final_adjustment_p
+-      ? (guard_used_by_caller
+-	 + byte_sp_alignment
+-	 - cfun->machine->frame.reg_offset[LR_REGNUM])
++      ? guard_used_by_caller + byte_sp_alignment
+       : guard_size - guard_used_by_caller;
+ 
+   poly_int64 frame_size = cfun->machine->frame.frame_size;
+@@ -5697,8 +5701,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+       if (final_adjustment_p && rounded_size != 0)
+ 	min_probe_threshold = 0;
+       /* If doing a small final adjustment, we always probe at offset 0.
+-	 This is done to avoid issues when LR is not at position 0 or when
+-	 the final adjustment is smaller than the probing offset.  */
++	 This is done to avoid issues when the final adjustment is smaller
++	 than the probing offset.  */
+       else if (final_adjustment_p && rounded_size == 0)
+ 	residual_probe_offset = 0;
+ 
+diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
+new file mode 100644
+index 00000000000..82447d20fff
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
+@@ -0,0 +1,100 @@
++/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12" } */
++/* { dg-final { check-function-bodies "**" "" } } */
++
++void f(int, ...);
++void g();
++
++/*
++** test1:
++**	...
++**	str	x30, \[sp\]
++**	sub	sp, sp, #4064
++**	str	xzr, \[sp\]
++**	cbnz	w0, .*
++**	bl	g
++**	...
++**	str	x26, \[sp, #?4128\]
++**	...
++*/
++int test1(int z) {
++  __uint128_t x = 0;
++  int y[0x400];
++  if (z)
++    {
++      asm volatile ("" :::
++		    "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
++      f(0, 0, 0, 0, 0, 0, 0, &y,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x);
++    }
++  g();
++  return 1;
++}
++
++/*
++** test2:
++**	...
++**	str	x30, \[sp\]
++**	sub	sp, sp, #1040
++**	str	xzr, \[sp\]
++**	cbnz	w0, .*
++**	bl	g
++**	...
++*/
++int test2(int z) {
++  __uint128_t x = 0;
++  int y[0x400];
++  if (z)
++    {
++      asm volatile ("" :::
++		    "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
++      f(0, 0, 0, 0, 0, 0, 0, &y,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x);
++    }
++  g();
++  return 1;
++}
++
++/*
++** test3:
++**	...
++**	str	x30, \[sp\]
++**	sub	sp, sp, #1024
++**	cbnz	w0, .*
++**	bl	g
++**	...
++*/
++int test3(int z) {
++  __uint128_t x = 0;
++  int y[0x400];
++  if (z)
++    {
++      asm volatile ("" :::
++		    "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
++      f(0, 0, 0, 0, 0, 0, 0, &y,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
++    }
++  g();
++  return 1;
++}
+-- 
+2.34.1
+
+
+From f2684e63652bb251d22c79e40081c646df1f36b6 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 8 Aug 2023 01:57:26 +0100
+Subject: [PATCH 09/10] aarch64: Simplify probe of final frame allocation
+
+Previous patches ensured that the final frame allocation only needs
+a probe when the size is strictly greater than 1KiB.  It's therefore
+safe to use the normal 1024 probe offset in all cases.
+
+The main motivation for doing this is to simplify the code and
+reduce the number of special cases.
+
+gcc/
+	* config/aarch64/aarch64.c (aarch64_allocate_and_probe_stack_space):
+	Always probe the residual allocation at offset 1024, asserting
+	that that is in range.
+
+gcc/testsuite/
+	* gcc.target/aarch64/stack-check-prologue-17.c: Expect the probe
+	to be at offset 1024 rather than offset 0.
+	* gcc.target/aarch64/stack-check-prologue-18.c: Likewise.
+---
+ gcc/config/aarch64/aarch64.c                         | 12 ++++--------
+ .../gcc.target/aarch64/stack-check-prologue-17.c     |  2 +-
+ .../gcc.target/aarch64/stack-check-prologue-18.c     |  7 +++++--
+ 3 files changed, 10 insertions(+), 11 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
+index 1e8467fdd03..705f719a2ea 100644
+--- a/gcc/config/aarch64/aarch64.c
++++ b/gcc/config/aarch64/aarch64.c
+@@ -5695,16 +5695,12 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+      are still safe.  */
+   if (residual)
+     {
+-      HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
++      gcc_assert (guard_used_by_caller + byte_sp_alignment <= size);
++
+       /* If we're doing final adjustments, and we've done any full page
+ 	 allocations then any residual needs to be probed.  */
+       if (final_adjustment_p && rounded_size != 0)
+ 	min_probe_threshold = 0;
+-      /* If doing a small final adjustment, we always probe at offset 0.
+-	 This is done to avoid issues when the final adjustment is smaller
+-	 than the probing offset.  */
+-      else if (final_adjustment_p && rounded_size == 0)
+-	residual_probe_offset = 0;
+ 
+       aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
+       if (residual >= min_probe_threshold)
+@@ -5715,8 +5711,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+ 		     HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
+ 		     "\n", residual);
+ 
+-	    emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+-					     residual_probe_offset));
++	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
++					   guard_used_by_caller));
+ 	  emit_insn (gen_blockage ());
+ 	}
+     }
+diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
+index 0d8a25d73a2..f0ec1389771 100644
+--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
+@@ -33,7 +33,7 @@ int test1(int z) {
+ **	...
+ **	str	x30, \[sp\]
+ **	sub	sp, sp, #1040
+-**	str	xzr, \[sp\]
++**	str	xzr, \[sp, #?1024\]
+ **	cbnz	w0, .*
+ **	bl	g
+ **	...
+diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
+index 82447d20fff..71d33ba34e9 100644
+--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
+@@ -8,8 +8,9 @@ void g();
+ ** test1:
+ **	...
+ **	str	x30, \[sp\]
++**	...
+ **	sub	sp, sp, #4064
+-**	str	xzr, \[sp\]
++**	str	xzr, \[sp, #?1024\]
+ **	cbnz	w0, .*
+ **	bl	g
+ **	...
+@@ -49,8 +50,9 @@ int test1(int z) {
+ ** test2:
+ **	...
+ **	str	x30, \[sp\]
++**	...
+ **	sub	sp, sp, #1040
+-**	str	xzr, \[sp\]
++**	str	xzr, \[sp, #?1024\]
+ **	cbnz	w0, .*
+ **	bl	g
+ **	...
+@@ -77,6 +79,7 @@ int test2(int z) {
+ ** test3:
+ **	...
+ **	str	x30, \[sp\]
++**	...
+ **	sub	sp, sp, #1024
+ **	cbnz	w0, .*
+ **	bl	g
+-- 
+2.34.1
+
+
+From bf3eeaa0182a92987570d9c787bd45079eebf528 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Thu, 15 Jun 2023 19:16:52 +0100
+Subject: [PATCH 10/10] aarch64: Make stack smash canary protect saved
+ registers
+
+AArch64 normally puts the saved registers near the bottom of the frame,
+immediately above any dynamic allocations.  But this means that a
+stack-smash attack on those dynamic allocations could overwrite the
+saved registers without needing to reach as far as the stack smash
+canary.
+
+The same thing could also happen for variable-sized arguments that are
+passed by value, since those are allocated before a call and popped on
+return.
+
+This patch avoids that by putting the locals (and thus the canary) below
+the saved registers when stack smash protection is active.
+
+The patch fixes CVE-2023-4039.
+
+gcc/
+	* config/aarch64/aarch64.c (aarch64_save_regs_above_locals_p):
+	New function.
+	(aarch64_layout_frame): Use it to decide whether locals should
+	go above or below the saved registers.
+	(aarch64_expand_prologue): Update stack layout comment.
+	Emit a stack tie after the final adjustment.
+
+gcc/testsuite/
+	* gcc.target/aarch64/stack-protector-8.c: New test.
+	* gcc.target/aarch64/stack-protector-9.c: Likewise.
+---
+ gcc/config/aarch64/aarch64.c                  | 46 +++++++++++++--
+ .../gcc.target/aarch64/stack-protector-8.c    | 58 +++++++++++++++++++
+ .../gcc.target/aarch64/stack-protector-9.c    | 33 +++++++++++
+ 3 files changed, 133 insertions(+), 4 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
+
+diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
+index 705f719a2ea..3d094214fac 100644
+--- a/gcc/config/aarch64/aarch64.c
++++ b/gcc/config/aarch64/aarch64.c
+@@ -4622,6 +4622,20 @@ aarch64_needs_frame_chain (void)
+   return aarch64_use_frame_pointer;
+ }
+ 
++/* Return true if the current function should save registers above
++   the locals area, rather than below it.  */
++
++static bool
++aarch64_save_regs_above_locals_p ()
++{
++  /* When using stack smash protection, make sure that the canary slot
++     comes between the locals and the saved registers.  Otherwise,
++     it would be possible for a carefully sized smash attack to change
++     the saved registers (particularly LR and FP) without reaching the
++     canary.  */
++  return crtl->stack_protect_guard;
++}
++
+ /* Mark the registers that need to be saved by the callee and calculate
+    the size of the callee-saved registers area and frame record (both FP
+    and LR may be omitted).  */
+@@ -4686,6 +4700,16 @@ aarch64_layout_frame (void)
+ 
+   cfun->machine->frame.bytes_below_hard_fp = crtl->outgoing_args_size;
+ 
++  bool regs_at_top_p = aarch64_save_regs_above_locals_p ();
++
++  if (regs_at_top_p)
++    {
++      cfun->machine->frame.bytes_below_hard_fp += get_frame_size ();
++      cfun->machine->frame.bytes_below_hard_fp
++	= aligned_upper_bound (cfun->machine->frame.bytes_below_hard_fp,
++			       STACK_BOUNDARY / BITS_PER_UNIT);
++    }
++
+ #define ALLOCATE_GPR_SLOT(REGNO)					\
+   do									\
+     {									\
+@@ -4758,9 +4782,11 @@ aarch64_layout_frame (void)
+   HOST_WIDE_INT varargs_and_saved_regs_size
+     = offset + cfun->machine->frame.saved_varargs_size;
+ 
++  cfun->machine->frame.bytes_above_hard_fp = varargs_and_saved_regs_size;
++  if (!regs_at_top_p)
++    cfun->machine->frame.bytes_above_hard_fp += get_frame_size ();
+   cfun->machine->frame.bytes_above_hard_fp
+-    = aligned_upper_bound (varargs_and_saved_regs_size
+-			   + get_frame_size (),
++    = aligned_upper_bound (cfun->machine->frame.bytes_above_hard_fp,
+ 			   STACK_BOUNDARY / BITS_PER_UNIT);
+ 
+   /* Both these values are already aligned.  */
+@@ -4772,6 +4798,9 @@ aarch64_layout_frame (void)
+ 
+   cfun->machine->frame.bytes_above_locals
+     = cfun->machine->frame.saved_varargs_size;
++  if (regs_at_top_p)
++    cfun->machine->frame.bytes_above_locals
++      += cfun->machine->frame.saved_regs_size;
+ 
+   cfun->machine->frame.initial_adjust = 0;
+   cfun->machine->frame.final_adjust = 0;
+@@ -5764,10 +5793,10 @@ aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
+ 	|  for register varargs         |
+ 	|                               |
+ 	+-------------------------------+
+-	|  local variables              | <-- frame_pointer_rtx
++	|  local variables (1)          | <-- frame_pointer_rtx
+ 	|                               |
+ 	+-------------------------------+
+-	|  padding                      | \
++	|  padding (1)                  | \
+ 	+-------------------------------+  |
+ 	|  callee-saved registers       |  | frame.saved_regs_size
+ 	+-------------------------------+  |
+@@ -5775,6 +5804,10 @@ aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
+ 	+-------------------------------+  |
+ 	|  FP'                          | / <- hard_frame_pointer_rtx (aligned)
+         +-------------------------------+
++	|  local variables (2)          |
++	+-------------------------------+
++	|  padding (2)                  |
++	+-------------------------------+
+ 	|  dynamic allocation           |
+ 	+-------------------------------+
+ 	|  padding                      |
+@@ -5784,6 +5817,9 @@ aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
+ 	+-------------------------------+
+ 	|                               | <-- stack_pointer_rtx (aligned)
+ 
++   The regions marked (1) and (2) are mutually exclusive.  (2) is used
++   when aarch64_save_regs_above_locals_p is true.
++
+    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
+    but leave frame_pointer_rtx and hard_frame_pointer_rtx
+    unchanged.
+@@ -5937,6 +5973,8 @@ aarch64_expand_prologue (void)
+      that is assumed by the called.  */
+   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
+ 					  !frame_pointer_needed, true);
++  if (emit_frame_chain && maybe_ne (final_adjust, 0))
++    emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
+ }
+ 
+ /* Return TRUE if we can use a simple_return insn.
+diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
+new file mode 100644
+index 00000000000..c5e7deef6c1
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
+@@ -0,0 +1,58 @@
++/* { dg-options " -O -fstack-protector-strong -mstack-protector-guard=sysreg -mstack-protector-guard-reg=tpidr2_el0 -mstack-protector-guard-offset=16" } */
++/* { dg-final { check-function-bodies "**" "" } } */
++
++void g(void *);
++
++/*
++** test1:
++**	sub	sp, sp, #288
++**	stp	x29, x30, \[sp, #?272\]
++**	add	x29, sp, #?272
++**	mrs	(x[0-9]+), tpidr2_el0
++**	ldr	(x[0-9]+), \[\1, #?16\]
++**	str	\2, \[sp, #?264\]
++**	mov	\2, *0
++**	add	x0, sp, #?8
++**	bl	g
++**	...
++**	mrs	.*
++**	...
++**	bne	.*
++**	...
++**	ldp	x29, x30, \[sp, #?272\]
++**	add	sp, sp, #?288
++**	ret
++**	bl	__stack_chk_fail
++*/
++int test1() {
++  int y[0x40];
++  g(y);
++  return 1;
++}
++
++/*
++** test2:
++**	stp	x29, x30, \[sp, #?-16\]!
++**	mov	x29, sp
++**	sub	sp, sp, #1040
++**	mrs	(x[0-9]+), tpidr2_el0
++**	ldr	(x[0-9]+), \[\1, #?16\]
++**	str	\2, \[sp, #?1032\]
++**	mov	\2, *0
++**	add	x0, sp, #?8
++**	bl	g
++**	...
++**	mrs	.*
++**	...
++**	bne	.*
++**	...
++**	add	sp, sp, #?1040
++**	ldp	x29, x30, \[sp\], #?16
++**	ret
++**	bl	__stack_chk_fail
++*/
++int test2() {
++  int y[0x100];
++  g(y);
++  return 1;
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
+new file mode 100644
+index 00000000000..58f322aa480
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
+@@ -0,0 +1,33 @@
++/* { dg-options "-O2 -mcpu=neoverse-v1 -fstack-protector-all" } */
++/* { dg-final { check-function-bodies "**" "" } } */
++
++/*
++** main:
++**	...
++**	stp	x29, x30, \[sp, #?-[0-9]+\]!
++**	...
++**	sub	sp, sp, #[0-9]+
++**	...
++**	str	x[0-9]+, \[x29, #?-8\]
++**	...
++*/
++int f(const char *);
++void g(void *);
++int main(int argc, char* argv[])
++{
++  int a;
++  int b;
++  char c[2+f(argv[1])];
++  int d[0x100];
++  char y;
++
++  y=42; a=4; b=10;
++  c[0] = 'h'; c[1] = '\0';
++
++  c[f(argv[2])] = '\0';
++
++  __builtin_printf("%d %d\n%s\n", a, b, c);
++  g(d);
++
++  return 0;
++}
+-- 
+2.34.1
+