diff mbox series

[mickledore] gcc: Fix -fstack-protector issue on aarch64

Message ID 20230912172439.2336327-1-ross.burton@arm.com
State New
Headers show
Series [mickledore] gcc: Fix -fstack-protector issue on aarch64 | expand

Commit Message

Ross Burton Sept. 12, 2023, 5:24 p.m. UTC
From: Ross Burton <ross.burton@arm.com>

This series of patches fixes deficiencies in GCC's -fstack-protector
implementation for AArch64 when using dynamically allocated stack space.
This is CVE-2023-4039.  See:

https://developer.arm.com/Arm%20Security%20Center/GCC%20Stack%20Protector%20Vulnerability%20AArch64
https://github.com/metaredteam/external-disclosures/security/advisories/GHSA-x7ch-h5rf-w2mf

for more details.

Signed-off-by: Ross Burton <ross.burton@arm.com>
---
 meta/recipes-devtools/gcc/gcc-12.3.inc        |    1 +
 .../gcc/gcc/CVE-2023-4039.patch               | 3093 +++++++++++++++++
 2 files changed, 3094 insertions(+)
 create mode 100644 meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch

Comments

Martin Jansa Sept. 14, 2023, 9:07 a.m. UTC | #1
FYI: one of LGE's proprietary components triggers an ICE with this applied. I'll
try to find a minimal reproducer later; this is just a heads-up for other people
who might hit the same:

error: unrecognizable insn:
 2923 | }
      | ^
(insn 416 286 290 17 (parallel [
            (set (mem/c:SI (plus:DI (reg/f:DI 29 x29)
                        (const_int -260 [0xfffffffffffffefc])) [1
redacted.pixel_format+0 S4 A32])
                (const_int 0 [0]))
            (set (mem/c:SI (plus:DI (reg/f:DI 29 x29)
                        (const_int -256 [0xffffffffffffff00])) [1
redacted.pixel_value+0 S4 A128])
                (reg/v:SI 22 x22 [orig:141 color ] [141]))
        ])
"TOPDIR/BUILD/work/redacted/redacted/redacted/git/redacted.c":2903:45 -1
     (expr_list:REG_DEAD (reg/v:SI 22 x22 [orig:141 color ] [141])
        (nil)))
during RTL pass: cprop_hardreg
TOPDIR/BUILD/work/redacted/redacted/redacted/git/redacted.c:2923:1:
internal compiler error: in extract_insn, at recog.cc:2791
0x191624a internal_error(char const*, ...)
???:0
0x6bee26 fancy_abort(char const*, int, char const*)
???:0
0x697469 _fatal_insn(char const*, rtx_def const*, char const*, int, char
const*)
???:0
0x697485 _fatal_insn_not_found(rtx_def const*, char const*, int, char
const*)
???:0
0xbef198 extract_constrain_insn(rtx_insn*)
???:0

On Tue, Sep 12, 2023 at 7:24 PM Ross Burton <ross.burton@arm.com> wrote:

> From: Ross Burton <ross.burton@arm.com>
>
> This series of patches fixes deficiencies in GCC's -fstack-protector
> implementation for AArch64 when using dynamically allocated stack space.
> This is CVE-2023-4039.  See:
>
>
> https://developer.arm.com/Arm%20Security%20Center/GCC%20Stack%20Protector%20Vulnerability%20AArch64
>
> https://github.com/metaredteam/external-disclosures/security/advisories/GHSA-x7ch-h5rf-w2mf
>
> for more details.
>
> Signed-off-by: Ross Burton <ross.burton@arm.com>
> ---
>  meta/recipes-devtools/gcc/gcc-12.3.inc        |    1 +
>  .../gcc/gcc/CVE-2023-4039.patch               | 3093 +++++++++++++++++
>  2 files changed, 3094 insertions(+)
>  create mode 100644 meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch
>
> diff --git a/meta/recipes-devtools/gcc/gcc-12.3.inc
> b/meta/recipes-devtools/gcc/gcc-12.3.inc
> index 4ec03f925c8..5896f26e1af 100644
> --- a/meta/recipes-devtools/gcc/gcc-12.3.inc
> +++ b/meta/recipes-devtools/gcc/gcc-12.3.inc
> @@ -63,6 +63,7 @@ SRC_URI = "${BASEURI} \
>             file://0026-rust-recursion-limit.patch \
>             file://prefix-map-realpath.patch \
>             file://hardcoded-paths.patch \
> +           file://CVE-2023-4039.patch \
>  "
>  SRC_URI[sha256sum] =
> "949a5d4f99e786421a93b532b22ffab5578de7321369975b91aec97adfda8c3b"
>
> diff --git a/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch
> b/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch
> new file mode 100644
> index 00000000000..8cb52849cd3
> --- /dev/null
> +++ b/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch
> @@ -0,0 +1,3093 @@
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Subject: [PATCH 00/19] aarch64: Fix -fstack-protector issue
> +Date: Tue, 12 Sep 2023 16:25:10 +0100
> +
> +This series of patches fixes deficiencies in GCC's -fstack-protector
> +implementation for AArch64 when using dynamically allocated stack space.
> +This is CVE-2023-4039.  See:
> +
> +
> https://developer.arm.com/Arm%20Security%20Center/GCC%20Stack%20Protector%20Vulnerability%20AArch64
> +
> https://github.com/metaredteam/external-disclosures/security/advisories/GHSA-x7ch-h5rf-w2mf
> +
> +for more details.
> +
> +The fix is to put the saved registers above the locals area when
> +-fstack-protector is used.
> +
> +The series also fixes a stack-clash problem that I found while working
> +on the CVE.  In unpatched sources, the stack-clash problem would only
> +trigger for unrealistic numbers of arguments (8K 64-bit arguments, or an
> +equivalent).  But it would be a more significant issue with the new
> +-fstack-protector frame layout.  It's therefore important that both
> +problems are fixed together.
> +
> +Some reorganisation of the code seemed necessary to fix the problems in a
> +cleanish way.  The series is therefore quite long, but only a handful of
> +patches should have any effect on code generation.
> +
> +See the individual patches for a detailed description.
> +
> +Tested on aarch64-linux-gnu. Pushed to trunk and to all active branches.
> +I've also pushed backports to GCC 7+ to vendors/ARM/heads/CVE-2023-4039.
> +
> +CVE: CVE-2023-4039
> +Upstream-Status: Backport
> +Signed-off-by: Ross Burton <ross.burton@arm.com>
> +
> +
> +From 62fbb215cc817e9f2c1ca80282a64f4ee30806bc Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:48 +0100
> +Subject: [PATCH 01/19] aarch64: Use local frame vars in shrink-wrapping
> code
> +
> +aarch64_layout_frame uses a shorthand for referring to
> +cfun->machine->frame:
> +
> +  aarch64_frame &frame = cfun->machine->frame;
> +
> +This patch does the same for some other heavy users of the structure.
> +No functional change intended.
> +
> +gcc/
> +       * config/aarch64/aarch64.cc (aarch64_save_callee_saves): Use
> +       a local shorthand for cfun->machine->frame.
> +       (aarch64_restore_callee_saves, aarch64_get_separate_components):
> +       (aarch64_process_components): Likewise.
> +       (aarch64_allocate_and_probe_stack_space): Likewise.
> +       (aarch64_expand_prologue, aarch64_expand_epilogue): Likewise.
> +       (aarch64_layout_frame): Use existing shorthand for one more case.
> +---
> + gcc/config/aarch64/aarch64.cc | 123 ++++++++++++++++++----------------
> + 1 file changed, 64 insertions(+), 59 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index 226dc9dffd4..ae42ffdedbe 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8351,7 +8351,7 @@ aarch64_layout_frame (void)
> +   frame.is_scs_enabled
> +     = (!crtl->calls_eh_return
> +        && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
> +-       && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0));
> ++       && known_ge (frame.reg_offset[LR_REGNUM], 0));
> +
> +   /* When shadow call stack is enabled, the scs_pop in the epilogue will
> +      restore x30, and we don't need to pop x30 again in the traditional
> +@@ -8763,6 +8763,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
> +                          unsigned start, unsigned limit, bool skip_wb,
> +                          bool hard_fp_valid_p)
> + {
> ++  aarch64_frame &frame = cfun->machine->frame;
> +   rtx_insn *insn;
> +   unsigned regno;
> +   unsigned regno2;
> +@@ -8777,8 +8778,8 @@ aarch64_save_callee_saves (poly_int64 start_offset,
> +       bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
> +
> +       if (skip_wb
> +-        && (regno == cfun->machine->frame.wb_push_candidate1
> +-            || regno == cfun->machine->frame.wb_push_candidate2))
> ++        && (regno == frame.wb_push_candidate1
> ++            || regno == frame.wb_push_candidate2))
> +       continue;
> +
> +       if (cfun->machine->reg_is_wrapped_separately[regno])
> +@@ -8786,7 +8787,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
> +
> +       machine_mode mode = aarch64_reg_save_mode (regno);
> +       reg = gen_rtx_REG (mode, regno);
> +-      offset = start_offset + cfun->machine->frame.reg_offset[regno];
> ++      offset = start_offset + frame.reg_offset[regno];
> +       rtx base_rtx = stack_pointer_rtx;
> +       poly_int64 sp_offset = offset;
> +
> +@@ -8799,7 +8800,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
> +       {
> +         gcc_assert (known_eq (start_offset, 0));
> +         poly_int64 fp_offset
> +-          = cfun->machine->frame.below_hard_fp_saved_regs_size;
> ++          = frame.below_hard_fp_saved_regs_size;
> +         if (hard_fp_valid_p)
> +           base_rtx = hard_frame_pointer_rtx;
> +         else
> +@@ -8821,8 +8822,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
> +         && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <=
> limit
> +         && !cfun->machine->reg_is_wrapped_separately[regno2]
> +         && known_eq (GET_MODE_SIZE (mode),
> +-                     cfun->machine->frame.reg_offset[regno2]
> +-                     - cfun->machine->frame.reg_offset[regno]))
> ++                     frame.reg_offset[regno2] - frame.reg_offset[regno]))
> +       {
> +         rtx reg2 = gen_rtx_REG (mode, regno2);
> +         rtx mem2;
> +@@ -8872,6 +8872,7 @@ static void
> + aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
> +                             unsigned limit, bool skip_wb, rtx *cfi_ops)
> + {
> ++  aarch64_frame &frame = cfun->machine->frame;
> +   unsigned regno;
> +   unsigned regno2;
> +   poly_int64 offset;
> +@@ -8888,13 +8889,13 @@ aarch64_restore_callee_saves (poly_int64
> start_offset, unsigned start,
> +       rtx reg, mem;
> +
> +       if (skip_wb
> +-        && (regno == cfun->machine->frame.wb_pop_candidate1
> +-            || regno == cfun->machine->frame.wb_pop_candidate2))
> ++        && (regno == frame.wb_pop_candidate1
> ++            || regno == frame.wb_pop_candidate2))
> +       continue;
> +
> +       machine_mode mode = aarch64_reg_save_mode (regno);
> +       reg = gen_rtx_REG (mode, regno);
> +-      offset = start_offset + cfun->machine->frame.reg_offset[regno];
> ++      offset = start_offset + frame.reg_offset[regno];
> +       rtx base_rtx = stack_pointer_rtx;
> +       if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
> +       aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
> +@@ -8905,8 +8906,7 @@ aarch64_restore_callee_saves (poly_int64
> start_offset, unsigned start,
> +         && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <=
> limit
> +         && !cfun->machine->reg_is_wrapped_separately[regno2]
> +         && known_eq (GET_MODE_SIZE (mode),
> +-                     cfun->machine->frame.reg_offset[regno2]
> +-                     - cfun->machine->frame.reg_offset[regno]))
> ++                     frame.reg_offset[regno2] - frame.reg_offset[regno]))
> +       {
> +         rtx reg2 = gen_rtx_REG (mode, regno2);
> +         rtx mem2;
> +@@ -9011,6 +9011,7 @@ offset_12bit_unsigned_scaled_p (machine_mode mode,
> poly_int64 offset)
> + static sbitmap
> + aarch64_get_separate_components (void)
> + {
> ++  aarch64_frame &frame = cfun->machine->frame;
> +   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
> +   bitmap_clear (components);
> +
> +@@ -9027,18 +9028,18 @@ aarch64_get_separate_components (void)
> +       if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
> +         continue;
> +
> +-      poly_int64 offset = cfun->machine->frame.reg_offset[regno];
> ++      poly_int64 offset = frame.reg_offset[regno];
> +
> +       /* If the register is saved in the first SVE save slot, we use
> +          it as a stack probe for -fstack-clash-protection.  */
> +       if (flag_stack_clash_protection
> +-          && maybe_ne
> (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
> ++          && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
> +           && known_eq (offset, 0))
> +         continue;
> +
> +       /* Get the offset relative to the register we'll use.  */
> +       if (frame_pointer_needed)
> +-        offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
> ++        offset -= frame.below_hard_fp_saved_regs_size;
> +       else
> +         offset += crtl->outgoing_args_size;
> +
> +@@ -9057,11 +9058,11 @@ aarch64_get_separate_components (void)
> +   /* If the spare predicate register used by big-endian SVE code
> +      is call-preserved, it must be saved in the main prologue
> +      before any saves that use it.  */
> +-  if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
> +-    bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
> ++  if (frame.spare_pred_reg != INVALID_REGNUM)
> ++    bitmap_clear_bit (components, frame.spare_pred_reg);
> +
> +-  unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
> +-  unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
> ++  unsigned reg1 = frame.wb_push_candidate1;
> ++  unsigned reg2 = frame.wb_push_candidate2;
> +   /* If registers have been chosen to be stored/restored with
> +      writeback don't interfere with them to avoid having to output
> explicit
> +      stack adjustment instructions.  */
> +@@ -9170,6 +9171,7 @@ aarch64_get_next_set_bit (sbitmap bmp, unsigned int
> start)
> + static void
> + aarch64_process_components (sbitmap components, bool prologue_p)
> + {
> ++  aarch64_frame &frame = cfun->machine->frame;
> +   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
> +                            ? HARD_FRAME_POINTER_REGNUM
> +                            : STACK_POINTER_REGNUM);
> +@@ -9184,9 +9186,9 @@ aarch64_process_components (sbitmap components,
> bool prologue_p)
> +       machine_mode mode = aarch64_reg_save_mode (regno);
> +
> +       rtx reg = gen_rtx_REG (mode, regno);
> +-      poly_int64 offset = cfun->machine->frame.reg_offset[regno];
> ++      poly_int64 offset = frame.reg_offset[regno];
> +       if (frame_pointer_needed)
> +-      offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
> ++      offset -= frame.below_hard_fp_saved_regs_size;
> +       else
> +       offset += crtl->outgoing_args_size;
> +
> +@@ -9211,14 +9213,14 @@ aarch64_process_components (sbitmap components,
> bool prologue_p)
> +         break;
> +       }
> +
> +-      poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
> ++      poly_int64 offset2 = frame.reg_offset[regno2];
> +       /* The next register is not of the same class or its offset is not
> +        mergeable with the current one into a pair.  */
> +       if (aarch64_sve_mode_p (mode)
> +         || !satisfies_constraint_Ump (mem)
> +         || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
> +         || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
> +-        || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
> ++        || maybe_ne ((offset2 - frame.reg_offset[regno]),
> +                      GET_MODE_SIZE (mode)))
> +       {
> +         insn = emit_insn (set);
> +@@ -9240,7 +9242,7 @@ aarch64_process_components (sbitmap components,
> bool prologue_p)
> +       /* REGNO2 can be saved/restored in a pair with REGNO.  */
> +       rtx reg2 = gen_rtx_REG (mode, regno2);
> +       if (frame_pointer_needed)
> +-      offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
> ++      offset2 -= frame.below_hard_fp_saved_regs_size;
> +       else
> +       offset2 += crtl->outgoing_args_size;
> +       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
> +@@ -9335,6 +9337,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1,
> rtx temp2,
> +                                       bool frame_related_p,
> +                                       bool final_adjustment_p)
> + {
> ++  aarch64_frame &frame = cfun->machine->frame;
> +   HOST_WIDE_INT guard_size
> +     = 1 << param_stack_clash_protection_guard_size;
> +   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
> +@@ -9355,25 +9358,25 @@ aarch64_allocate_and_probe_stack_space (rtx
> temp1, rtx temp2,
> +        register as a probe.  We can't assume that LR was saved at
> position 0
> +        though, so treat any space below it as unprobed.  */
> +   if (final_adjustment_p
> +-      && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size,
> 0))
> ++      && known_eq (frame.below_hard_fp_saved_regs_size, 0))
> +     {
> +-      poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
> ++      poly_int64 lr_offset = frame.reg_offset[LR_REGNUM];
> +       if (known_ge (lr_offset, 0))
> +       min_probe_threshold -= lr_offset.to_constant ();
> +       else
> +       gcc_assert (!flag_stack_clash_protection || known_eq (poly_size,
> 0));
> +     }
> +
> +-  poly_int64 frame_size = cfun->machine->frame.frame_size;
> ++  poly_int64 frame_size = frame.frame_size;
> +
> +   /* We should always have a positive probe threshold.  */
> +   gcc_assert (min_probe_threshold > 0);
> +
> +   if (flag_stack_clash_protection && !final_adjustment_p)
> +     {
> +-      poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
> +-      poly_int64 sve_callee_adjust =
> cfun->machine->frame.sve_callee_adjust;
> +-      poly_int64 final_adjust = cfun->machine->frame.final_adjust;
> ++      poly_int64 initial_adjust = frame.initial_adjust;
> ++      poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
> ++      poly_int64 final_adjust = frame.final_adjust;
> +
> +       if (known_eq (frame_size, 0))
> +       {
> +@@ -9662,17 +9665,18 @@ aarch64_epilogue_uses (int regno)
> + void
> + aarch64_expand_prologue (void)
> + {
> +-  poly_int64 frame_size = cfun->machine->frame.frame_size;
> +-  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
> +-  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
> +-  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
> +-  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
> +-  poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
> ++  aarch64_frame &frame = cfun->machine->frame;
> ++  poly_int64 frame_size = frame.frame_size;
> ++  poly_int64 initial_adjust = frame.initial_adjust;
> ++  HOST_WIDE_INT callee_adjust = frame.callee_adjust;
> ++  poly_int64 final_adjust = frame.final_adjust;
> ++  poly_int64 callee_offset = frame.callee_offset;
> ++  poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
> +   poly_int64 below_hard_fp_saved_regs_size
> +-    = cfun->machine->frame.below_hard_fp_saved_regs_size;
> +-  unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
> +-  unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
> +-  bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
> ++    = frame.below_hard_fp_saved_regs_size;
> ++  unsigned reg1 = frame.wb_push_candidate1;
> ++  unsigned reg2 = frame.wb_push_candidate2;
> ++  bool emit_frame_chain = frame.emit_frame_chain;
> +   rtx_insn *insn;
> +
> +   if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
> +@@ -9703,7 +9707,7 @@ aarch64_expand_prologue (void)
> +     }
> +
> +   /* Push return address to shadow call stack.  */
> +-  if (cfun->machine->frame.is_scs_enabled)
> ++  if (frame.is_scs_enabled)
> +     emit_insn (gen_scs_push ());
> +
> +   if (flag_stack_usage_info)
> +@@ -9742,7 +9746,7 @@ aarch64_expand_prologue (void)
> +
> +   /* The offset of the frame chain record (if any) from the current SP.
> */
> +   poly_int64 chain_offset = (initial_adjust + callee_adjust
> +-                           - cfun->machine->frame.hard_fp_offset);
> ++                           - frame.hard_fp_offset);
> +   gcc_assert (known_ge (chain_offset, 0));
> +
> +   /* The offset of the bottom of the save area from the current SP.  */
> +@@ -9845,16 +9849,17 @@ aarch64_use_return_insn_p (void)
> + void
> + aarch64_expand_epilogue (bool for_sibcall)
> + {
> +-  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
> +-  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
> +-  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
> +-  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
> +-  poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
> ++  aarch64_frame &frame = cfun->machine->frame;
> ++  poly_int64 initial_adjust = frame.initial_adjust;
> ++  HOST_WIDE_INT callee_adjust = frame.callee_adjust;
> ++  poly_int64 final_adjust = frame.final_adjust;
> ++  poly_int64 callee_offset = frame.callee_offset;
> ++  poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
> +   poly_int64 below_hard_fp_saved_regs_size
> +-    = cfun->machine->frame.below_hard_fp_saved_regs_size;
> +-  unsigned reg1 = cfun->machine->frame.wb_pop_candidate1;
> +-  unsigned reg2 = cfun->machine->frame.wb_pop_candidate2;
> +-  unsigned int last_gpr = (cfun->machine->frame.is_scs_enabled
> ++    = frame.below_hard_fp_saved_regs_size;
> ++  unsigned reg1 = frame.wb_pop_candidate1;
> ++  unsigned reg2 = frame.wb_pop_candidate2;
> ++  unsigned int last_gpr = (frame.is_scs_enabled
> +                          ? R29_REGNUM : R30_REGNUM);
> +   rtx cfi_ops = NULL;
> +   rtx_insn *insn;
> +@@ -9888,7 +9893,7 @@ aarch64_expand_epilogue (bool for_sibcall)
> +   /* We need to add memory barrier to prevent read from deallocated
> stack.  */
> +   bool need_barrier_p
> +     = maybe_ne (get_frame_size ()
> +-              + cfun->machine->frame.saved_varargs_size, 0);
> ++              + frame.saved_varargs_size, 0);
> +
> +   /* Emit a barrier to prevent loads from a deallocated stack.  */
> +   if (maybe_gt (final_adjust, crtl->outgoing_args_size)
> +@@ -9969,7 +9974,7 @@ aarch64_expand_epilogue (bool for_sibcall)
> +     }
> +
> +   /* Pop return address from shadow call stack.  */
> +-  if (cfun->machine->frame.is_scs_enabled)
> ++  if (frame.is_scs_enabled)
> +     {
> +       machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
> +       rtx reg = gen_rtx_REG (mode, R30_REGNUM);
> +@@ -12564,24 +12569,24 @@ aarch64_can_eliminate (const int from
> ATTRIBUTE_UNUSED, const int to)
> + poly_int64
> + aarch64_initial_elimination_offset (unsigned from, unsigned to)
> + {
> ++  aarch64_frame &frame = cfun->machine->frame;
> ++
> +   if (to == HARD_FRAME_POINTER_REGNUM)
> +     {
> +       if (from == ARG_POINTER_REGNUM)
> +-      return cfun->machine->frame.hard_fp_offset;
> ++      return frame.hard_fp_offset;
> +
> +       if (from == FRAME_POINTER_REGNUM)
> +-      return cfun->machine->frame.hard_fp_offset
> +-             - cfun->machine->frame.locals_offset;
> ++      return frame.hard_fp_offset - frame.locals_offset;
> +     }
> +
> +   if (to == STACK_POINTER_REGNUM)
> +     {
> +       if (from == FRAME_POINTER_REGNUM)
> +-        return cfun->machine->frame.frame_size
> +-               - cfun->machine->frame.locals_offset;
> ++      return frame.frame_size - frame.locals_offset;
> +     }
> +
> +-  return cfun->machine->frame.frame_size;
> ++  return frame.frame_size;
> + }
> +
> +
> +--
> +2.34.1
> +
> +
> +From 12a8889de169f892d2e927584c00d20b8b7e456f Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:49 +0100
> +Subject: [PATCH 02/19] aarch64: Avoid a use of callee_offset
> +
> +When we emit the frame chain, i.e. when we reach Here in this statement
> +of aarch64_expand_prologue:
> +
> +  if (emit_frame_chain)
> +    {
> +      // Here
> +      ...
> +    }
> +
> +the stack is in one of two states:
> +
> +- We've allocated up to the frame chain, but no more.
> +
> +- We've allocated the whole frame, and the frame chain is within easy
> +  reach of the new SP.
> +
> +The offset of the frame chain from the current SP is available
> +in aarch64_frame as callee_offset.  It is also available as the
> +chain_offset local variable, where the latter is calculated from other
> +data.  (However, chain_offset is not always equal to callee_offset when
> +!emit_frame_chain, so chain_offset isn't redundant.)
> +
> +In c600df9a4060da3c6121ff4d0b93f179eafd69d1 I switched to using
> +chain_offset for the initialisation of the hard frame pointer:
> +
> +       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
> +-                         stack_pointer_rtx, callee_offset,
> ++                         stack_pointer_rtx, chain_offset,
> +                          tmp1_rtx, tmp0_rtx, frame_pointer_needed);
> +
> +But the later REG_CFA_ADJUST_CFA handling still used callee_offset.
> +
> +I think the difference is harmless, but it's more logical for the
> +CFA note to be in sync, and it's more convenient for later patches
> +if it uses chain_offset.
> +
> +gcc/
> +       * config/aarch64/aarch64.cc (aarch64_expand_prologue): Use
> +       chain_offset rather than callee_offset.
> +---
> + gcc/config/aarch64/aarch64.cc | 4 +---
> + 1 file changed, 1 insertion(+), 3 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index ae42ffdedbe..79253322fd7 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -9670,7 +9670,6 @@ aarch64_expand_prologue (void)
> +   poly_int64 initial_adjust = frame.initial_adjust;
> +   HOST_WIDE_INT callee_adjust = frame.callee_adjust;
> +   poly_int64 final_adjust = frame.final_adjust;
> +-  poly_int64 callee_offset = frame.callee_offset;
> +   poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
> +   poly_int64 below_hard_fp_saved_regs_size
> +     = frame.below_hard_fp_saved_regs_size;
> +@@ -9783,8 +9782,7 @@ aarch64_expand_prologue (void)
> +            implicit.  */
> +         if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
> +           {
> +-            rtx src = plus_constant (Pmode, stack_pointer_rtx,
> +-                                     callee_offset);
> ++            rtx src = plus_constant (Pmode, stack_pointer_rtx,
> chain_offset);
> +             add_reg_note (insn, REG_CFA_ADJUST_CFA,
> +                           gen_rtx_SET (hard_frame_pointer_rtx, src));
> +           }
> +--
> +2.34.1
> +
> +
> +From 03d5e89e7f3be53fd7142556e8e0a2774c653dca Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:49 +0100
> +Subject: [PATCH 03/19] aarch64: Explicitly handle frames with no saved
> + registers
> +
> +If a frame has no saved registers, it can be allocated in one go.
> +There is no need to treat the areas below and above the saved
> +registers as separate.
> +
> +And if we allocate the frame in one go, it should be allocated
> +as the initial_adjust rather than the final_adjust.  This allows the
> +frame size to grow to guard_size - guard_used_by_caller before a stack
> +probe is needed.  (A frame with no register saves is necessarily a
> +leaf frame.)
> +
> +This is a no-op as thing stand, since a leaf function will have
> +no outgoing arguments, and so all the frame will be above where
> +the saved registers normally go.
> +
> +gcc/
> +       * config/aarch64/aarch64.cc (aarch64_layout_frame): Explicitly
> +       allocate the frame in one go if there are no saved registers.
> +---
> + gcc/config/aarch64/aarch64.cc | 8 +++++---
> + 1 file changed, 5 insertions(+), 3 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index 79253322fd7..e1f21230c15 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8378,9 +8378,11 @@ aarch64_layout_frame (void)
> +
> +   HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
> +   HOST_WIDE_INT const_saved_regs_size;
> +-  if (frame.frame_size.is_constant (&const_size)
> +-      && const_size < max_push_offset
> +-      && known_eq (frame.hard_fp_offset, const_size))
> ++  if (known_eq (frame.saved_regs_size, 0))
> ++    frame.initial_adjust = frame.frame_size;
> ++  else if (frame.frame_size.is_constant (&const_size)
> ++         && const_size < max_push_offset
> ++         && known_eq (frame.hard_fp_offset, const_size))
> +     {
> +       /* Simple, small frame with no outgoing arguments:
> +
> +--
> +2.34.1
> +
> +
> +From 49c2eb7616756c323b7f6b18d8616ec945eb1263 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:49 +0100
> +Subject: [PATCH 04/19] aarch64: Add bytes_below_saved_regs to frame info
> +
> +The frame layout code currently hard-codes the assumption that
> +the number of bytes below the saved registers is equal to the
> +size of the outgoing arguments.  This patch abstracts that
> +value into a new field of aarch64_frame.
> +
> +gcc/
> +       * config/aarch64/aarch64.h
> (aarch64_frame::bytes_below_saved_regs): New
> +       field.
> +       * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize it,
> +       and use it instead of crtl->outgoing_args_size.
> +       (aarch64_get_separate_components): Use bytes_below_saved_regs
> instead
> +       of outgoing_args_size.
> +       (aarch64_process_components): Likewise.
> +---
> + gcc/config/aarch64/aarch64.cc | 71 ++++++++++++++++++-----------------
> + gcc/config/aarch64/aarch64.h  |  5 +++
> + 2 files changed, 41 insertions(+), 35 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index e1f21230c15..94e1b686584 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8217,6 +8217,8 @@ aarch64_layout_frame (void)
> +   gcc_assert (crtl->is_leaf
> +             || maybe_ne (frame.reg_offset[R30_REGNUM],
> SLOT_NOT_REQUIRED));
> +
> ++  frame.bytes_below_saved_regs = crtl->outgoing_args_size;
> ++
> +   /* Now assign stack slots for the registers.  Start with the predicate
> +      registers, since predicate LDR and STR have a relatively small
> +      offset range.  These saves happen below the hard frame pointer.  */
> +@@ -8321,18 +8323,18 @@ aarch64_layout_frame (void)
> +
> +   poly_int64 varargs_and_saved_regs_size = offset +
> frame.saved_varargs_size;
> +
> +-  poly_int64 above_outgoing_args
> ++  poly_int64 saved_regs_and_above
> +     = aligned_upper_bound (varargs_and_saved_regs_size
> +                          + get_frame_size (),
> +                          STACK_BOUNDARY / BITS_PER_UNIT);
> +
> +   frame.hard_fp_offset
> +-    = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
> ++    = saved_regs_and_above - frame.below_hard_fp_saved_regs_size;
> +
> +   /* Both these values are already aligned.  */
> +-  gcc_assert (multiple_p (crtl->outgoing_args_size,
> ++  gcc_assert (multiple_p (frame.bytes_below_saved_regs,
> +                         STACK_BOUNDARY / BITS_PER_UNIT));
> +-  frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
> ++  frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs;
> +
> +   frame.locals_offset = frame.saved_varargs_size;
> +
> +@@ -8376,7 +8378,7 @@ aarch64_layout_frame (void)
> +   else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
> +     max_push_offset = 256;
> +
> +-  HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
> ++  HOST_WIDE_INT const_size, const_below_saved_regs, const_fp_offset;
> +   HOST_WIDE_INT const_saved_regs_size;
> +   if (known_eq (frame.saved_regs_size, 0))
> +     frame.initial_adjust = frame.frame_size;
> +@@ -8384,31 +8386,31 @@ aarch64_layout_frame (void)
> +          && const_size < max_push_offset
> +          && known_eq (frame.hard_fp_offset, const_size))
> +     {
> +-      /* Simple, small frame with no outgoing arguments:
> ++      /* Simple, small frame with no data below the saved registers.
> +
> +        stp reg1, reg2, [sp, -frame_size]!
> +        stp reg3, reg4, [sp, 16]  */
> +       frame.callee_adjust = const_size;
> +     }
> +-  else if (crtl->outgoing_args_size.is_constant
> (&const_outgoing_args_size)
> ++  else if (frame.bytes_below_saved_regs.is_constant
> (&const_below_saved_regs)
> +          && frame.saved_regs_size.is_constant (&const_saved_regs_size)
> +-         && const_outgoing_args_size + const_saved_regs_size < 512
> +-         /* We could handle this case even with outgoing args, provided
> +-            that the number of args left us with valid offsets for all
> +-            predicate and vector save slots.  It's such a rare case that
> +-            it hardly seems worth the effort though.  */
> +-         && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
> ++         && const_below_saved_regs + const_saved_regs_size < 512
> ++         /* We could handle this case even with data below the saved
> ++            registers, provided that that data left us with valid offsets
> ++            for all predicate and vector save slots.  It's such a rare
> ++            case that it hardly seems worth the effort though.  */
> ++         && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
> +          && !(cfun->calls_alloca
> +               && frame.hard_fp_offset.is_constant (&const_fp_offset)
> +               && const_fp_offset < max_push_offset))
> +     {
> +-      /* Frame with small outgoing arguments:
> ++      /* Frame with small area below the saved registers:
> +
> +        sub sp, sp, frame_size
> +-       stp reg1, reg2, [sp, outgoing_args_size]
> +-       stp reg3, reg4, [sp, outgoing_args_size + 16]  */
> ++       stp reg1, reg2, [sp, bytes_below_saved_regs]
> ++       stp reg3, reg4, [sp, bytes_below_saved_regs + 16]  */
> +       frame.initial_adjust = frame.frame_size;
> +-      frame.callee_offset = const_outgoing_args_size;
> ++      frame.callee_offset = const_below_saved_regs;
> +     }
> +   else if (saves_below_hard_fp_p
> +          && known_eq (frame.saved_regs_size,
> +@@ -8418,30 +8420,29 @@ aarch64_layout_frame (void)
> +
> +        sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
> +        save SVE registers relative to SP
> +-       sub sp, sp, outgoing_args_size  */
> ++       sub sp, sp, bytes_below_saved_regs  */
> +       frame.initial_adjust = (frame.hard_fp_offset
> +                             + frame.below_hard_fp_saved_regs_size);
> +-      frame.final_adjust = crtl->outgoing_args_size;
> ++      frame.final_adjust = frame.bytes_below_saved_regs;
> +     }
> +   else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
> +          && const_fp_offset < max_push_offset)
> +     {
> +-      /* Frame with large outgoing arguments or SVE saves, but with
> +-       a small local area:
> ++      /* Frame with large area below the saved registers, or with SVE
> saves,
> ++       but with a small area above:
> +
> +        stp reg1, reg2, [sp, -hard_fp_offset]!
> +        stp reg3, reg4, [sp, 16]
> +        [sub sp, sp, below_hard_fp_saved_regs_size]
> +        [save SVE registers relative to SP]
> +-       sub sp, sp, outgoing_args_size  */
> ++       sub sp, sp, bytes_below_saved_regs  */
> +       frame.callee_adjust = const_fp_offset;
> +       frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
> +-      frame.final_adjust = crtl->outgoing_args_size;
> ++      frame.final_adjust = frame.bytes_below_saved_regs;
> +     }
> +   else
> +     {
> +-      /* Frame with large local area and outgoing arguments or SVE saves,
> +-       using frame pointer:
> ++      /* General case:
> +
> +        sub sp, sp, hard_fp_offset
> +        stp x29, x30, [sp, 0]
> +@@ -8449,10 +8450,10 @@ aarch64_layout_frame (void)
> +        stp reg3, reg4, [sp, 16]
> +        [sub sp, sp, below_hard_fp_saved_regs_size]
> +        [save SVE registers relative to SP]
> +-       sub sp, sp, outgoing_args_size  */
> ++       sub sp, sp, bytes_below_saved_regs  */
> +       frame.initial_adjust = frame.hard_fp_offset;
> +       frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
> +-      frame.final_adjust = crtl->outgoing_args_size;
> ++      frame.final_adjust = frame.bytes_below_saved_regs;
> +     }
> +
> +   /* Make sure the individual adjustments add up to the full frame
> size.  */
> +@@ -9043,7 +9044,7 @@ aarch64_get_separate_components (void)
> +       if (frame_pointer_needed)
> +         offset -= frame.below_hard_fp_saved_regs_size;
> +       else
> +-        offset += crtl->outgoing_args_size;
> ++        offset += frame.bytes_below_saved_regs;
> +
> +       /* Check that we can access the stack slot of the register with one
> +          direct load with no adjustments needed.  */
> +@@ -9192,7 +9193,7 @@ aarch64_process_components (sbitmap components,
> bool prologue_p)
> +       if (frame_pointer_needed)
> +       offset -= frame.below_hard_fp_saved_regs_size;
> +       else
> +-      offset += crtl->outgoing_args_size;
> ++      offset += frame.bytes_below_saved_regs;
> +
> +       rtx addr = plus_constant (Pmode, ptr_reg, offset);
> +       rtx mem = gen_frame_mem (mode, addr);
> +@@ -9246,7 +9247,7 @@ aarch64_process_components (sbitmap components,
> bool prologue_p)
> +       if (frame_pointer_needed)
> +       offset2 -= frame.below_hard_fp_saved_regs_size;
> +       else
> +-      offset2 += crtl->outgoing_args_size;
> ++      offset2 += frame.bytes_below_saved_regs;
> +       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
> +       rtx mem2 = gen_frame_mem (mode, addr2);
> +       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
> +@@ -9320,10 +9321,10 @@ aarch64_stack_clash_protection_alloca_probe_range
> (void)
> +    registers.  If POLY_SIZE is not large enough to require a probe this
> function
> +    will only adjust the stack.  When allocating the stack space
> +    FRAME_RELATED_P is then used to indicate if the allocation is frame
> related.
> +-   FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
> +-   arguments.  If we are then we ensure that any allocation larger than
> the ABI
> +-   defined buffer needs a probe so that the invariant of having a 1KB
> buffer is
> +-   maintained.
> ++   FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
> ++   the saved registers.  If we are then we ensure that any allocation
> ++   larger than the ABI defined buffer needs a probe so that the
> ++   invariant of having a 1KB buffer is maintained.
> +
> +    We emit barriers after each stack adjustment to prevent optimizations
> from
> +    breaking the invariant that we never drop the stack more than a
> page.  This
> +@@ -9532,7 +9533,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1,
> rtx temp2,
> +   /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD
> have to
> +      be probed.  This maintains the requirement that each page is probed
> at
> +      least once.  For initial probing we probe only if the allocation is
> +-     more than GUARD_SIZE - buffer, and for the outgoing arguments we
> probe
> ++     more than GUARD_SIZE - buffer, and below the saved registers we
> probe
> +      if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer
> ==
> +      GUARD_SIZE.  This works that for any allocation that is large
> enough to
> +      trigger a probe here, we'll have at least one, and if they're not
> large
> +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> +index 6834c3e9922..1e105e12db8 100644
> +--- a/gcc/config/aarch64/aarch64.h
> ++++ b/gcc/config/aarch64/aarch64.h
> +@@ -871,6 +871,11 @@ struct GTY (()) aarch64_frame
> +   /* The size of the callee-save registers with a slot in REG_OFFSET.  */
> +   poly_int64 saved_regs_size;
> +
> ++  /* The number of bytes between the bottom of the static frame (the
> bottom
> ++     of the outgoing arguments) and the bottom of the register save area.
> ++     This value is always a multiple of STACK_BOUNDARY.  */
> ++  poly_int64 bytes_below_saved_regs;
> ++
> +   /* The size of the callee-save registers with a slot in REG_OFFSET that
> +      are saved below the hard frame pointer.  */
> +   poly_int64 below_hard_fp_saved_regs_size;
> +--
> +2.34.1
> +
> +
> +From 34081079ea4de0c98331843f574b5f6f94d7b234 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:50 +0100
> +Subject: [PATCH 05/19] aarch64: Add bytes_below_hard_fp to frame info
> +
> +Following on from the previous bytes_below_saved_regs patch, this one
> +records the number of bytes that are below the hard frame pointer.
> +This eventually replaces below_hard_fp_saved_regs_size.
> +
> +If a frame pointer is not needed, the epilogue adds final_adjust
> +to the stack pointer before restoring registers:
> +
> +     aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
> +
> +Therefore, if the epilogue needs to restore the stack pointer from
> +the hard frame pointer, the directly corresponding offset is:
> +
> +     -bytes_below_hard_fp + final_adjust
> +
> +i.e. go from the hard frame pointer to the bottom of the frame,
> +then add the same amount as if we were using the stack pointer
> +from the outset.
> +
> +gcc/
> +       * config/aarch64/aarch64.h (aarch64_frame::bytes_below_hard_fp):
> New
> +       field.
> +       * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize it.
> +       (aarch64_expand_epilogue): Use it instead of
> +       below_hard_fp_saved_regs_size.
> +---
> + gcc/config/aarch64/aarch64.cc | 6 +++---
> + gcc/config/aarch64/aarch64.h  | 5 +++++
> + 2 files changed, 8 insertions(+), 3 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index 94e1b686584..c7d84245fbf 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8269,6 +8269,7 @@ aarch64_layout_frame (void)
> +      of the callee save area.  */
> +   bool saves_below_hard_fp_p = maybe_ne (offset, 0);
> +   frame.below_hard_fp_saved_regs_size = offset;
> ++  frame.bytes_below_hard_fp = offset + frame.bytes_below_saved_regs;
> +   if (frame.emit_frame_chain)
> +     {
> +       /* FP and LR are placed in the linkage record.  */
> +@@ -9856,8 +9857,7 @@ aarch64_expand_epilogue (bool for_sibcall)
> +   poly_int64 final_adjust = frame.final_adjust;
> +   poly_int64 callee_offset = frame.callee_offset;
> +   poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
> +-  poly_int64 below_hard_fp_saved_regs_size
> +-    = frame.below_hard_fp_saved_regs_size;
> ++  poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
> +   unsigned reg1 = frame.wb_pop_candidate1;
> +   unsigned reg2 = frame.wb_pop_candidate2;
> +   unsigned int last_gpr = (frame.is_scs_enabled
> +@@ -9915,7 +9915,7 @@ aarch64_expand_epilogue (bool for_sibcall)
> +        is restored on the instruction doing the writeback.  */
> +     aarch64_add_offset (Pmode, stack_pointer_rtx,
> +                       hard_frame_pointer_rtx,
> +-                      -callee_offset - below_hard_fp_saved_regs_size,
> ++                      -bytes_below_hard_fp + final_adjust,
> +                       tmp1_rtx, tmp0_rtx, callee_adjust == 0);
> +   else
> +      /* The case where we need to re-use the register here is very rare,
> so
> +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> +index 1e105e12db8..de68ff7202f 100644
> +--- a/gcc/config/aarch64/aarch64.h
> ++++ b/gcc/config/aarch64/aarch64.h
> +@@ -880,6 +880,11 @@ struct GTY (()) aarch64_frame
> +      are saved below the hard frame pointer.  */
> +   poly_int64 below_hard_fp_saved_regs_size;
> +
> ++  /* The number of bytes between the bottom of the static frame (the
> bottom
> ++     of the outgoing arguments) and the hard frame pointer.  This value
> is
> ++     always a multiple of STACK_BOUNDARY.  */
> ++  poly_int64 bytes_below_hard_fp;
> ++
> +   /* Offset from the base of the frame (incomming SP) to the
> +      top of the locals area.  This value is always a multiple of
> +      STACK_BOUNDARY.  */
> +--
> +2.34.1
> +
> +
> +From 187861af7c51db9eddc6f954b589c121b210fc74 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:50 +0100
> +Subject: [PATCH 06/19] aarch64: Tweak aarch64_save/restore_callee_saves
> +
> +aarch64_save_callee_saves and aarch64_restore_callee_saves took
> +a parameter called start_offset that gives the offset of the
> +bottom of the saved register area from the current stack pointer.
> +However, it's more convenient for later patches if we use the
> +bottom of the entire frame as the reference point, rather than
> +the bottom of the saved registers.
> +
> +Doing that removes the need for the callee_offset field.
> +Other than that, this is not a win on its own.  It only really
> +makes sense in combination with the follow-on patches.
> +
> +gcc/
> +       * config/aarch64/aarch64.h (aarch64_frame::callee_offset): Delete.
> +       * config/aarch64/aarch64.cc (aarch64_layout_frame): Remove
> +       callee_offset handling.
> +       (aarch64_save_callee_saves): Replace the start_offset parameter
> +       with a bytes_below_sp parameter.
> +       (aarch64_restore_callee_saves): Likewise.
> +       (aarch64_expand_prologue): Update accordingly.
> +       (aarch64_expand_epilogue): Likewise.
> +---
> + gcc/config/aarch64/aarch64.cc | 56 +++++++++++++++++------------------
> + gcc/config/aarch64/aarch64.h  |  4 ---
> + 2 files changed, 28 insertions(+), 32 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index c7d84245fbf..e79551af41d 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8343,7 +8343,6 @@ aarch64_layout_frame (void)
> +   frame.final_adjust = 0;
> +   frame.callee_adjust = 0;
> +   frame.sve_callee_adjust = 0;
> +-  frame.callee_offset = 0;
> +
> +   frame.wb_pop_candidate1 = frame.wb_push_candidate1;
> +   frame.wb_pop_candidate2 = frame.wb_push_candidate2;
> +@@ -8411,7 +8410,6 @@ aarch64_layout_frame (void)
> +        stp reg1, reg2, [sp, bytes_below_saved_regs]
> +        stp reg3, reg4, [sp, bytes_below_saved_regs + 16]  */
> +       frame.initial_adjust = frame.frame_size;
> +-      frame.callee_offset = const_below_saved_regs;
> +     }
> +   else if (saves_below_hard_fp_p
> +          && known_eq (frame.saved_regs_size,
> +@@ -8758,12 +8756,13 @@ aarch64_add_cfa_expression (rtx_insn *insn, rtx
> reg,
> + }
> +
> + /* Emit code to save the callee-saved registers from register number
> START
> +-   to LIMIT to the stack at the location starting at offset START_OFFSET,
> +-   skipping any write-back candidates if SKIP_WB is true.
> HARD_FP_VALID_P
> +-   is true if the hard frame pointer has been set up.  */
> ++   to LIMIT to the stack.  The stack pointer is currently BYTES_BELOW_SP
> ++   bytes above the bottom of the static frame.  Skip any write-back
> ++   candidates if SKIP_WB is true.  HARD_FP_VALID_P is true if the hard
> ++   frame pointer has been set up.  */
> +
> + static void
> +-aarch64_save_callee_saves (poly_int64 start_offset,
> ++aarch64_save_callee_saves (poly_int64 bytes_below_sp,
> +                          unsigned start, unsigned limit, bool skip_wb,
> +                          bool hard_fp_valid_p)
> + {
> +@@ -8791,7 +8790,9 @@ aarch64_save_callee_saves (poly_int64 start_offset,
> +
> +       machine_mode mode = aarch64_reg_save_mode (regno);
> +       reg = gen_rtx_REG (mode, regno);
> +-      offset = start_offset + frame.reg_offset[regno];
> ++      offset = (frame.reg_offset[regno]
> ++              + frame.bytes_below_saved_regs
> ++              - bytes_below_sp);
> +       rtx base_rtx = stack_pointer_rtx;
> +       poly_int64 sp_offset = offset;
> +
> +@@ -8802,9 +8803,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
> +       else if (GP_REGNUM_P (regno)
> +              && (!offset.is_constant (&const_offset) || const_offset >=
> 512))
> +       {
> +-        gcc_assert (known_eq (start_offset, 0));
> +-        poly_int64 fp_offset
> +-          = frame.below_hard_fp_saved_regs_size;
> ++        poly_int64 fp_offset = frame.bytes_below_hard_fp -
> bytes_below_sp;
> +         if (hard_fp_valid_p)
> +           base_rtx = hard_frame_pointer_rtx;
> +         else
> +@@ -8868,12 +8867,13 @@ aarch64_save_callee_saves (poly_int64
> start_offset,
> + }
> +
> + /* Emit code to restore the callee registers from register number START
> +-   up to and including LIMIT.  Restore from the stack offset
> START_OFFSET,
> +-   skipping any write-back candidates if SKIP_WB is true.  Write the
> +-   appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
> ++   up to and including LIMIT.  The stack pointer is currently
> BYTES_BELOW_SP
> ++   bytes above the bottom of the static frame.  Skip any write-back
> ++   candidates if SKIP_WB is true.  Write the appropriate REG_CFA_RESTORE
> ++   notes into CFI_OPS.  */
> +
> + static void
> +-aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
> ++aarch64_restore_callee_saves (poly_int64 bytes_below_sp, unsigned start,
> +                             unsigned limit, bool skip_wb, rtx *cfi_ops)
> + {
> +   aarch64_frame &frame = cfun->machine->frame;
> +@@ -8899,7 +8899,9 @@ aarch64_restore_callee_saves (poly_int64
> start_offset, unsigned start,
> +
> +       machine_mode mode = aarch64_reg_save_mode (regno);
> +       reg = gen_rtx_REG (mode, regno);
> +-      offset = start_offset + frame.reg_offset[regno];
> ++      offset = (frame.reg_offset[regno]
> ++              + frame.bytes_below_saved_regs
> ++              - bytes_below_sp);
> +       rtx base_rtx = stack_pointer_rtx;
> +       if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
> +       aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
> +@@ -9675,8 +9677,6 @@ aarch64_expand_prologue (void)
> +   HOST_WIDE_INT callee_adjust = frame.callee_adjust;
> +   poly_int64 final_adjust = frame.final_adjust;
> +   poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
> +-  poly_int64 below_hard_fp_saved_regs_size
> +-    = frame.below_hard_fp_saved_regs_size;
> +   unsigned reg1 = frame.wb_push_candidate1;
> +   unsigned reg2 = frame.wb_push_candidate2;
> +   bool emit_frame_chain = frame.emit_frame_chain;
> +@@ -9752,8 +9752,8 @@ aarch64_expand_prologue (void)
> +                            - frame.hard_fp_offset);
> +   gcc_assert (known_ge (chain_offset, 0));
> +
> +-  /* The offset of the bottom of the save area from the current SP.  */
> +-  poly_int64 saved_regs_offset = chain_offset -
> below_hard_fp_saved_regs_size;
> ++  /* The offset of the current SP from the bottom of the static frame.
> */
> ++  poly_int64 bytes_below_sp = frame_size - initial_adjust -
> callee_adjust;
> +
> +   if (emit_frame_chain)
> +     {
> +@@ -9761,7 +9761,7 @@ aarch64_expand_prologue (void)
> +       {
> +         reg1 = R29_REGNUM;
> +         reg2 = R30_REGNUM;
> +-        aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
> ++        aarch64_save_callee_saves (bytes_below_sp, reg1, reg2,
> +                                    false, false);
> +       }
> +       else
> +@@ -9801,7 +9801,7 @@ aarch64_expand_prologue (void)
> +       emit_insn (gen_stack_tie (stack_pointer_rtx,
> hard_frame_pointer_rtx));
> +     }
> +
> +-  aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
> ++  aarch64_save_callee_saves (bytes_below_sp, R0_REGNUM, R30_REGNUM,
> +                            callee_adjust != 0 || emit_frame_chain,
> +                            emit_frame_chain);
> +   if (maybe_ne (sve_callee_adjust, 0))
> +@@ -9811,16 +9811,17 @@ aarch64_expand_prologue (void)
> +       aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
> +                                             sve_callee_adjust,
> +                                             !frame_pointer_needed,
> false);
> +-      saved_regs_offset += sve_callee_adjust;
> ++      bytes_below_sp -= sve_callee_adjust;
> +     }
> +-  aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
> ++  aarch64_save_callee_saves (bytes_below_sp, P0_REGNUM, P15_REGNUM,
> +                            false, emit_frame_chain);
> +-  aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
> ++  aarch64_save_callee_saves (bytes_below_sp, V0_REGNUM, V31_REGNUM,
> +                            callee_adjust != 0 || emit_frame_chain,
> +                            emit_frame_chain);
> +
> +   /* We may need to probe the final adjustment if it is larger than the
> guard
> +      that is assumed by the callee.  */
> ++  gcc_assert (known_eq (bytes_below_sp, final_adjust));
> +   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
> final_adjust,
> +                                         !frame_pointer_needed, true);
> + }
> +@@ -9855,7 +9856,6 @@ aarch64_expand_epilogue (bool for_sibcall)
> +   poly_int64 initial_adjust = frame.initial_adjust;
> +   HOST_WIDE_INT callee_adjust = frame.callee_adjust;
> +   poly_int64 final_adjust = frame.final_adjust;
> +-  poly_int64 callee_offset = frame.callee_offset;
> +   poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
> +   poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
> +   unsigned reg1 = frame.wb_pop_candidate1;
> +@@ -9925,9 +9925,9 @@ aarch64_expand_epilogue (bool for_sibcall)
> +
> +   /* Restore the vector registers before the predicate registers,
> +      so that we can use P4 as a temporary for big-endian SVE frames.  */
> +-  aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
> ++  aarch64_restore_callee_saves (final_adjust, V0_REGNUM, V31_REGNUM,
> +                               callee_adjust != 0, &cfi_ops);
> +-  aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
> ++  aarch64_restore_callee_saves (final_adjust, P0_REGNUM, P15_REGNUM,
> +                               false, &cfi_ops);
> +   if (maybe_ne (sve_callee_adjust, 0))
> +     aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
> +@@ -9935,7 +9935,7 @@ aarch64_expand_epilogue (bool for_sibcall)
> +   /* When shadow call stack is enabled, the scs_pop in the epilogue will
> +      restore x30, we don't need to restore x30 again in the traditional
> +      way.  */
> +-  aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
> ++  aarch64_restore_callee_saves (final_adjust + sve_callee_adjust,
> +                               R0_REGNUM, last_gpr,
> +                               callee_adjust != 0, &cfi_ops);
> +
> +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> +index de68ff7202f..94fca4b9471 100644
> +--- a/gcc/config/aarch64/aarch64.h
> ++++ b/gcc/config/aarch64/aarch64.h
> +@@ -907,10 +907,6 @@ struct GTY (()) aarch64_frame
> +      It is zero when no push is used.  */
> +   HOST_WIDE_INT callee_adjust;
> +
> +-  /* The offset from SP to the callee-save registers after
> initial_adjust.
> +-     It may be non-zero if no push is used (ie. callee_adjust == 0).  */
> +-  poly_int64 callee_offset;
> +-
> +   /* The size of the stack adjustment before saving or after restoring
> +      SVE registers.  */
> +   poly_int64 sve_callee_adjust;
> +--
> +2.34.1
> +
> +
> +From 2b983f9064d808daf909bde1d4a13980934a7e6e Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:51 +0100
> +Subject: [PATCH 07/19] aarch64: Only calculate chain_offset if there is a
> + chain
> +
> +After previous patches, it is no longer necessary to calculate
> +a chain_offset in cases where there is no chain record.
> +
> +gcc/
> +       * config/aarch64/aarch64.cc (aarch64_expand_prologue): Move the
> +       calculation of chain_offset into the emit_frame_chain block.
> +---
> + gcc/config/aarch64/aarch64.cc | 10 +++++-----
> + 1 file changed, 5 insertions(+), 5 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index e79551af41d..d71a042d611 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -9747,16 +9747,16 @@ aarch64_expand_prologue (void)
> +   if (callee_adjust != 0)
> +     aarch64_push_regs (reg1, reg2, callee_adjust);
> +
> +-  /* The offset of the frame chain record (if any) from the current SP.
> */
> +-  poly_int64 chain_offset = (initial_adjust + callee_adjust
> +-                           - frame.hard_fp_offset);
> +-  gcc_assert (known_ge (chain_offset, 0));
> +-
> +   /* The offset of the current SP from the bottom of the static frame.
> */
> +   poly_int64 bytes_below_sp = frame_size - initial_adjust -
> callee_adjust;
> +
> +   if (emit_frame_chain)
> +     {
> ++      /* The offset of the frame chain record (if any) from the current
> SP.  */
> ++      poly_int64 chain_offset = (initial_adjust + callee_adjust
> ++                               - frame.hard_fp_offset);
> ++      gcc_assert (known_ge (chain_offset, 0));
> ++
> +       if (callee_adjust == 0)
> +       {
> +         reg1 = R29_REGNUM;
> +--
> +2.34.1
> +
> +
> +From 0a0a824808d1dec51004fb5805c1a0ae2a35433f Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:51 +0100
> +Subject: [PATCH 08/19] aarch64: Rename locals_offset to bytes_above_locals
> +MIME-Version: 1.0
> +Content-Type: text/plain; charset=UTF-8
> +Content-Transfer-Encoding: 8bit
> +
> +locals_offset was described as:
> +
> +  /* Offset from the base of the frame (incomming SP) to the
> +     top of the locals area.  This value is always a multiple of
> +     STACK_BOUNDARY.  */
> +
> +This is implicitly an “upside down” view of the frame: the incoming
> +SP is at offset 0, and anything N bytes below the incoming SP is at
> +offset N (rather than -N).
> +
> +However, reg_offset instead uses a “right way up” view; that is,
> +it views offsets in address terms.  Something above X is at a
> +positive offset from X and something below X is at a negative
> +offset from X.
> +
> +Also, even on FRAME_GROWS_DOWNWARD targets like AArch64,
> +target-independent code views offsets in address terms too:
> +locals are allocated at negative offsets to virtual_stack_vars.
> +
> +It seems confusing to have *_offset fields of the same structure
> +using different polarities like this.  This patch tries to avoid
> +that by renaming locals_offset to bytes_above_locals.
> +
> +gcc/
> +       * config/aarch64/aarch64.h (aarch64_frame::locals_offset): Rename
> to...
> +       (aarch64_frame::bytes_above_locals): ...this.
> +       * config/aarch64/aarch64.cc (aarch64_layout_frame)
> +       (aarch64_initial_elimination_offset): Update accordingly.
> +---
> + gcc/config/aarch64/aarch64.cc | 6 +++---
> + gcc/config/aarch64/aarch64.h  | 6 +++---
> + 2 files changed, 6 insertions(+), 6 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index d71a042d611..d4ec352ba98 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8337,7 +8337,7 @@ aarch64_layout_frame (void)
> +                         STACK_BOUNDARY / BITS_PER_UNIT));
> +   frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs;
> +
> +-  frame.locals_offset = frame.saved_varargs_size;
> ++  frame.bytes_above_locals = frame.saved_varargs_size;
> +
> +   frame.initial_adjust = 0;
> +   frame.final_adjust = 0;
> +@@ -12578,13 +12578,13 @@ aarch64_initial_elimination_offset (unsigned
> from, unsigned to)
> +       return frame.hard_fp_offset;
> +
> +       if (from == FRAME_POINTER_REGNUM)
> +-      return frame.hard_fp_offset - frame.locals_offset;
> ++      return frame.hard_fp_offset - frame.bytes_above_locals;
> +     }
> +
> +   if (to == STACK_POINTER_REGNUM)
> +     {
> +       if (from == FRAME_POINTER_REGNUM)
> +-      return frame.frame_size - frame.locals_offset;
> ++      return frame.frame_size - frame.bytes_above_locals;
> +     }
> +
> +   return frame.frame_size;
> +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> +index 94fca4b9471..bf46e6124aa 100644
> +--- a/gcc/config/aarch64/aarch64.h
> ++++ b/gcc/config/aarch64/aarch64.h
> +@@ -885,10 +885,10 @@ struct GTY (()) aarch64_frame
> +      always a multiple of STACK_BOUNDARY.  */
> +   poly_int64 bytes_below_hard_fp;
> +
> +-  /* Offset from the base of the frame (incomming SP) to the
> +-     top of the locals area.  This value is always a multiple of
> ++  /* The number of bytes between the top of the locals area and the top
> ++     of the frame (the incoming SP).  This value is always a multiple of
> +      STACK_BOUNDARY.  */
> +-  poly_int64 locals_offset;
> ++  poly_int64 bytes_above_locals;
> +
> +   /* Offset from the base of the frame (incomming SP) to the
> +      hard_frame_pointer.  This value is always a multiple of
> +--
> +2.34.1
> +
> +
> +From 3fbf0789202b30a67b12e1fb785c7130f098d665 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:52 +0100
> +Subject: [PATCH 09/19] aarch64: Rename hard_fp_offset to
> bytes_above_hard_fp
> +MIME-Version: 1.0
> +Content-Type: text/plain; charset=UTF-8
> +Content-Transfer-Encoding: 8bit
> +
> +Similarly to the previous locals_offset patch, hard_fp_offset
> +was described as:
> +
> +  /* Offset from the base of the frame (incomming SP) to the
> +     hard_frame_pointer.  This value is always a multiple of
> +     STACK_BOUNDARY.  */
> +  poly_int64 hard_fp_offset;
> +
> +which again took an “upside-down” view: higher offsets meant lower
> +addresses.  This patch renames the field to bytes_above_hard_fp instead.
> +
> +gcc/
> +       * config/aarch64/aarch64.h (aarch64_frame::hard_fp_offset): Rename
> +       to...
> +       (aarch64_frame::bytes_above_hard_fp): ...this.
> +       * config/aarch64/aarch64.cc (aarch64_layout_frame)
> +       (aarch64_expand_prologue): Update accordingly.
> +       (aarch64_initial_elimination_offset): Likewise.
> +---
> + gcc/config/aarch64/aarch64.cc | 26 +++++++++++++-------------
> + gcc/config/aarch64/aarch64.h  |  6 +++---
> + 2 files changed, 16 insertions(+), 16 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index d4ec352ba98..3c4052740e7 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8329,7 +8329,7 @@ aarch64_layout_frame (void)
> +                          + get_frame_size (),
> +                          STACK_BOUNDARY / BITS_PER_UNIT);
> +
> +-  frame.hard_fp_offset
> ++  frame.bytes_above_hard_fp
> +     = saved_regs_and_above - frame.below_hard_fp_saved_regs_size;
> +
> +   /* Both these values are already aligned.  */
> +@@ -8378,13 +8378,13 @@ aarch64_layout_frame (void)
> +   else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
> +     max_push_offset = 256;
> +
> +-  HOST_WIDE_INT const_size, const_below_saved_regs, const_fp_offset;
> ++  HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
> +   HOST_WIDE_INT const_saved_regs_size;
> +   if (known_eq (frame.saved_regs_size, 0))
> +     frame.initial_adjust = frame.frame_size;
> +   else if (frame.frame_size.is_constant (&const_size)
> +          && const_size < max_push_offset
> +-         && known_eq (frame.hard_fp_offset, const_size))
> ++         && known_eq (frame.bytes_above_hard_fp, const_size))
> +     {
> +       /* Simple, small frame with no data below the saved registers.
> +
> +@@ -8401,8 +8401,8 @@ aarch64_layout_frame (void)
> +             case that it hardly seems worth the effort though.  */
> +          && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
> +          && !(cfun->calls_alloca
> +-              && frame.hard_fp_offset.is_constant (&const_fp_offset)
> +-              && const_fp_offset < max_push_offset))
> ++              && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
> ++              && const_above_fp < max_push_offset))
> +     {
> +       /* Frame with small area below the saved registers:
> +
> +@@ -8420,12 +8420,12 @@ aarch64_layout_frame (void)
> +        sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
> +        save SVE registers relative to SP
> +        sub sp, sp, bytes_below_saved_regs  */
> +-      frame.initial_adjust = (frame.hard_fp_offset
> ++      frame.initial_adjust = (frame.bytes_above_hard_fp
> +                             + frame.below_hard_fp_saved_regs_size);
> +       frame.final_adjust = frame.bytes_below_saved_regs;
> +     }
> +-  else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
> +-         && const_fp_offset < max_push_offset)
> ++  else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp)
> ++         && const_above_fp < max_push_offset)
> +     {
> +       /* Frame with large area below the saved registers, or with SVE
> saves,
> +        but with a small area above:
> +@@ -8435,7 +8435,7 @@ aarch64_layout_frame (void)
> +        [sub sp, sp, below_hard_fp_saved_regs_size]
> +        [save SVE registers relative to SP]
> +        sub sp, sp, bytes_below_saved_regs  */
> +-      frame.callee_adjust = const_fp_offset;
> ++      frame.callee_adjust = const_above_fp;
> +       frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
> +       frame.final_adjust = frame.bytes_below_saved_regs;
> +     }
> +@@ -8450,7 +8450,7 @@ aarch64_layout_frame (void)
> +        [sub sp, sp, below_hard_fp_saved_regs_size]
> +        [save SVE registers relative to SP]
> +        sub sp, sp, bytes_below_saved_regs  */
> +-      frame.initial_adjust = frame.hard_fp_offset;
> ++      frame.initial_adjust = frame.bytes_above_hard_fp;
> +       frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
> +       frame.final_adjust = frame.bytes_below_saved_regs;
> +     }
> +@@ -9754,7 +9754,7 @@ aarch64_expand_prologue (void)
> +     {
> +       /* The offset of the frame chain record (if any) from the current
> SP.  */
> +       poly_int64 chain_offset = (initial_adjust + callee_adjust
> +-                               - frame.hard_fp_offset);
> ++                               - frame.bytes_above_hard_fp);
> +       gcc_assert (known_ge (chain_offset, 0));
> +
> +       if (callee_adjust == 0)
> +@@ -12575,10 +12575,10 @@ aarch64_initial_elimination_offset (unsigned
> from, unsigned to)
> +   if (to == HARD_FRAME_POINTER_REGNUM)
> +     {
> +       if (from == ARG_POINTER_REGNUM)
> +-      return frame.hard_fp_offset;
> ++      return frame.bytes_above_hard_fp;
> +
> +       if (from == FRAME_POINTER_REGNUM)
> +-      return frame.hard_fp_offset - frame.bytes_above_locals;
> ++      return frame.bytes_above_hard_fp - frame.bytes_above_locals;
> +     }
> +
> +   if (to == STACK_POINTER_REGNUM)
> +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> +index bf46e6124aa..dd1f403f939 100644
> +--- a/gcc/config/aarch64/aarch64.h
> ++++ b/gcc/config/aarch64/aarch64.h
> +@@ -890,10 +890,10 @@ struct GTY (()) aarch64_frame
> +      STACK_BOUNDARY.  */
> +   poly_int64 bytes_above_locals;
> +
> +-  /* Offset from the base of the frame (incomming SP) to the
> +-     hard_frame_pointer.  This value is always a multiple of
> ++  /* The number of bytes between the hard_frame_pointer and the top of
> ++     the frame (the incomming SP).  This value is always a multiple of
> +      STACK_BOUNDARY.  */
> +-  poly_int64 hard_fp_offset;
> ++  poly_int64 bytes_above_hard_fp;
> +
> +   /* The size of the frame.  This value is the offset from base of the
> +      frame (incomming SP) to the stack_pointer.  This value is always
> +--
> +2.34.1
> +
> +
> +From aac8b31379ac3bbd14fc6427dce23f56e54e8485 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:52 +0100
> +Subject: [PATCH 10/19] aarch64: Tweak frame_size comment
> +MIME-Version: 1.0
> +Content-Type: text/plain; charset=UTF-8
> +Content-Transfer-Encoding: 8bit
> +
> +This patch fixes another case in which a value was described with
> +an “upside-down” view.
> +
> +gcc/
> +       * config/aarch64/aarch64.h (aarch64_frame::frame_size): Tweak
> comment.
> +---
> + gcc/config/aarch64/aarch64.h | 4 ++--
> + 1 file changed, 2 insertions(+), 2 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> +index dd1f403f939..700524ae22b 100644
> +--- a/gcc/config/aarch64/aarch64.h
> ++++ b/gcc/config/aarch64/aarch64.h
> +@@ -895,8 +895,8 @@ struct GTY (()) aarch64_frame
> +      STACK_BOUNDARY.  */
> +   poly_int64 bytes_above_hard_fp;
> +
> +-  /* The size of the frame.  This value is the offset from base of the
> +-     frame (incomming SP) to the stack_pointer.  This value is always
> ++  /* The size of the frame, i.e. the number of bytes between the bottom
> ++     of the outgoing arguments and the incoming SP.  This value is always
> +      a multiple of STACK_BOUNDARY.  */
> +   poly_int64 frame_size;
> +
> +--
> +2.34.1
> +
> +
> +From 8d5506a8aeb8dd7e8b209a3663b07688478f76b9 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:53 +0100
> +Subject: [PATCH 11/19] aarch64: Measure reg_offset from the bottom of the
> + frame
> +
> +reg_offset was measured from the bottom of the saved register area.
> +This made perfect sense with the original layout, since the bottom
> +of the saved register area was also the hard frame pointer address.
> +It became slightly less obvious with SVE, since we save SVE
> +registers below the hard frame pointer, but it still made sense.
> +
> +However, if we want to allow different frame layouts, it's more
> +convenient and obvious to measure reg_offset from the bottom of
> +the frame.  After previous patches, it's also a slight simplification
> +in its own right.
> +
> +gcc/
> +       * config/aarch64/aarch64.h (aarch64_frame): Add comment above
> +       reg_offset.
> +       * config/aarch64/aarch64.cc (aarch64_layout_frame): Walk offsets
> +       from the bottom of the frame, rather than the bottom of the saved
> +       register area.  Measure reg_offset from the bottom of the frame
> +       rather than the bottom of the saved register area.
> +       (aarch64_save_callee_saves): Update accordingly.
> +       (aarch64_restore_callee_saves): Likewise.
> +       (aarch64_get_separate_components): Likewise.
> +       (aarch64_process_components): Likewise.
> +---
> + gcc/config/aarch64/aarch64.cc | 53 ++++++++++++++++-------------------
> + gcc/config/aarch64/aarch64.h  |  3 ++
> + 2 files changed, 27 insertions(+), 29 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index 3c4052740e7..97dd077844b 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8139,7 +8139,6 @@ aarch64_needs_frame_chain (void)
> + static void
> + aarch64_layout_frame (void)
> + {
> +-  poly_int64 offset = 0;
> +   int regno, last_fp_reg = INVALID_REGNUM;
> +   machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
> +   poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
> +@@ -8217,7 +8216,9 @@ aarch64_layout_frame (void)
> +   gcc_assert (crtl->is_leaf
> +             || maybe_ne (frame.reg_offset[R30_REGNUM],
> SLOT_NOT_REQUIRED));
> +
> +-  frame.bytes_below_saved_regs = crtl->outgoing_args_size;
> ++  poly_int64 offset = crtl->outgoing_args_size;
> ++  gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
> ++  frame.bytes_below_saved_regs = offset;
> +
> +   /* Now assign stack slots for the registers.  Start with the predicate
> +      registers, since predicate LDR and STR have a relatively small
> +@@ -8229,7 +8230,8 @@ aarch64_layout_frame (void)
> +       offset += BYTES_PER_SVE_PRED;
> +       }
> +
> +-  if (maybe_ne (offset, 0))
> ++  poly_int64 saved_prs_size = offset - frame.bytes_below_saved_regs;
> ++  if (maybe_ne (saved_prs_size, 0))
> +     {
> +       /* If we have any vector registers to save above the predicate
> registers,
> +        the offset of the vector register save slots need to be a multiple
> +@@ -8247,10 +8249,10 @@ aarch64_layout_frame (void)
> +       offset = aligned_upper_bound (offset, STACK_BOUNDARY /
> BITS_PER_UNIT);
> +       else
> +       {
> +-        if (known_le (offset, vector_save_size))
> +-          offset = vector_save_size;
> +-        else if (known_le (offset, vector_save_size * 2))
> +-          offset = vector_save_size * 2;
> ++        if (known_le (saved_prs_size, vector_save_size))
> ++          offset = frame.bytes_below_saved_regs + vector_save_size;
> ++        else if (known_le (saved_prs_size, vector_save_size * 2))
> ++          offset = frame.bytes_below_saved_regs + vector_save_size * 2;
> +         else
> +           gcc_unreachable ();
> +       }
> +@@ -8267,9 +8269,10 @@ aarch64_layout_frame (void)
> +
> +   /* OFFSET is now the offset of the hard frame pointer from the bottom
> +      of the callee save area.  */
> +-  bool saves_below_hard_fp_p = maybe_ne (offset, 0);
> +-  frame.below_hard_fp_saved_regs_size = offset;
> +-  frame.bytes_below_hard_fp = offset + frame.bytes_below_saved_regs;
> ++  frame.below_hard_fp_saved_regs_size = offset -
> frame.bytes_below_saved_regs;
> ++  bool saves_below_hard_fp_p
> ++    = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
> ++  frame.bytes_below_hard_fp = offset;
> +   if (frame.emit_frame_chain)
> +     {
> +       /* FP and LR are placed in the linkage record.  */
> +@@ -8320,9 +8323,10 @@ aarch64_layout_frame (void)
> +
> +   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
> +
> +-  frame.saved_regs_size = offset;
> ++  frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
> +
> +-  poly_int64 varargs_and_saved_regs_size = offset +
> frame.saved_varargs_size;
> ++  poly_int64 varargs_and_saved_regs_size
> ++    = frame.saved_regs_size + frame.saved_varargs_size;
> +
> +   poly_int64 saved_regs_and_above
> +     = aligned_upper_bound (varargs_and_saved_regs_size
> +@@ -8790,9 +8794,7 @@ aarch64_save_callee_saves (poly_int64
> bytes_below_sp,
> +
> +       machine_mode mode = aarch64_reg_save_mode (regno);
> +       reg = gen_rtx_REG (mode, regno);
> +-      offset = (frame.reg_offset[regno]
> +-              + frame.bytes_below_saved_regs
> +-              - bytes_below_sp);
> ++      offset = frame.reg_offset[regno] - bytes_below_sp;
> +       rtx base_rtx = stack_pointer_rtx;
> +       poly_int64 sp_offset = offset;
> +
> +@@ -8899,9 +8901,7 @@ aarch64_restore_callee_saves (poly_int64
> bytes_below_sp, unsigned start,
> +
> +       machine_mode mode = aarch64_reg_save_mode (regno);
> +       reg = gen_rtx_REG (mode, regno);
> +-      offset = (frame.reg_offset[regno]
> +-              + frame.bytes_below_saved_regs
> +-              - bytes_below_sp);
> ++      offset = frame.reg_offset[regno] - bytes_below_sp;
> +       rtx base_rtx = stack_pointer_rtx;
> +       if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
> +       aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
> +@@ -9040,14 +9040,12 @@ aarch64_get_separate_components (void)
> +          it as a stack probe for -fstack-clash-protection.  */
> +       if (flag_stack_clash_protection
> +           && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
> +-          && known_eq (offset, 0))
> ++          && known_eq (offset, frame.bytes_below_saved_regs))
> +         continue;
> +
> +       /* Get the offset relative to the register we'll use.  */
> +       if (frame_pointer_needed)
> +-        offset -= frame.below_hard_fp_saved_regs_size;
> +-      else
> +-        offset += frame.bytes_below_saved_regs;
> ++        offset -= frame.bytes_below_hard_fp;
> +
> +       /* Check that we can access the stack slot of the register with one
> +          direct load with no adjustments needed.  */
> +@@ -9194,9 +9192,7 @@ aarch64_process_components (sbitmap components,
> bool prologue_p)
> +       rtx reg = gen_rtx_REG (mode, regno);
> +       poly_int64 offset = frame.reg_offset[regno];
> +       if (frame_pointer_needed)
> +-      offset -= frame.below_hard_fp_saved_regs_size;
> +-      else
> +-      offset += frame.bytes_below_saved_regs;
> ++      offset -= frame.bytes_below_hard_fp;
> +
> +       rtx addr = plus_constant (Pmode, ptr_reg, offset);
> +       rtx mem = gen_frame_mem (mode, addr);
> +@@ -9248,9 +9244,7 @@ aarch64_process_components (sbitmap components,
> bool prologue_p)
> +       /* REGNO2 can be saved/restored in a pair with REGNO.  */
> +       rtx reg2 = gen_rtx_REG (mode, regno2);
> +       if (frame_pointer_needed)
> +-      offset2 -= frame.below_hard_fp_saved_regs_size;
> +-      else
> +-      offset2 += frame.bytes_below_saved_regs;
> ++      offset2 -= frame.bytes_below_hard_fp;
> +       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
> +       rtx mem2 = gen_frame_mem (mode, addr2);
> +       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
> +@@ -9366,7 +9360,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1,
> rtx temp2,
> +   if (final_adjustment_p
> +       && known_eq (frame.below_hard_fp_saved_regs_size, 0))
> +     {
> +-      poly_int64 lr_offset = frame.reg_offset[LR_REGNUM];
> ++      poly_int64 lr_offset = (frame.reg_offset[LR_REGNUM]
> ++                            - frame.bytes_below_saved_regs);
> +       if (known_ge (lr_offset, 0))
> +       min_probe_threshold -= lr_offset.to_constant ();
> +       else
> +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> +index 700524ae22b..b6135837073 100644
> +--- a/gcc/config/aarch64/aarch64.h
> ++++ b/gcc/config/aarch64/aarch64.h
> +@@ -860,6 +860,9 @@ extern enum aarch64_processor aarch64_tune;
> + #ifdef HAVE_POLY_INT_H
> + struct GTY (()) aarch64_frame
> + {
> ++  /* The offset from the bottom of the static frame (the bottom of the
> ++     outgoing arguments) of each register save slot, or -2 if no save is
> ++     needed.  */
> +   poly_int64 reg_offset[LAST_SAVED_REGNUM + 1];
> +
> +   /* The number of extra stack bytes taken up by register varargs.
> +--
> +2.34.1
> +
> +
> +From b47766614df3b9df878262efb2ad73aaac108363 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:53 +0100
> +Subject: [PATCH 12/19] aarch64: Simplify top of frame allocation
> +
> +After previous patches, it no longer really makes sense to allocate
> +the top of the frame in terms of varargs_and_saved_regs_size and
> +saved_regs_and_above.
> +
> +gcc/
> +       * config/aarch64/aarch64.cc (aarch64_layout_frame): Simplify
> +       the allocation of the top of the frame.
> +---
> + gcc/config/aarch64/aarch64.cc | 23 ++++++++---------------
> + 1 file changed, 8 insertions(+), 15 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index 97dd077844b..81935852d5b 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8325,23 +8325,16 @@ aarch64_layout_frame (void)
> +
> +   frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
> +
> +-  poly_int64 varargs_and_saved_regs_size
> +-    = frame.saved_regs_size + frame.saved_varargs_size;
> +-
> +-  poly_int64 saved_regs_and_above
> +-    = aligned_upper_bound (varargs_and_saved_regs_size
> +-                         + get_frame_size (),
> +-                         STACK_BOUNDARY / BITS_PER_UNIT);
> +-
> +-  frame.bytes_above_hard_fp
> +-    = saved_regs_and_above - frame.below_hard_fp_saved_regs_size;
> ++  offset += get_frame_size ();
> ++  offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
> ++  auto top_of_locals = offset;
> +
> +-  /* Both these values are already aligned.  */
> +-  gcc_assert (multiple_p (frame.bytes_below_saved_regs,
> +-                        STACK_BOUNDARY / BITS_PER_UNIT));
> +-  frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs;
> ++  offset += frame.saved_varargs_size;
> ++  gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
> ++  frame.frame_size = offset;
> +
> +-  frame.bytes_above_locals = frame.saved_varargs_size;
> ++  frame.bytes_above_hard_fp = frame.frame_size -
> frame.bytes_below_hard_fp;
> ++  frame.bytes_above_locals = frame.frame_size - top_of_locals;
> +
> +   frame.initial_adjust = 0;
> +   frame.final_adjust = 0;
> +--
> +2.34.1
> +
> +
> +From 08f71b4bb28fb74d20e8d2927a557e8119ce9f4d Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:54 +0100
> +Subject: [PATCH 13/19] aarch64: Minor initial adjustment tweak
> +
> +This patch just changes a calculation of initial_adjust
> +to one that makes it slightly more obvious that the total
> +adjustment is frame.frame_size.
> +
> +gcc/
> +       * config/aarch64/aarch64.cc (aarch64_layout_frame): Tweak
> +       calculation of initial_adjust for frames in which all saves
> +       are SVE saves.
> +---
> + gcc/config/aarch64/aarch64.cc | 5 ++---
> + 1 file changed, 2 insertions(+), 3 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index 81935852d5b..4d9fcf3d162 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8414,11 +8414,10 @@ aarch64_layout_frame (void)
> +     {
> +       /* Frame in which all saves are SVE saves:
> +
> +-       sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
> ++       sub sp, sp, frame_size - bytes_below_saved_regs
> +        save SVE registers relative to SP
> +        sub sp, sp, bytes_below_saved_regs  */
> +-      frame.initial_adjust = (frame.bytes_above_hard_fp
> +-                            + frame.below_hard_fp_saved_regs_size);
> ++      frame.initial_adjust = frame.frame_size -
> frame.bytes_below_saved_regs;
> +       frame.final_adjust = frame.bytes_below_saved_regs;
> +     }
> +   else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp)
> +--
> +2.34.1
> +
> +
> +From f22315d5c19e8310e4dc880fd509678fd291fca8 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:54 +0100
> +Subject: [PATCH 14/19] aarch64: Tweak stack clash boundary condition
> +
> +The AArch64 ABI says that, when stack clash protection is used,
> +there can be a maximum of 1KiB of unprobed space at sp on entry
> +to a function.  Therefore, we need to probe when allocating
> +>= guard_size - 1KiB of data (>= rather than >).  This is what
> +GCC does.
> +
> +If an allocation is exactly guard_size bytes, it is enough to allocate
> +those bytes and probe once at offset 1024.  It isn't possible to use a
> +single probe at any other offset: higher would complicate later code,
> +by leaving more unprobed space than usual, while lower would risk
> +leaving an entire page unprobed.  For simplicity, the code probes all
> +allocations at offset 1024.
> +
> +Some register saves also act as probes.  If we need to allocate
> +more space below the last such register save probe, we need to
> +probe the allocation if it is > 1KiB.  Again, this allocation is
> +then sometimes (but not always) probed at offset 1024.  This sort of
> +allocation is currently only used for outgoing arguments, which are
> +rarely this big.
> +
> +However, the code also probed if this final outgoing-arguments
> +allocation was == 1KiB, rather than just > 1KiB.  This isn't
> +necessary, since the register save then probes at offset 1024
> +as required.  Continuing to probe allocations of exactly 1KiB
> +would complicate later patches.
> +
> +gcc/
> +       * config/aarch64/aarch64.cc
> (aarch64_allocate_and_probe_stack_space):
> +       Don't probe final allocations that are exactly 1KiB in size (after
> +       unprobed space above the final allocation has been deducted).
> +
> +gcc/testsuite/
> +       * gcc.target/aarch64/stack-check-prologue-17.c: New test.
> +---
> + gcc/config/aarch64/aarch64.cc                 |  4 +-
> + .../aarch64/stack-check-prologue-17.c         | 55 +++++++++++++++++++
> + 2 files changed, 58 insertions(+), 1 deletion(-)
> + create mode 100644
> gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index 4d9fcf3d162..34c1d8614cd 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -9333,9 +9333,11 @@ aarch64_allocate_and_probe_stack_space (rtx temp1,
> rtx temp2,
> +   HOST_WIDE_INT guard_size
> +     = 1 << param_stack_clash_protection_guard_size;
> +   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
> ++  HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT;
> ++  gcc_assert (multiple_p (poly_size, byte_sp_alignment));
> +   HOST_WIDE_INT min_probe_threshold
> +     = (final_adjustment_p
> +-       ? guard_used_by_caller
> ++       ? guard_used_by_caller + byte_sp_alignment
> +        : guard_size - guard_used_by_caller);
> +   /* When doing the final adjustment for the outgoing arguments, take
> into
> +      account any unprobed space there is above the current SP.  There are
> +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
> b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
> +new file mode 100644
> +index 00000000000..0d8a25d73a2
> +--- /dev/null
> ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
> +@@ -0,0 +1,55 @@
> ++/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer
> --param stack-clash-protection-guard-size=12" } */
> ++/* { dg-final { check-function-bodies "**" "" } } */
> ++
> ++void f(int, ...);
> ++void g();
> ++
> ++/*
> ++** test1:
> ++**    ...
> ++**    str     x30, \[sp\]
> ++**    sub     sp, sp, #1024
> ++**    cbnz    w0, .*
> ++**    bl      g
> ++**    ...
> ++*/
> ++int test1(int z) {
> ++  __uint128_t x = 0;
> ++  int y[0x400];
> ++  if (z)
> ++    {
> ++      f(0, 0, 0, 0, 0, 0, 0, &y,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
> ++    }
> ++  g();
> ++  return 1;
> ++}
> ++
> ++/*
> ++** test2:
> ++**    ...
> ++**    str     x30, \[sp\]
> ++**    sub     sp, sp, #1040
> ++**    str     xzr, \[sp\]
> ++**    cbnz    w0, .*
> ++**    bl      g
> ++**    ...
> ++*/
> ++int test2(int z) {
> ++  __uint128_t x = 0;
> ++  int y[0x400];
> ++  if (z)
> ++    {
> ++      f(0, 0, 0, 0, 0, 0, 0, &y,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x);
> ++    }
> ++  g();
> ++  return 1;
> ++}
> +--
> +2.34.1
> +
> +
> +From 15e18831bf98fd25af098b970ebf0c9a6200a34b Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:55 +0100
> +Subject: [PATCH 15/19] aarch64: Put LR save probe in first 16 bytes
> +
> +-fstack-clash-protection uses the save of LR as a probe for the next
> +allocation.  The next allocation could be:
> +
> +* another part of the static frame, e.g. when allocating SVE save slots
> +  or outgoing arguments
> +
> +* an alloca in the same function
> +
> +* an allocation made by a callee function
> +
> +However, when -fomit-frame-pointer is used, the LR save slot is placed
> +above the other GPR save slots.  It could therefore be up to 80 bytes
> +above the base of the GPR save area (which is also the hard fp address).
> +
> +aarch64_allocate_and_probe_stack_space took this into account when
> +deciding how much subsequent space could be allocated without needing
> +a probe.  However, it interacted badly with:
> +
> +      /* If doing a small final adjustment, we always probe at offset 0.
> +        This is done to avoid issues when LR is not at position 0 or when
> +        the final adjustment is smaller than the probing offset.  */
> +      else if (final_adjustment_p && rounded_size == 0)
> +       residual_probe_offset = 0;
> +
> +which forces any allocation that is smaller than the guard page size
> +to be probed at offset 0 rather than the usual offset 1024.  It was
> +therefore possible to construct cases in which we had:
> +
> +* a probe using LR at SP + 80 bytes (or some other value >= 16)
> +* an allocation of the guard page size - 16 bytes
> +* a probe at SP + 0
> +
> +which allocates guard page size + 64 consecutive unprobed bytes.
> +
> +This patch requires the LR probe to be in the first 16 bytes of the
> +save area when stack clash protection is active.  Doing it
> +unconditionally would cause code-quality regressions.
> +
> +Putting LR before other registers prevents push/pop allocation
> +when shadow call stacks are enabled, since LR is restored
> +separately from the other callee-saved registers.
> +
> +The new comment doesn't say that the probe register is required
> +to be LR, since a later patch removes that restriction.
> +
> +gcc/
> +       * config/aarch64/aarch64.cc (aarch64_layout_frame): Ensure that
> +       the LR save slot is in the first 16 bytes of the register save
> area.
> +       Only form STP/LDP push/pop candidates if both registers are valid.
> +       (aarch64_allocate_and_probe_stack_space): Remove workaround for
> +       when LR was not in the first 16 bytes.
> +
> +gcc/testsuite/
> +       * gcc.target/aarch64/stack-check-prologue-18.c: New test.
> +       * gcc.target/aarch64/stack-check-prologue-19.c: Likewise.
> +       * gcc.target/aarch64/stack-check-prologue-20.c: Likewise.
> +---
> + gcc/config/aarch64/aarch64.cc                 |  72 ++++++-------
> + .../aarch64/stack-check-prologue-18.c         | 100 ++++++++++++++++++
> + .../aarch64/stack-check-prologue-19.c         | 100 ++++++++++++++++++
> + .../aarch64/stack-check-prologue-20.c         |   3 +
> + 4 files changed, 233 insertions(+), 42 deletions(-)
> + create mode 100644
> gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
> + create mode 100644
> gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
> + create mode 100644
> gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index 34c1d8614cd..16433fb70f4 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8273,26 +8273,34 @@ aarch64_layout_frame (void)
> +   bool saves_below_hard_fp_p
> +     = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
> +   frame.bytes_below_hard_fp = offset;
> ++
> ++  auto allocate_gpr_slot = [&](unsigned int regno)
> ++    {
> ++      frame.reg_offset[regno] = offset;
> ++      if (frame.wb_push_candidate1 == INVALID_REGNUM)
> ++      frame.wb_push_candidate1 = regno;
> ++      else if (frame.wb_push_candidate2 == INVALID_REGNUM)
> ++      frame.wb_push_candidate2 = regno;
> ++      offset += UNITS_PER_WORD;
> ++    };
> ++
> +   if (frame.emit_frame_chain)
> +     {
> +       /* FP and LR are placed in the linkage record.  */
> +-      frame.reg_offset[R29_REGNUM] = offset;
> +-      frame.wb_push_candidate1 = R29_REGNUM;
> +-      frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
> +-      frame.wb_push_candidate2 = R30_REGNUM;
> +-      offset += 2 * UNITS_PER_WORD;
> ++      allocate_gpr_slot (R29_REGNUM);
> ++      allocate_gpr_slot (R30_REGNUM);
> +     }
> ++  else if (flag_stack_clash_protection
> ++         && known_eq (frame.reg_offset[R30_REGNUM], SLOT_REQUIRED))
> ++    /* Put the LR save slot first, since it makes a good choice of probe
> ++       for stack clash purposes.  The idea is that the link register
> usually
> ++       has to be saved before a call anyway, and so we lose little by
> ++       stopping it from being individually shrink-wrapped.  */
> ++    allocate_gpr_slot (R30_REGNUM);
> +
> +   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
> +     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
> +-      {
> +-      frame.reg_offset[regno] = offset;
> +-      if (frame.wb_push_candidate1 == INVALID_REGNUM)
> +-        frame.wb_push_candidate1 = regno;
> +-      else if (frame.wb_push_candidate2 == INVALID_REGNUM)
> +-        frame.wb_push_candidate2 = regno;
> +-      offset += UNITS_PER_WORD;
> +-      }
> ++      allocate_gpr_slot (regno);
> +
> +   poly_int64 max_int_offset = offset;
> +   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
> +@@ -8370,10 +8378,13 @@ aarch64_layout_frame (void)
> +      max_push_offset to 0, because no registers are popped at this time,
> +      so callee_adjust cannot be adjusted.  */
> +   HOST_WIDE_INT max_push_offset = 0;
> +-  if (frame.wb_pop_candidate2 != INVALID_REGNUM)
> +-    max_push_offset = 512;
> +-  else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
> +-    max_push_offset = 256;
> ++  if (frame.wb_pop_candidate1 != INVALID_REGNUM)
> ++    {
> ++      if (frame.wb_pop_candidate2 != INVALID_REGNUM)
> ++      max_push_offset = 512;
> ++      else
> ++      max_push_offset = 256;
> ++    }
> +
> +   HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
> +   HOST_WIDE_INT const_saved_regs_size;
> +@@ -9339,29 +9350,6 @@ aarch64_allocate_and_probe_stack_space (rtx temp1,
> rtx temp2,
> +     = (final_adjustment_p
> +        ? guard_used_by_caller + byte_sp_alignment
> +        : guard_size - guard_used_by_caller);
> +-  /* When doing the final adjustment for the outgoing arguments, take
> into
> +-     account any unprobed space there is above the current SP.  There are
> +-     two cases:
> +-
> +-     - When saving SVE registers below the hard frame pointer, we force
> +-       the lowest save to take place in the prologue before doing the
> final
> +-       adjustment (i.e. we don't allow the save to be shrink-wrapped).
> +-       This acts as a probe at SP, so there is no unprobed space.
> +-
> +-     - When there are no SVE register saves, we use the store of the link
> +-       register as a probe.  We can't assume that LR was saved at
> position 0
> +-       though, so treat any space below it as unprobed.  */
> +-  if (final_adjustment_p
> +-      && known_eq (frame.below_hard_fp_saved_regs_size, 0))
> +-    {
> +-      poly_int64 lr_offset = (frame.reg_offset[LR_REGNUM]
> +-                            - frame.bytes_below_saved_regs);
> +-      if (known_ge (lr_offset, 0))
> +-      min_probe_threshold -= lr_offset.to_constant ();
> +-      else
> +-      gcc_assert (!flag_stack_clash_protection || known_eq (poly_size,
> 0));
> +-    }
> +-
> +   poly_int64 frame_size = frame.frame_size;
> +
> +   /* We should always have a positive probe threshold.  */
> +@@ -9541,8 +9529,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1,
> rtx temp2,
> +       if (final_adjustment_p && rounded_size != 0)
> +       min_probe_threshold = 0;
> +       /* If doing a small final adjustment, we always probe at offset 0.
> +-       This is done to avoid issues when LR is not at position 0 or when
> +-       the final adjustment is smaller than the probing offset.  */
> ++       This is done to avoid issues when the final adjustment is smaller
> ++       than the probing offset.  */
> +       else if (final_adjustment_p && rounded_size == 0)
> +       residual_probe_offset = 0;
> +
> +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
> b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
> +new file mode 100644
> +index 00000000000..82447d20fff
> +--- /dev/null
> ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
> +@@ -0,0 +1,100 @@
> ++/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer
> --param stack-clash-protection-guard-size=12" } */
> ++/* { dg-final { check-function-bodies "**" "" } } */
> ++
> ++void f(int, ...);
> ++void g();
> ++
> ++/*
> ++** test1:
> ++**    ...
> ++**    str     x30, \[sp\]
> ++**    sub     sp, sp, #4064
> ++**    str     xzr, \[sp\]
> ++**    cbnz    w0, .*
> ++**    bl      g
> ++**    ...
> ++**    str     x26, \[sp, #?4128\]
> ++**    ...
> ++*/
> ++int test1(int z) {
> ++  __uint128_t x = 0;
> ++  int y[0x400];
> ++  if (z)
> ++    {
> ++      asm volatile ("" :::
> ++                  "x19", "x20", "x21", "x22", "x23", "x24", "x25",
> "x26");
> ++      f(0, 0, 0, 0, 0, 0, 0, &y,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x);
> ++    }
> ++  g();
> ++  return 1;
> ++}
> ++
> ++/*
> ++** test2:
> ++**    ...
> ++**    str     x30, \[sp\]
> ++**    sub     sp, sp, #1040
> ++**    str     xzr, \[sp\]
> ++**    cbnz    w0, .*
> ++**    bl      g
> ++**    ...
> ++*/
> ++int test2(int z) {
> ++  __uint128_t x = 0;
> ++  int y[0x400];
> ++  if (z)
> ++    {
> ++      asm volatile ("" :::
> ++                  "x19", "x20", "x21", "x22", "x23", "x24", "x25",
> "x26");
> ++      f(0, 0, 0, 0, 0, 0, 0, &y,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x);
> ++    }
> ++  g();
> ++  return 1;
> ++}
> ++
> ++/*
> ++** test3:
> ++**    ...
> ++**    str     x30, \[sp\]
> ++**    sub     sp, sp, #1024
> ++**    cbnz    w0, .*
> ++**    bl      g
> ++**    ...
> ++*/
> ++int test3(int z) {
> ++  __uint128_t x = 0;
> ++  int y[0x400];
> ++  if (z)
> ++    {
> ++      asm volatile ("" :::
> ++                  "x19", "x20", "x21", "x22", "x23", "x24", "x25",
> "x26");
> ++      f(0, 0, 0, 0, 0, 0, 0, &y,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
> ++    }
> ++  g();
> ++  return 1;
> ++}
> +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
> b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
> +new file mode 100644
> +index 00000000000..73ac3e4e4eb
> +--- /dev/null
> ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
> +@@ -0,0 +1,100 @@
> ++/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer
> --param stack-clash-protection-guard-size=12 -fsanitize=shadow-call-stack
> -ffixed-x18" } */
> ++/* { dg-final { check-function-bodies "**" "" } } */
> ++
> ++void f(int, ...);
> ++void g();
> ++
> ++/*
> ++** test1:
> ++**    ...
> ++**    str     x30, \[sp\]
> ++**    sub     sp, sp, #4064
> ++**    str     xzr, \[sp\]
> ++**    cbnz    w0, .*
> ++**    bl      g
> ++**    ...
> ++**    str     x26, \[sp, #?4128\]
> ++**    ...
> ++*/
> ++int test1(int z) {
> ++  __uint128_t x = 0;
> ++  int y[0x400];
> ++  if (z)
> ++    {
> ++      asm volatile ("" :::
> ++                  "x19", "x20", "x21", "x22", "x23", "x24", "x25",
> "x26");
> ++      f(0, 0, 0, 0, 0, 0, 0, &y,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x);
> ++    }
> ++  g();
> ++  return 1;
> ++}
> ++
> ++/*
> ++** test2:
> ++**    ...
> ++**    str     x30, \[sp\]
> ++**    sub     sp, sp, #1040
> ++**    str     xzr, \[sp\]
> ++**    cbnz    w0, .*
> ++**    bl      g
> ++**    ...
> ++*/
> ++int test2(int z) {
> ++  __uint128_t x = 0;
> ++  int y[0x400];
> ++  if (z)
> ++    {
> ++      asm volatile ("" :::
> ++                  "x19", "x20", "x21", "x22", "x23", "x24", "x25",
> "x26");
> ++      f(0, 0, 0, 0, 0, 0, 0, &y,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x);
> ++    }
> ++  g();
> ++  return 1;
> ++}
> ++
> ++/*
> ++** test3:
> ++**    ...
> ++**    str     x30, \[sp\]
> ++**    sub     sp, sp, #1024
> ++**    cbnz    w0, .*
> ++**    bl      g
> ++**    ...
> ++*/
> ++int test3(int z) {
> ++  __uint128_t x = 0;
> ++  int y[0x400];
> ++  if (z)
> ++    {
> ++      asm volatile ("" :::
> ++                  "x19", "x20", "x21", "x22", "x23", "x24", "x25",
> "x26");
> ++      f(0, 0, 0, 0, 0, 0, 0, &y,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++      x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
> ++    }
> ++  g();
> ++  return 1;
> ++}
> +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c
> b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c
> +new file mode 100644
> +index 00000000000..690aae8dfd5
> +--- /dev/null
> ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c
> +@@ -0,0 +1,3 @@
> ++/* { dg-options "-O2 -fstack-protector-all -fstack-clash-protection
> -fomit-frame-pointer --param stack-clash-protection-guard-size=12
> -fsanitize=shadow-call-stack -ffixed-x18" } */
> ++
> ++#include "stack-check-prologue-19.c"
> +--
> +2.34.1
> +
> +
> +From c4f0e121faa36342f1d21919e54a05ad841c4f86 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:55 +0100
> +Subject: [PATCH 16/19] aarch64: Simplify probe of final frame allocation
> +
> +Previous patches ensured that the final frame allocation only needs
> +a probe when the size is strictly greater than 1KiB.  It's therefore
> +safe to use the normal 1024 probe offset in all cases.
> +
> +The main motivation for doing this is to simplify the code and
> +remove the number of special cases.
> +
> +gcc/
> +       * config/aarch64/aarch64.cc
> (aarch64_allocate_and_probe_stack_space):
> +       Always probe the residual allocation at offset 1024, asserting
> +       that that is in range.
> +
> +gcc/testsuite/
> +       * gcc.target/aarch64/stack-check-prologue-17.c: Expect the probe
> +       to be at offset 1024 rather than offset 0.
> +       * gcc.target/aarch64/stack-check-prologue-18.c: Likewise.
> +       * gcc.target/aarch64/stack-check-prologue-19.c: Likewise.
> +---
> + gcc/config/aarch64/aarch64.cc                        | 12 ++++--------
> + .../gcc.target/aarch64/stack-check-prologue-17.c     |  2 +-
> + .../gcc.target/aarch64/stack-check-prologue-18.c     |  4 ++--
> + .../gcc.target/aarch64/stack-check-prologue-19.c     |  4 ++--
> + 4 files changed, 9 insertions(+), 13 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index 16433fb70f4..8abf3d7a1e2 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -9523,16 +9523,12 @@ aarch64_allocate_and_probe_stack_space (rtx
> temp1, rtx temp2,
> +      are still safe.  */
> +   if (residual)
> +     {
> +-      HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
> ++      gcc_assert (guard_used_by_caller + byte_sp_alignment <= size);
> ++
> +       /* If we're doing final adjustments, and we've done any full page
> +        allocations then any residual needs to be probed.  */
> +       if (final_adjustment_p && rounded_size != 0)
> +       min_probe_threshold = 0;
> +-      /* If doing a small final adjustment, we always probe at offset 0.
> +-       This is done to avoid issues when the final adjustment is smaller
> +-       than the probing offset.  */
> +-      else if (final_adjustment_p && rounded_size == 0)
> +-      residual_probe_offset = 0;
> +
> +       aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
> +       if (residual >= min_probe_threshold)
> +@@ -9543,8 +9539,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1,
> rtx temp2,
> +                    HOST_WIDE_INT_PRINT_DEC " bytes, probing will be
> required."
> +                    "\n", residual);
> +
> +-          emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
> +-                                           residual_probe_offset));
> ++        emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
> ++                                         guard_used_by_caller));
> +         emit_insn (gen_blockage ());
> +       }
> +     }
> +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
> b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
> +index 0d8a25d73a2..f0ec1389771 100644
> +--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
> ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
> +@@ -33,7 +33,7 @@ int test1(int z) {
> + **    ...
> + **    str     x30, \[sp\]
> + **    sub     sp, sp, #1040
> +-**    str     xzr, \[sp\]
> ++**    str     xzr, \[sp, #?1024\]
> + **    cbnz    w0, .*
> + **    bl      g
> + **    ...
> +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
> b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
> +index 82447d20fff..6383bec5ebc 100644
> +--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
> ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
> +@@ -9,7 +9,7 @@ void g();
> + **    ...
> + **    str     x30, \[sp\]
> + **    sub     sp, sp, #4064
> +-**    str     xzr, \[sp\]
> ++**    str     xzr, \[sp, #?1024\]
> + **    cbnz    w0, .*
> + **    bl      g
> + **    ...
> +@@ -50,7 +50,7 @@ int test1(int z) {
> + **    ...
> + **    str     x30, \[sp\]
> + **    sub     sp, sp, #1040
> +-**    str     xzr, \[sp\]
> ++**    str     xzr, \[sp, #?1024\]
> + **    cbnz    w0, .*
> + **    bl      g
> + **    ...
> +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
> b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
> +index 73ac3e4e4eb..562039b5e9b 100644
> +--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
> ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
> +@@ -9,7 +9,7 @@ void g();
> + **    ...
> + **    str     x30, \[sp\]
> + **    sub     sp, sp, #4064
> +-**    str     xzr, \[sp\]
> ++**    str     xzr, \[sp, #?1024\]
> + **    cbnz    w0, .*
> + **    bl      g
> + **    ...
> +@@ -50,7 +50,7 @@ int test1(int z) {
> + **    ...
> + **    str     x30, \[sp\]
> + **    sub     sp, sp, #1040
> +-**    str     xzr, \[sp\]
> ++**    str     xzr, \[sp, #?1024\]
> + **    cbnz    w0, .*
> + **    bl      g
> + **    ...
> +--
> +2.34.1
> +
> +
> +From 6f0ab0a9f46a17b68349ff6035aa776bf65f0575 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:56 +0100
> +Subject: [PATCH 17/19] aarch64: Explicitly record probe registers in frame
> + info
> +
> +The stack frame is currently divided into three areas:
> +
> +A: the area above the hard frame pointer
> +B: the SVE saves below the hard frame pointer
> +C: the outgoing arguments
> +
> +If the stack frame is allocated in one chunk, the allocation needs a
> +probe if the frame size is >= guard_size - 1KiB.  In addition, if the
> +function is not a leaf function, it must probe an address no more than
> +1KiB above the outgoing SP.  We ensured the second condition by
> +
> +(1) using single-chunk allocations for non-leaf functions only if
> +    the link register save slot is within 512 bytes of the bottom
> +    of the frame; and
> +
> +(2) using the link register save as a probe (meaning, for instance,
> +    that it can't be individually shrink wrapped)
> +
> +If instead the stack is allocated in multiple chunks, then:
> +
> +* an allocation involving only the outgoing arguments (C above) requires
> +  a probe if the allocation size is > 1KiB
> +
> +* any other allocation requires a probe if the allocation size
> +  is >= guard_size - 1KiB
> +
> +* second and subsequent allocations require the previous allocation
> +  to probe at the bottom of the allocated area, regardless of the size
> +  of that previous allocation
> +
> +The final point means that, unlike for single allocations,
> +it can be necessary to have both a non-SVE register probe and
> +an SVE register probe.  For example:
> +
> +* allocate A, probe using a non-SVE register save
> +* allocate B, probe using an SVE register save
> +* allocate C
> +
> +The non-SVE register used in this case was again the link register.
> +It was previously used even if the link register save slot was some
> +bytes above the bottom of the non-SVE register saves, but an earlier
> +patch avoided that by putting the link register save slot first.
> +
> +As a belt-and-braces fix, this patch explicitly records which
> +probe registers we're using and allows the non-SVE probe to be
> +whichever register comes first (as for SVE).
> +
> +The patch also avoids unnecessary probes in sve/pcs/stack_clash_3.c.
> +
> +gcc/
> +       * config/aarch64/aarch64.h (aarch64_frame::sve_save_and_probe)
> +       (aarch64_frame::hard_fp_save_and_probe): New fields.
> +       * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize
> them.
> +       Rather than asserting that a leaf function saves LR, instead assert
> +       that a leaf function saves something.
> +       (aarch64_get_separate_components): Prevent the chosen probe
> +       registers from being individually shrink-wrapped.
> +       (aarch64_allocate_and_probe_stack_space): Remove workaround for
> +       probe registers that aren't at the bottom of the previous
> allocation.
> +
> +gcc/testsuite/
> +       * gcc.target/aarch64/sve/pcs/stack_clash_3.c: Avoid redundant
> probes.
> +---
> + gcc/config/aarch64/aarch64.cc                 | 68 +++++++++++++++----
> + gcc/config/aarch64/aarch64.h                  |  8 +++
> + .../aarch64/sve/pcs/stack_clash_3.c           |  6 +-
> + 3 files changed, 64 insertions(+), 18 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index 8abf3d7a1e2..a8d907df884 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8210,15 +8210,11 @@ aarch64_layout_frame (void)
> +       && !crtl->abi->clobbers_full_reg_p (regno))
> +       frame.reg_offset[regno] = SLOT_REQUIRED;
> +
> +-  /* With stack-clash, LR must be saved in non-leaf functions.  The
> saving of
> +-     LR counts as an implicit probe which allows us to maintain the
> invariant
> +-     described in the comment at expand_prologue.  */
> +-  gcc_assert (crtl->is_leaf
> +-            || maybe_ne (frame.reg_offset[R30_REGNUM],
> SLOT_NOT_REQUIRED));
> +
> +   poly_int64 offset = crtl->outgoing_args_size;
> +   gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
> +   frame.bytes_below_saved_regs = offset;
> ++  frame.sve_save_and_probe = INVALID_REGNUM;
> +
> +   /* Now assign stack slots for the registers.  Start with the predicate
> +      registers, since predicate LDR and STR have a relatively small
> +@@ -8226,6 +8222,8 @@ aarch64_layout_frame (void)
> +   for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
> +     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
> +       {
> ++      if (frame.sve_save_and_probe == INVALID_REGNUM)
> ++        frame.sve_save_and_probe = regno;
> +       frame.reg_offset[regno] = offset;
> +       offset += BYTES_PER_SVE_PRED;
> +       }
> +@@ -8263,6 +8261,8 @@ aarch64_layout_frame (void)
> +     for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
> +       if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
> +       {
> ++        if (frame.sve_save_and_probe == INVALID_REGNUM)
> ++          frame.sve_save_and_probe = regno;
> +         frame.reg_offset[regno] = offset;
> +         offset += vector_save_size;
> +       }
> +@@ -8272,10 +8272,18 @@ aarch64_layout_frame (void)
> +   frame.below_hard_fp_saved_regs_size = offset -
> frame.bytes_below_saved_regs;
> +   bool saves_below_hard_fp_p
> +     = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
> ++  gcc_assert (!saves_below_hard_fp_p
> ++            || (frame.sve_save_and_probe != INVALID_REGNUM
> ++                && known_eq (frame.reg_offset[frame.sve_save_and_probe],
> ++                             frame.bytes_below_saved_regs)));
> ++
> +   frame.bytes_below_hard_fp = offset;
> ++  frame.hard_fp_save_and_probe = INVALID_REGNUM;
> +
> +   auto allocate_gpr_slot = [&](unsigned int regno)
> +     {
> ++      if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
> ++      frame.hard_fp_save_and_probe = regno;
> +       frame.reg_offset[regno] = offset;
> +       if (frame.wb_push_candidate1 == INVALID_REGNUM)
> +       frame.wb_push_candidate1 = regno;
> +@@ -8309,6 +8317,8 @@ aarch64_layout_frame (void)
> +   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
> +     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
> +       {
> ++      if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
> ++        frame.hard_fp_save_and_probe = regno;
> +       /* If there is an alignment gap between integer and fp
> callee-saves,
> +          allocate the last fp register to it if possible.  */
> +       if (regno == last_fp_reg
> +@@ -8332,6 +8342,17 @@ aarch64_layout_frame (void)
> +   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
> +
> +   frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
> ++  gcc_assert (known_eq (frame.saved_regs_size,
> ++                      frame.below_hard_fp_saved_regs_size)
> ++            || (frame.hard_fp_save_and_probe != INVALID_REGNUM
> ++                && known_eq
> (frame.reg_offset[frame.hard_fp_save_and_probe],
> ++                             frame.bytes_below_hard_fp)));
> ++
> ++  /* With stack-clash, a register must be saved in non-leaf functions.
> ++     The saving of the bottommost register counts as an implicit probe,
> ++     which allows us to maintain the invariant described in the comment
> ++     at expand_prologue.  */
> ++  gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0));
> +
> +   offset += get_frame_size ();
> +   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
> +@@ -8462,6 +8483,25 @@ aarch64_layout_frame (void)
> +       frame.final_adjust = frame.bytes_below_saved_regs;
> +     }
> +
> ++  /* The frame is allocated in pieces, with each non-final piece
> ++     including a register save at offset 0 that acts as a probe for
> ++     the following piece.  In addition, the save of the bottommost
> register
> ++     acts as a probe for callees and allocas.  Roll back any probes that
> ++     aren't needed.
> ++
> ++     A probe isn't needed if it is associated with the final allocation
> ++     (including callees and allocas) that happens before the epilogue is
> ++     executed.  */
> ++  if (crtl->is_leaf
> ++      && !cfun->calls_alloca
> ++      && known_eq (frame.final_adjust, 0))
> ++    {
> ++      if (maybe_ne (frame.sve_callee_adjust, 0))
> ++      frame.sve_save_and_probe = INVALID_REGNUM;
> ++      else
> ++      frame.hard_fp_save_and_probe = INVALID_REGNUM;
> ++    }
> ++
> +   /* Make sure the individual adjustments add up to the full frame
> size.  */
> +   gcc_assert (known_eq (frame.initial_adjust
> +                       + frame.callee_adjust
> +@@ -9039,13 +9079,6 @@ aarch64_get_separate_components (void)
> +
> +       poly_int64 offset = frame.reg_offset[regno];
> +
> +-      /* If the register is saved in the first SVE save slot, we use
> +-         it as a stack probe for -fstack-clash-protection.  */
> +-      if (flag_stack_clash_protection
> +-          && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
> +-          && known_eq (offset, frame.bytes_below_saved_regs))
> +-        continue;
> +-
> +       /* Get the offset relative to the register we'll use.  */
> +       if (frame_pointer_needed)
> +         offset -= frame.bytes_below_hard_fp;
> +@@ -9080,6 +9113,13 @@ aarch64_get_separate_components (void)
> +
> +   bitmap_clear_bit (components, LR_REGNUM);
> +   bitmap_clear_bit (components, SP_REGNUM);
> ++  if (flag_stack_clash_protection)
> ++    {
> ++      if (frame.sve_save_and_probe != INVALID_REGNUM)
> ++      bitmap_clear_bit (components, frame.sve_save_and_probe);
> ++      if (frame.hard_fp_save_and_probe != INVALID_REGNUM)
> ++      bitmap_clear_bit (components, frame.hard_fp_save_and_probe);
> ++    }
> +
> +   return components;
> + }
> +@@ -9616,8 +9656,8 @@ aarch64_epilogue_uses (int regno)
> +    When probing is needed, we emit a probe at the start of the prologue
> +    and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
> +
> +-   We have to track how much space has been allocated and the only stores
> +-   to the stack we track as implicit probes are the FP/LR stores.
> ++   We can also use register saves as probes.  These are stored in
> ++   sve_save_and_probe and hard_fp_save_and_probe.
> +
> +    For outgoing arguments we probe if the size is larger than 1KB, such
> that
> +    the ABI specified buffer is maintained for the next callee.
> +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> +index b6135837073..46d4693e206 100644
> +--- a/gcc/config/aarch64/aarch64.h
> ++++ b/gcc/config/aarch64/aarch64.h
> +@@ -957,6 +957,14 @@ struct GTY (()) aarch64_frame
> +      This is the register they should use.  */
> +   unsigned spare_pred_reg;
> +
> ++  /* An SVE register that is saved below the hard frame pointer and that
> acts
> ++     as a probe for later allocations, or INVALID_REGNUM if none.  */
> ++  unsigned sve_save_and_probe;
> ++
> ++  /* A register that is saved at the hard frame pointer and that acts
> ++     as a probe for later allocations, or INVALID_REGNUM if none.  */
> ++  unsigned hard_fp_save_and_probe;
> ++
> +   bool laid_out;
> +
> +   /* True if shadow call stack should be enabled for the current
> function.  */
> +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
> b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
> +index 3e01ec36c3a..3530a0d504b 100644
> +--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
> ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
> +@@ -11,11 +11,10 @@
> + **    mov     x11, sp
> + **    ...
> + **    sub     sp, sp, x13
> +-**    str     p4, \[sp\]
> + **    cbz     w0, [^\n]*
> ++**    str     p4, \[sp\]
> + **    ...
> + **    ptrue   p0\.b, all
> +-**    ldr     p4, \[sp\]
> + **    addvl   sp, sp, #1
> + **    ldr     x24, \[sp\], 32
> + **    ret
> +@@ -39,13 +38,12 @@ test_1 (int n)
> + **    mov     x11, sp
> + **    ...
> + **    sub     sp, sp, x13
> +-**    str     p4, \[sp\]
> + **    cbz     w0, [^\n]*
> ++**    str     p4, \[sp\]
> + **    str     p5, \[sp, #1, mul vl\]
> + **    str     p6, \[sp, #2, mul vl\]
> + **    ...
> + **    ptrue   p0\.b, all
> +-**    ldr     p4, \[sp\]
> + **    addvl   sp, sp, #1
> + **    ldr     x24, \[sp\], 32
> + **    ret
> +--
> +2.34.1
> +
> +
> +From 8254e1b9cd500e0c278465a3657543477e9d1250 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:56 +0100
> +Subject: [PATCH 18/19] aarch64: Remove below_hard_fp_saved_regs_size
> +
> +After previous patches, it's no longer necessary to store
> +saved_regs_size and below_hard_fp_saved_regs_size in the frame info.
> +All measurements instead use the top or bottom of the frame as
> +reference points.
> +
> +gcc/
> +       * config/aarch64/aarch64.h (aarch64_frame::saved_regs_size)
> +       (aarch64_frame::below_hard_fp_saved_regs_size): Delete.
> +       * config/aarch64/aarch64.cc (aarch64_layout_frame): Update
> accordingly.
> +---
> + gcc/config/aarch64/aarch64.cc | 45 ++++++++++++++++-------------------
> + gcc/config/aarch64/aarch64.h  |  7 ------
> + 2 files changed, 21 insertions(+), 31 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index a8d907df884..ac3d3b336a3 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8269,9 +8269,8 @@ aarch64_layout_frame (void)
> +
> +   /* OFFSET is now the offset of the hard frame pointer from the bottom
> +      of the callee save area.  */
> +-  frame.below_hard_fp_saved_regs_size = offset -
> frame.bytes_below_saved_regs;
> +-  bool saves_below_hard_fp_p
> +-    = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
> ++  auto below_hard_fp_saved_regs_size = offset -
> frame.bytes_below_saved_regs;
> ++  bool saves_below_hard_fp_p = maybe_ne (below_hard_fp_saved_regs_size,
> 0);
> +   gcc_assert (!saves_below_hard_fp_p
> +             || (frame.sve_save_and_probe != INVALID_REGNUM
> +                 && known_eq (frame.reg_offset[frame.sve_save_and_probe],
> +@@ -8341,9 +8340,8 @@ aarch64_layout_frame (void)
> +
> +   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
> +
> +-  frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
> +-  gcc_assert (known_eq (frame.saved_regs_size,
> +-                      frame.below_hard_fp_saved_regs_size)
> ++  auto saved_regs_size = offset - frame.bytes_below_saved_regs;
> ++  gcc_assert (known_eq (saved_regs_size, below_hard_fp_saved_regs_size)
> +             || (frame.hard_fp_save_and_probe != INVALID_REGNUM
> +                 && known_eq
> (frame.reg_offset[frame.hard_fp_save_and_probe],
> +                              frame.bytes_below_hard_fp)));
> +@@ -8352,7 +8350,7 @@ aarch64_layout_frame (void)
> +      The saving of the bottommost register counts as an implicit probe,
> +      which allows us to maintain the invariant described in the comment
> +      at expand_prologue.  */
> +-  gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0));
> ++  gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
> +
> +   offset += get_frame_size ();
> +   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
> +@@ -8409,7 +8407,7 @@ aarch64_layout_frame (void)
> +
> +   HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
> +   HOST_WIDE_INT const_saved_regs_size;
> +-  if (known_eq (frame.saved_regs_size, 0))
> ++  if (known_eq (saved_regs_size, 0))
> +     frame.initial_adjust = frame.frame_size;
> +   else if (frame.frame_size.is_constant (&const_size)
> +          && const_size < max_push_offset
> +@@ -8422,7 +8420,7 @@ aarch64_layout_frame (void)
> +       frame.callee_adjust = const_size;
> +     }
> +   else if (frame.bytes_below_saved_regs.is_constant
> (&const_below_saved_regs)
> +-         && frame.saved_regs_size.is_constant (&const_saved_regs_size)
> ++         && saved_regs_size.is_constant (&const_saved_regs_size)
> +          && const_below_saved_regs + const_saved_regs_size < 512
> +          /* We could handle this case even with data below the saved
> +             registers, provided that that data left us with valid offsets
> +@@ -8441,8 +8439,7 @@ aarch64_layout_frame (void)
> +       frame.initial_adjust = frame.frame_size;
> +     }
> +   else if (saves_below_hard_fp_p
> +-         && known_eq (frame.saved_regs_size,
> +-                      frame.below_hard_fp_saved_regs_size))
> ++         && known_eq (saved_regs_size, below_hard_fp_saved_regs_size))
> +     {
> +       /* Frame in which all saves are SVE saves:
> +
> +@@ -8464,7 +8461,7 @@ aarch64_layout_frame (void)
> +        [save SVE registers relative to SP]
> +        sub sp, sp, bytes_below_saved_regs  */
> +       frame.callee_adjust = const_above_fp;
> +-      frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
> ++      frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
> +       frame.final_adjust = frame.bytes_below_saved_regs;
> +     }
> +   else
> +@@ -8479,7 +8476,7 @@ aarch64_layout_frame (void)
> +        [save SVE registers relative to SP]
> +        sub sp, sp, bytes_below_saved_regs  */
> +       frame.initial_adjust = frame.bytes_above_hard_fp;
> +-      frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
> ++      frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
> +       frame.final_adjust = frame.bytes_below_saved_regs;
> +     }
> +
> +@@ -9621,17 +9618,17 @@ aarch64_epilogue_uses (int regno)
> +       |  local variables              | <-- frame_pointer_rtx
> +       |                               |
> +       +-------------------------------+
> +-      |  padding                      | \
> +-      +-------------------------------+  |
> +-      |  callee-saved registers       |  | frame.saved_regs_size
> +-      +-------------------------------+  |
> +-      |  LR'                          |  |
> +-      +-------------------------------+  |
> +-      |  FP'                          |  |
> +-      +-------------------------------+  |<- hard_frame_pointer_rtx
> (aligned)
> +-      |  SVE vector registers         |  | \
> +-      +-------------------------------+  |  |
> below_hard_fp_saved_regs_size
> +-      |  SVE predicate registers      | /  /
> ++      |  padding                      |
> ++      +-------------------------------+
> ++      |  callee-saved registers       |
> ++      +-------------------------------+
> ++      |  LR'                          |
> ++      +-------------------------------+
> ++      |  FP'                          |
> ++      +-------------------------------+ <-- hard_frame_pointer_rtx
> (aligned)
> ++      |  SVE vector registers         |
> ++      +-------------------------------+
> ++      |  SVE predicate registers      |
> +       +-------------------------------+
> +       |  dynamic allocation           |
> +       +-------------------------------+
> +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> +index 46d4693e206..01f7751bc78 100644
> +--- a/gcc/config/aarch64/aarch64.h
> ++++ b/gcc/config/aarch64/aarch64.h
> +@@ -871,18 +871,11 @@ struct GTY (()) aarch64_frame
> +      STACK_BOUNDARY.  */
> +   HOST_WIDE_INT saved_varargs_size;
> +
> +-  /* The size of the callee-save registers with a slot in REG_OFFSET.  */
> +-  poly_int64 saved_regs_size;
> +-
> +   /* The number of bytes between the bottom of the static frame (the
> bottom
> +      of the outgoing arguments) and the bottom of the register save area.
> +      This value is always a multiple of STACK_BOUNDARY.  */
> +   poly_int64 bytes_below_saved_regs;
> +
> +-  /* The size of the callee-save registers with a slot in REG_OFFSET that
> +-     are saved below the hard frame pointer.  */
> +-  poly_int64 below_hard_fp_saved_regs_size;
> +-
> +   /* The number of bytes between the bottom of the static frame (the
> bottom
> +      of the outgoing arguments) and the hard frame pointer.  This value
> is
> +      always a multiple of STACK_BOUNDARY.  */
> +--
> +2.34.1
> +
> +
> +From 75c37e031408262263442f5b4cdb83d3777b6422 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:57 +0100
> +Subject: [PATCH 19/19] aarch64: Make stack smash canary protect saved
> + registers
> +
> +AArch64 normally puts the saved registers near the bottom of the frame,
> +immediately above any dynamic allocations.  But this means that a
> +stack-smash attack on those dynamic allocations could overwrite the
> +saved registers without needing to reach as far as the stack smash
> +canary.
> +
> +The same thing could also happen for variable-sized arguments that are
> +passed by value, since those are allocated before a call and popped on
> +return.
> +
> +This patch avoids that by putting the locals (and thus the canary) below
> +the saved registers when stack smash protection is active.
> +
> +The patch fixes CVE-2023-4039.
> +
> +gcc/
> +       * config/aarch64/aarch64.cc (aarch64_save_regs_above_locals_p):
> +       New function.
> +       (aarch64_layout_frame): Use it to decide whether locals should
> +       go above or below the saved registers.
> +       (aarch64_expand_prologue): Update stack layout comment.
> +       Emit a stack tie after the final adjustment.
> +
> +gcc/testsuite/
> +       * gcc.target/aarch64/stack-protector-8.c: New test.
> +       * gcc.target/aarch64/stack-protector-9.c: Likewise.
> +---
> + gcc/config/aarch64/aarch64.cc                 | 46 +++++++--
> + .../gcc.target/aarch64/stack-protector-8.c    | 95 +++++++++++++++++++
> + .../gcc.target/aarch64/stack-protector-9.c    | 33 +++++++
> + 3 files changed, 168 insertions(+), 6 deletions(-)
> + create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
> + create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index ac3d3b336a3..96c3f48fdc4 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8133,6 +8133,20 @@ aarch64_needs_frame_chain (void)
> +   return aarch64_use_frame_pointer;
> + }
> +
> ++/* Return true if the current function should save registers above
> ++   the locals area, rather than below it.  */
> ++
> ++static bool
> ++aarch64_save_regs_above_locals_p ()
> ++{
> ++  /* When using stack smash protection, make sure that the canary slot
> ++     comes between the locals and the saved registers.  Otherwise,
> ++     it would be possible for a carefully sized smash attack to change
> ++     the saved registers (particularly LR and FP) without reaching the
> ++     canary.  */
> ++  return crtl->stack_protect_guard;
> ++}
> ++
> + /* Mark the registers that need to be saved by the callee and calculate
> +    the size of the callee-saved registers area and frame record (both FP
> +    and LR may be omitted).  */
> +@@ -8144,6 +8158,7 @@ aarch64_layout_frame (void)
> +   poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
> +   bool frame_related_fp_reg_p = false;
> +   aarch64_frame &frame = cfun->machine->frame;
> ++  poly_int64 top_of_locals = -1;
> +
> +   frame.emit_frame_chain = aarch64_needs_frame_chain ();
> +
> +@@ -8210,9 +8225,16 @@ aarch64_layout_frame (void)
> +       && !crtl->abi->clobbers_full_reg_p (regno))
> +       frame.reg_offset[regno] = SLOT_REQUIRED;
> +
> ++  bool regs_at_top_p = aarch64_save_regs_above_locals_p ();
> +
> +   poly_int64 offset = crtl->outgoing_args_size;
> +   gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
> ++  if (regs_at_top_p)
> ++    {
> ++      offset += get_frame_size ();
> ++      offset = aligned_upper_bound (offset, STACK_BOUNDARY /
> BITS_PER_UNIT);
> ++      top_of_locals = offset;
> ++    }
> +   frame.bytes_below_saved_regs = offset;
> +   frame.sve_save_and_probe = INVALID_REGNUM;
> +
> +@@ -8352,15 +8374,18 @@ aarch64_layout_frame (void)
> +      at expand_prologue.  */
> +   gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
> +
> +-  offset += get_frame_size ();
> +-  offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
> +-  auto top_of_locals = offset;
> +-
> ++  if (!regs_at_top_p)
> ++    {
> ++      offset += get_frame_size ();
> ++      offset = aligned_upper_bound (offset, STACK_BOUNDARY /
> BITS_PER_UNIT);
> ++      top_of_locals = offset;
> ++    }
> +   offset += frame.saved_varargs_size;
> +   gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
> +   frame.frame_size = offset;
> +
> +   frame.bytes_above_hard_fp = frame.frame_size -
> frame.bytes_below_hard_fp;
> ++  gcc_assert (known_ge (top_of_locals, 0));
> +   frame.bytes_above_locals = frame.frame_size - top_of_locals;
> +
> +   frame.initial_adjust = 0;
> +@@ -9615,10 +9640,10 @@ aarch64_epilogue_uses (int regno)
> +       |  for register varargs         |
> +       |                               |
> +       +-------------------------------+
> +-      |  local variables              | <-- frame_pointer_rtx
> ++      |  local variables (1)          | <-- frame_pointer_rtx
> +       |                               |
> +       +-------------------------------+
> +-      |  padding                      |
> ++      |  padding (1)                  |
> +       +-------------------------------+
> +       |  callee-saved registers       |
> +       +-------------------------------+
> +@@ -9630,6 +9655,10 @@ aarch64_epilogue_uses (int regno)
> +       +-------------------------------+
> +       |  SVE predicate registers      |
> +       +-------------------------------+
> ++      |  local variables (2)          |
> ++      +-------------------------------+
> ++      |  padding (2)                  |
> ++      +-------------------------------+
> +       |  dynamic allocation           |
> +       +-------------------------------+
> +       |  padding                      |
> +@@ -9639,6 +9668,9 @@ aarch64_epilogue_uses (int regno)
> +       +-------------------------------+
> +       |                               | <-- stack_pointer_rtx (aligned)
> +
> ++   The regions marked (1) and (2) are mutually exclusive.  (2) is used
> ++   when aarch64_save_regs_above_locals_p is true.
> ++
> +    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
> +    but leave frame_pointer_rtx and hard_frame_pointer_rtx
> +    unchanged.
> +@@ -9834,6 +9866,8 @@ aarch64_expand_prologue (void)
> +   gcc_assert (known_eq (bytes_below_sp, final_adjust));
> +   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
> final_adjust,
> +                                         !frame_pointer_needed, true);
> ++  if (emit_frame_chain && maybe_ne (final_adjust, 0))
> ++    emit_insn (gen_stack_tie (stack_pointer_rtx,
> hard_frame_pointer_rtx));
> + }
> +
> + /* Return TRUE if we can use a simple_return insn.
> +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
> b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
> +new file mode 100644
> +index 00000000000..e71d820e365
> +--- /dev/null
> ++++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
> +@@ -0,0 +1,95 @@
> ++/* { dg-options " -O -fstack-protector-strong
> -mstack-protector-guard=sysreg -mstack-protector-guard-reg=tpidr2_el0
> -mstack-protector-guard-offset=16" } */
> ++/* { dg-final { check-function-bodies "**" "" } } */
> ++
> ++void g(void *);
> ++__SVBool_t *h(void *);
> ++
> ++/*
> ++** test1:
> ++**    sub     sp, sp, #288
> ++**    stp     x29, x30, \[sp, #?272\]
> ++**    add     x29, sp, #?272
> ++**    mrs     (x[0-9]+), tpidr2_el0
> ++**    ldr     (x[0-9]+), \[\1, #?16\]
> ++**    str     \2, \[sp, #?264\]
> ++**    mov     \2, #?0
> ++**    add     x0, sp, #?8
> ++**    bl      g
> ++**    ...
> ++**    mrs     .*
> ++**    ...
> ++**    bne     .*
> ++**    ...
> ++**    ldp     x29, x30, \[sp, #?272\]
> ++**    add     sp, sp, #?288
> ++**    ret
> ++**    bl      __stack_chk_fail
> ++*/
> ++int test1() {
> ++  int y[0x40];
> ++  g(y);
> ++  return 1;
> ++}
> ++
> ++/*
> ++** test2:
> ++**    stp     x29, x30, \[sp, #?-16\]!
> ++**    mov     x29, sp
> ++**    sub     sp, sp, #1040
> ++**    mrs     (x[0-9]+), tpidr2_el0
> ++**    ldr     (x[0-9]+), \[\1, #?16\]
> ++**    str     \2, \[sp, #?1032\]
> ++**    mov     \2, #?0
> ++**    add     x0, sp, #?8
> ++**    bl      g
> ++**    ...
> ++**    mrs     .*
> ++**    ...
> ++**    bne     .*
> ++**    ...
> ++**    add     sp, sp, #?1040
> ++**    ldp     x29, x30, \[sp\], #?16
> ++**    ret
> ++**    bl      __stack_chk_fail
> ++*/
> ++int test2() {
> ++  int y[0x100];
> ++  g(y);
> ++  return 1;
> ++}
> ++
> ++#pragma GCC target "+sve"
> ++
> ++/*
> ++** test3:
> ++**    stp     x29, x30, \[sp, #?-16\]!
> ++**    mov     x29, sp
> ++**    addvl   sp, sp, #-18
> ++**    ...
> ++**    str     p4, \[sp\]
> ++**    ...
> ++**    sub     sp, sp, #272
> ++**    mrs     (x[0-9]+), tpidr2_el0
> ++**    ldr     (x[0-9]+), \[\1, #?16\]
> ++**    str     \2, \[sp, #?264\]
> ++**    mov     \2, #?0
> ++**    add     x0, sp, #?8
> ++**    bl      h
> ++**    ...
> ++**    mrs     .*
> ++**    ...
> ++**    bne     .*
> ++**    ...
> ++**    add     sp, sp, #?272
> ++**    ...
> ++**    ldr     p4, \[sp\]
> ++**    ...
> ++**    addvl   sp, sp, #18
> ++**    ldp     x29, x30, \[sp\], #?16
> ++**    ret
> ++**    bl      __stack_chk_fail
> ++*/
> ++__SVBool_t test3() {
> ++  int y[0x40];
> ++  return *h(y);
> ++}
> +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
> b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
> +new file mode 100644
> +index 00000000000..58f322aa480
> +--- /dev/null
> ++++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
> +@@ -0,0 +1,33 @@
> ++/* { dg-options "-O2 -mcpu=neoverse-v1 -fstack-protector-all" } */
> ++/* { dg-final { check-function-bodies "**" "" } } */
> ++
> ++/*
> ++** main:
> ++**    ...
> ++**    stp     x29, x30, \[sp, #?-[0-9]+\]!
> ++**    ...
> ++**    sub     sp, sp, #[0-9]+
> ++**    ...
> ++**    str     x[0-9]+, \[x29, #?-8\]
> ++**    ...
> ++*/
> ++int f(const char *);
> ++void g(void *);
> ++int main(int argc, char* argv[])
> ++{
> ++  int a;
> ++  int b;
> ++  char c[2+f(argv[1])];
> ++  int d[0x100];
> ++  char y;
> ++
> ++  y=42; a=4; b=10;
> ++  c[0] = 'h'; c[1] = '\0';
> ++
> ++  c[f(argv[2])] = '\0';
> ++
> ++  __builtin_printf("%d %d\n%s\n", a, b, c);
> ++  g(d);
> ++
> ++  return 0;
> ++}
> +--
> +2.34.1
> +
> --
> 2.34.1
>
>
> -=-=-=-=-=-=-=-=-=-=-=-
> Links: You receive all messages sent to this group.
> View/Reply Online (#187543):
> https://lists.openembedded.org/g/openembedded-core/message/187543
> Mute This Topic: https://lists.openembedded.org/mt/101319990/3617156
> Group Owner: openembedded-core+owner@lists.openembedded.org
> Unsubscribe: https://lists.openembedded.org/g/openembedded-core/unsub [
> martin.jansa@gmail.com]
> -=-=-=-=-=-=-=-=-=-=-=-
>
>
Martin Jansa Sept. 14, 2023, 12:22 p.m. UTC | #2
On Thu, Sep 14, 2023 at 11:07 AM Martin Jansa <martin.jansa@gmail.com>
wrote:

> FYI: one of LGE proprietary components triggers ICE with this applied,
> I'll try to find minimal reproducer later, this is just for other people
> who might hit the same:
>
> error: unrecognizable insn:
>  2923 | }
>       | ^
> (insn 416 286 290 17 (parallel [
>             (set (mem/c:SI (plus:DI (reg/f:DI 29 x29)
>                         (const_int -260 [0xfffffffffffffefc])) [1
> redacted.pixel_format+0 S4 A32])
>                 (const_int 0 [0]))
>             (set (mem/c:SI (plus:DI (reg/f:DI 29 x29)
>                         (const_int -256 [0xffffffffffffff00])) [1
> redacted.pixel_value+0 S4 A128])
>                 (reg/v:SI 22 x22 [orig:141 color ] [141]))
>         ])
> "TOPDIR/BUILD/work/redacted/redacted/redacted/git/redacted.c":2903:45 -1
>      (expr_list:REG_DEAD (reg/v:SI 22 x22 [orig:141 color ] [141])
>         (nil)))
> during RTL pass: cprop_hardreg
> TOPDIR/BUILD/work/redacted/redacted/redacted/git/redacted.c:2923:1:
> internal compiler error: in extract_insn, at recog.cc:2791
> 0x191624a internal_error(char const*, ...)
> ???:0
> 0x6bee26 fancy_abort(char const*, int, char const*)
> ???:0
> 0x697469 _fatal_insn(char const*, rtx_def const*, char const*, int, char
> const*)
> ???:0
> 0x697485 _fatal_insn_not_found(rtx_def const*, char const*, int, char
> const*)
> ???:0
> 0xbef198 extract_constrain_insn(rtx_insn*)
> ???:0
>

And the same code fails like this only with gcc-12.3 in mickledore and
gcc-13.2 in nanbield. kirkstone with gcc-11.4 and your patch (as it is in
kirkstone-nut) builds the same code fine.
Ross Burton Sept. 14, 2023, 12:25 p.m. UTC | #3
On 14 Sep 2023, at 10:07, Martin Jansa via lists.openembedded.org <Martin.Jansa=gmail.com@lists.openembedded.org> wrote:
> 
> FYI: one of LGE proprietary components triggers ICE with this applied, I'll try to find minimal reproducer later, this is just for other people who might hit the same:

That’s… upsetting.

I’ve forwarded this to our toolchain team.  If you can whittle down a reproducer that would be _much_ appreciated, but I’ll see if they have any ideas about where the issue might be.

Ross
diff mbox series

Patch

diff --git a/meta/recipes-devtools/gcc/gcc-12.3.inc b/meta/recipes-devtools/gcc/gcc-12.3.inc
index 4ec03f925c8..5896f26e1af 100644
--- a/meta/recipes-devtools/gcc/gcc-12.3.inc
+++ b/meta/recipes-devtools/gcc/gcc-12.3.inc
@@ -63,6 +63,7 @@  SRC_URI = "${BASEURI} \
            file://0026-rust-recursion-limit.patch \
            file://prefix-map-realpath.patch \
            file://hardcoded-paths.patch \
+           file://CVE-2023-4039.patch \
 "
 SRC_URI[sha256sum] = "949a5d4f99e786421a93b532b22ffab5578de7321369975b91aec97adfda8c3b"
 
diff --git a/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch b/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch
new file mode 100644
index 00000000000..8cb52849cd3
--- /dev/null
+++ b/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch
@@ -0,0 +1,3093 @@ 
+From: Richard Sandiford <richard.sandiford@arm.com>
+Subject: [PATCH 00/19] aarch64: Fix -fstack-protector issue
+Date: Tue, 12 Sep 2023 16:25:10 +0100
+
+This series of patches fixes deficiencies in GCC's -fstack-protector
+implementation for AArch64 when using dynamically allocated stack space.
+This is CVE-2023-4039.  See:
+
+https://developer.arm.com/Arm%20Security%20Center/GCC%20Stack%20Protector%20Vulnerability%20AArch64
+https://github.com/metaredteam/external-disclosures/security/advisories/GHSA-x7ch-h5rf-w2mf
+
+for more details.
+
+The fix is to put the saved registers above the locals area when
+-fstack-protector is used.
+
+The series also fixes a stack-clash problem that I found while working
+on the CVE.  In unpatched sources, the stack-clash problem would only
+trigger for unrealistic numbers of arguments (8K 64-bit arguments, or an
+equivalent).  But it would be a more significant issue with the new
+-fstack-protector frame layout.  It's therefore important that both
+problems are fixed together.
+
+Some reorganisation of the code seemed necessary to fix the problems in a
+cleanish way.  The series is therefore quite long, but only a handful of
+patches should have any effect on code generation.
+
+See the individual patches for a detailed description.
+
+Tested on aarch64-linux-gnu. Pushed to trunk and to all active branches.
+I've also pushed backports to GCC 7+ to vendors/ARM/heads/CVE-2023-4039.
+
+CVE: CVE-2023-4039
+Upstream-Status: Backport
+Signed-off-by: Ross Burton <ross.burton@arm.com>
+  
+  
+From 62fbb215cc817e9f2c1ca80282a64f4ee30806bc Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 12 Sep 2023 16:08:48 +0100
+Subject: [PATCH 01/19] aarch64: Use local frame vars in shrink-wrapping code
+
+aarch64_layout_frame uses a shorthand for referring to
+cfun->machine->frame:
+
+  aarch64_frame &frame = cfun->machine->frame;
+
+This patch does the same for some other heavy users of the structure.
+No functional change intended.
+
+gcc/
+	* config/aarch64/aarch64.cc (aarch64_save_callee_saves): Use
+	a local shorthand for cfun->machine->frame.
+	(aarch64_restore_callee_saves, aarch64_get_separate_components):
+	(aarch64_process_components): Likewise.
+	(aarch64_allocate_and_probe_stack_space): Likewise.
+	(aarch64_expand_prologue, aarch64_expand_epilogue): Likewise.
+	(aarch64_layout_frame): Use existing shorthand for one more case.
+---
+ gcc/config/aarch64/aarch64.cc | 123 ++++++++++++++++++----------------
+ 1 file changed, 64 insertions(+), 59 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 226dc9dffd4..ae42ffdedbe 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -8351,7 +8351,7 @@ aarch64_layout_frame (void)
+   frame.is_scs_enabled
+     = (!crtl->calls_eh_return
+        && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
+-       && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0));
++       && known_ge (frame.reg_offset[LR_REGNUM], 0));
+ 
+   /* When shadow call stack is enabled, the scs_pop in the epilogue will
+      restore x30, and we don't need to pop x30 again in the traditional
+@@ -8763,6 +8763,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
+ 			   unsigned start, unsigned limit, bool skip_wb,
+ 			   bool hard_fp_valid_p)
+ {
++  aarch64_frame &frame = cfun->machine->frame;
+   rtx_insn *insn;
+   unsigned regno;
+   unsigned regno2;
+@@ -8777,8 +8778,8 @@ aarch64_save_callee_saves (poly_int64 start_offset,
+       bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
+ 
+       if (skip_wb
+-	  && (regno == cfun->machine->frame.wb_push_candidate1
+-	      || regno == cfun->machine->frame.wb_push_candidate2))
++	  && (regno == frame.wb_push_candidate1
++	      || regno == frame.wb_push_candidate2))
+ 	continue;
+ 
+       if (cfun->machine->reg_is_wrapped_separately[regno])
+@@ -8786,7 +8787,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
+ 
+       machine_mode mode = aarch64_reg_save_mode (regno);
+       reg = gen_rtx_REG (mode, regno);
+-      offset = start_offset + cfun->machine->frame.reg_offset[regno];
++      offset = start_offset + frame.reg_offset[regno];
+       rtx base_rtx = stack_pointer_rtx;
+       poly_int64 sp_offset = offset;
+ 
+@@ -8799,7 +8800,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
+ 	{
+ 	  gcc_assert (known_eq (start_offset, 0));
+ 	  poly_int64 fp_offset
+-	    = cfun->machine->frame.below_hard_fp_saved_regs_size;
++	    = frame.below_hard_fp_saved_regs_size;
+ 	  if (hard_fp_valid_p)
+ 	    base_rtx = hard_frame_pointer_rtx;
+ 	  else
+@@ -8821,8 +8822,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
+ 	  && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
+ 	  && !cfun->machine->reg_is_wrapped_separately[regno2]
+ 	  && known_eq (GET_MODE_SIZE (mode),
+-		       cfun->machine->frame.reg_offset[regno2]
+-		       - cfun->machine->frame.reg_offset[regno]))
++		       frame.reg_offset[regno2] - frame.reg_offset[regno]))
+ 	{
+ 	  rtx reg2 = gen_rtx_REG (mode, regno2);
+ 	  rtx mem2;
+@@ -8872,6 +8872,7 @@ static void
+ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
+ 			      unsigned limit, bool skip_wb, rtx *cfi_ops)
+ {
++  aarch64_frame &frame = cfun->machine->frame;
+   unsigned regno;
+   unsigned regno2;
+   poly_int64 offset;
+@@ -8888,13 +8889,13 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
+       rtx reg, mem;
+ 
+       if (skip_wb
+-	  && (regno == cfun->machine->frame.wb_pop_candidate1
+-	      || regno == cfun->machine->frame.wb_pop_candidate2))
++	  && (regno == frame.wb_pop_candidate1
++	      || regno == frame.wb_pop_candidate2))
+ 	continue;
+ 
+       machine_mode mode = aarch64_reg_save_mode (regno);
+       reg = gen_rtx_REG (mode, regno);
+-      offset = start_offset + cfun->machine->frame.reg_offset[regno];
++      offset = start_offset + frame.reg_offset[regno];
+       rtx base_rtx = stack_pointer_rtx;
+       if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
+ 	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
+@@ -8905,8 +8906,7 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
+ 	  && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
+ 	  && !cfun->machine->reg_is_wrapped_separately[regno2]
+ 	  && known_eq (GET_MODE_SIZE (mode),
+-		       cfun->machine->frame.reg_offset[regno2]
+-		       - cfun->machine->frame.reg_offset[regno]))
++		       frame.reg_offset[regno2] - frame.reg_offset[regno]))
+ 	{
+ 	  rtx reg2 = gen_rtx_REG (mode, regno2);
+ 	  rtx mem2;
+@@ -9011,6 +9011,7 @@ offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
+ static sbitmap
+ aarch64_get_separate_components (void)
+ {
++  aarch64_frame &frame = cfun->machine->frame;
+   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
+   bitmap_clear (components);
+ 
+@@ -9027,18 +9028,18 @@ aarch64_get_separate_components (void)
+ 	if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
+ 	  continue;
+ 
+-	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
++	poly_int64 offset = frame.reg_offset[regno];
+ 
+ 	/* If the register is saved in the first SVE save slot, we use
+ 	   it as a stack probe for -fstack-clash-protection.  */
+ 	if (flag_stack_clash_protection
+-	    && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
++	    && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
+ 	    && known_eq (offset, 0))
+ 	  continue;
+ 
+ 	/* Get the offset relative to the register we'll use.  */
+ 	if (frame_pointer_needed)
+-	  offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
++	  offset -= frame.below_hard_fp_saved_regs_size;
+ 	else
+ 	  offset += crtl->outgoing_args_size;
+ 
+@@ -9057,11 +9058,11 @@ aarch64_get_separate_components (void)
+   /* If the spare predicate register used by big-endian SVE code
+      is call-preserved, it must be saved in the main prologue
+      before any saves that use it.  */
+-  if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
+-    bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
++  if (frame.spare_pred_reg != INVALID_REGNUM)
++    bitmap_clear_bit (components, frame.spare_pred_reg);
+ 
+-  unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
+-  unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
++  unsigned reg1 = frame.wb_push_candidate1;
++  unsigned reg2 = frame.wb_push_candidate2;
+   /* If registers have been chosen to be stored/restored with
+      writeback don't interfere with them to avoid having to output explicit
+      stack adjustment instructions.  */
+@@ -9170,6 +9171,7 @@ aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
+ static void
+ aarch64_process_components (sbitmap components, bool prologue_p)
+ {
++  aarch64_frame &frame = cfun->machine->frame;
+   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
+ 			     ? HARD_FRAME_POINTER_REGNUM
+ 			     : STACK_POINTER_REGNUM);
+@@ -9184,9 +9186,9 @@ aarch64_process_components (sbitmap components, bool prologue_p)
+       machine_mode mode = aarch64_reg_save_mode (regno);
+       
+       rtx reg = gen_rtx_REG (mode, regno);
+-      poly_int64 offset = cfun->machine->frame.reg_offset[regno];
++      poly_int64 offset = frame.reg_offset[regno];
+       if (frame_pointer_needed)
+-	offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
++	offset -= frame.below_hard_fp_saved_regs_size;
+       else
+ 	offset += crtl->outgoing_args_size;
+ 
+@@ -9211,14 +9213,14 @@ aarch64_process_components (sbitmap components, bool prologue_p)
+ 	  break;
+ 	}
+ 
+-      poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
++      poly_int64 offset2 = frame.reg_offset[regno2];
+       /* The next register is not of the same class or its offset is not
+ 	 mergeable with the current one into a pair.  */
+       if (aarch64_sve_mode_p (mode)
+ 	  || !satisfies_constraint_Ump (mem)
+ 	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
+ 	  || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
+-	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
++	  || maybe_ne ((offset2 - frame.reg_offset[regno]),
+ 		       GET_MODE_SIZE (mode)))
+ 	{
+ 	  insn = emit_insn (set);
+@@ -9240,7 +9242,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
+       /* REGNO2 can be saved/restored in a pair with REGNO.  */
+       rtx reg2 = gen_rtx_REG (mode, regno2);
+       if (frame_pointer_needed)
+-	offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
++	offset2 -= frame.below_hard_fp_saved_regs_size;
+       else
+ 	offset2 += crtl->outgoing_args_size;
+       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
+@@ -9335,6 +9337,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+ 					bool frame_related_p,
+ 					bool final_adjustment_p)
+ {
++  aarch64_frame &frame = cfun->machine->frame;
+   HOST_WIDE_INT guard_size
+     = 1 << param_stack_clash_protection_guard_size;
+   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
+@@ -9355,25 +9358,25 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+        register as a probe.  We can't assume that LR was saved at position 0
+        though, so treat any space below it as unprobed.  */
+   if (final_adjustment_p
+-      && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
++      && known_eq (frame.below_hard_fp_saved_regs_size, 0))
+     {
+-      poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
++      poly_int64 lr_offset = frame.reg_offset[LR_REGNUM];
+       if (known_ge (lr_offset, 0))
+ 	min_probe_threshold -= lr_offset.to_constant ();
+       else
+ 	gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
+     }
+ 
+-  poly_int64 frame_size = cfun->machine->frame.frame_size;
++  poly_int64 frame_size = frame.frame_size;
+ 
+   /* We should always have a positive probe threshold.  */
+   gcc_assert (min_probe_threshold > 0);
+ 
+   if (flag_stack_clash_protection && !final_adjustment_p)
+     {
+-      poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
+-      poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
+-      poly_int64 final_adjust = cfun->machine->frame.final_adjust;
++      poly_int64 initial_adjust = frame.initial_adjust;
++      poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
++      poly_int64 final_adjust = frame.final_adjust;
+ 
+       if (known_eq (frame_size, 0))
+ 	{
+@@ -9662,17 +9665,18 @@ aarch64_epilogue_uses (int regno)
+ void
+ aarch64_expand_prologue (void)
+ {
+-  poly_int64 frame_size = cfun->machine->frame.frame_size;
+-  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
+-  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
+-  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
+-  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
+-  poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
++  aarch64_frame &frame = cfun->machine->frame;
++  poly_int64 frame_size = frame.frame_size;
++  poly_int64 initial_adjust = frame.initial_adjust;
++  HOST_WIDE_INT callee_adjust = frame.callee_adjust;
++  poly_int64 final_adjust = frame.final_adjust;
++  poly_int64 callee_offset = frame.callee_offset;
++  poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
+   poly_int64 below_hard_fp_saved_regs_size
+-    = cfun->machine->frame.below_hard_fp_saved_regs_size;
+-  unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
+-  unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
+-  bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
++    = frame.below_hard_fp_saved_regs_size;
++  unsigned reg1 = frame.wb_push_candidate1;
++  unsigned reg2 = frame.wb_push_candidate2;
++  bool emit_frame_chain = frame.emit_frame_chain;
+   rtx_insn *insn;
+ 
+   if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
+@@ -9703,7 +9707,7 @@ aarch64_expand_prologue (void)
+     }
+ 
+   /* Push return address to shadow call stack.  */
+-  if (cfun->machine->frame.is_scs_enabled)
++  if (frame.is_scs_enabled)
+     emit_insn (gen_scs_push ());
+ 
+   if (flag_stack_usage_info)
+@@ -9742,7 +9746,7 @@ aarch64_expand_prologue (void)
+ 
+   /* The offset of the frame chain record (if any) from the current SP.  */
+   poly_int64 chain_offset = (initial_adjust + callee_adjust
+-			     - cfun->machine->frame.hard_fp_offset);
++			     - frame.hard_fp_offset);
+   gcc_assert (known_ge (chain_offset, 0));
+ 
+   /* The offset of the bottom of the save area from the current SP.  */
+@@ -9845,16 +9849,17 @@ aarch64_use_return_insn_p (void)
+ void
+ aarch64_expand_epilogue (bool for_sibcall)
+ {
+-  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
+-  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
+-  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
+-  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
+-  poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
++  aarch64_frame &frame = cfun->machine->frame;
++  poly_int64 initial_adjust = frame.initial_adjust;
++  HOST_WIDE_INT callee_adjust = frame.callee_adjust;
++  poly_int64 final_adjust = frame.final_adjust;
++  poly_int64 callee_offset = frame.callee_offset;
++  poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
+   poly_int64 below_hard_fp_saved_regs_size
+-    = cfun->machine->frame.below_hard_fp_saved_regs_size;
+-  unsigned reg1 = cfun->machine->frame.wb_pop_candidate1;
+-  unsigned reg2 = cfun->machine->frame.wb_pop_candidate2;
+-  unsigned int last_gpr = (cfun->machine->frame.is_scs_enabled
++    = frame.below_hard_fp_saved_regs_size;
++  unsigned reg1 = frame.wb_pop_candidate1;
++  unsigned reg2 = frame.wb_pop_candidate2;
++  unsigned int last_gpr = (frame.is_scs_enabled
+ 			   ? R29_REGNUM : R30_REGNUM);
+   rtx cfi_ops = NULL;
+   rtx_insn *insn;
+@@ -9888,7 +9893,7 @@ aarch64_expand_epilogue (bool for_sibcall)
+   /* We need to add memory barrier to prevent read from deallocated stack.  */
+   bool need_barrier_p
+     = maybe_ne (get_frame_size ()
+-		+ cfun->machine->frame.saved_varargs_size, 0);
++		+ frame.saved_varargs_size, 0);
+ 
+   /* Emit a barrier to prevent loads from a deallocated stack.  */
+   if (maybe_gt (final_adjust, crtl->outgoing_args_size)
+@@ -9969,7 +9974,7 @@ aarch64_expand_epilogue (bool for_sibcall)
+     }
+ 
+   /* Pop return address from shadow call stack.  */
+-  if (cfun->machine->frame.is_scs_enabled)
++  if (frame.is_scs_enabled)
+     {
+       machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
+       rtx reg = gen_rtx_REG (mode, R30_REGNUM);
+@@ -12564,24 +12569,24 @@ aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
+ poly_int64
+ aarch64_initial_elimination_offset (unsigned from, unsigned to)
+ {
++  aarch64_frame &frame = cfun->machine->frame;
++
+   if (to == HARD_FRAME_POINTER_REGNUM)
+     {
+       if (from == ARG_POINTER_REGNUM)
+-	return cfun->machine->frame.hard_fp_offset;
++	return frame.hard_fp_offset;
+ 
+       if (from == FRAME_POINTER_REGNUM)
+-	return cfun->machine->frame.hard_fp_offset
+-	       - cfun->machine->frame.locals_offset;
++	return frame.hard_fp_offset - frame.locals_offset;
+     }
+ 
+   if (to == STACK_POINTER_REGNUM)
+     {
+       if (from == FRAME_POINTER_REGNUM)
+-	  return cfun->machine->frame.frame_size
+-		 - cfun->machine->frame.locals_offset;
++	return frame.frame_size - frame.locals_offset;
+     }
+ 
+-  return cfun->machine->frame.frame_size;
++  return frame.frame_size;
+ }
+ 
+ 
+-- 
+2.34.1
+
+
+From 12a8889de169f892d2e927584c00d20b8b7e456f Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 12 Sep 2023 16:08:49 +0100
+Subject: [PATCH 02/19] aarch64: Avoid a use of callee_offset
+
+When we emit the frame chain, i.e. when we reach Here in this statement
+of aarch64_expand_prologue:
+
+  if (emit_frame_chain)
+    {
+      // Here
+      ...
+    }
+
+the stack is in one of two states:
+
+- We've allocated up to the frame chain, but no more.
+
+- We've allocated the whole frame, and the frame chain is within easy
+  reach of the new SP.
+
+The offset of the frame chain from the current SP is available
+in aarch64_frame as callee_offset.  It is also available as the
+chain_offset local variable, where the latter is calculated from other
+data.  (However, chain_offset is not always equal to callee_offset when
+!emit_frame_chain, so chain_offset isn't redundant.)
+
+In c600df9a4060da3c6121ff4d0b93f179eafd69d1 I switched to using
+chain_offset for the initialisation of the hard frame pointer:
+
+       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
+-                         stack_pointer_rtx, callee_offset,
++                         stack_pointer_rtx, chain_offset,
+                          tmp1_rtx, tmp0_rtx, frame_pointer_needed);
+
+But the later REG_CFA_ADJUST_CFA handling still used callee_offset.
+
+I think the difference is harmless, but it's more logical for the
+CFA note to be in sync, and it's more convenient for later patches
+if it uses chain_offset.
+
+gcc/
+	* config/aarch64/aarch64.cc (aarch64_expand_prologue): Use
+	chain_offset rather than callee_offset.
+---
+ gcc/config/aarch64/aarch64.cc | 4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index ae42ffdedbe..79253322fd7 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -9670,7 +9670,6 @@ aarch64_expand_prologue (void)
+   poly_int64 initial_adjust = frame.initial_adjust;
+   HOST_WIDE_INT callee_adjust = frame.callee_adjust;
+   poly_int64 final_adjust = frame.final_adjust;
+-  poly_int64 callee_offset = frame.callee_offset;
+   poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
+   poly_int64 below_hard_fp_saved_regs_size
+     = frame.below_hard_fp_saved_regs_size;
+@@ -9783,8 +9782,7 @@ aarch64_expand_prologue (void)
+ 	     implicit.  */
+ 	  if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
+ 	    {
+-	      rtx src = plus_constant (Pmode, stack_pointer_rtx,
+-				       callee_offset);
++	      rtx src = plus_constant (Pmode, stack_pointer_rtx, chain_offset);
+ 	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
+ 			    gen_rtx_SET (hard_frame_pointer_rtx, src));
+ 	    }
+-- 
+2.34.1
+
+
+From 03d5e89e7f3be53fd7142556e8e0a2774c653dca Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 12 Sep 2023 16:08:49 +0100
+Subject: [PATCH 03/19] aarch64: Explicitly handle frames with no saved
+ registers
+
+If a frame has no saved registers, it can be allocated in one go.
+There is no need to treat the areas below and above the saved
+registers as separate.
+
+And if we allocate the frame in one go, it should be allocated
+as the initial_adjust rather than the final_adjust.  This allows the
+frame size to grow to guard_size - guard_used_by_caller before a stack
+probe is needed.  (A frame with no register saves is necessarily a
+leaf frame.)
+
+This is a no-op as things stand, since a leaf function will have
+no outgoing arguments, and so all the frame will be above where
+the saved registers normally go.
+
+gcc/
+	* config/aarch64/aarch64.cc (aarch64_layout_frame): Explicitly
+	allocate the frame in one go if there are no saved registers.
+---
+ gcc/config/aarch64/aarch64.cc | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 79253322fd7..e1f21230c15 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -8378,9 +8378,11 @@ aarch64_layout_frame (void)
+ 
+   HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
+   HOST_WIDE_INT const_saved_regs_size;
+-  if (frame.frame_size.is_constant (&const_size)
+-      && const_size < max_push_offset
+-      && known_eq (frame.hard_fp_offset, const_size))
++  if (known_eq (frame.saved_regs_size, 0))
++    frame.initial_adjust = frame.frame_size;
++  else if (frame.frame_size.is_constant (&const_size)
++	   && const_size < max_push_offset
++	   && known_eq (frame.hard_fp_offset, const_size))
+     {
+       /* Simple, small frame with no outgoing arguments:
+ 
+-- 
+2.34.1
+
+
+From 49c2eb7616756c323b7f6b18d8616ec945eb1263 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 12 Sep 2023 16:08:49 +0100
+Subject: [PATCH 04/19] aarch64: Add bytes_below_saved_regs to frame info
+
+The frame layout code currently hard-codes the assumption that
+the number of bytes below the saved registers is equal to the
+size of the outgoing arguments.  This patch abstracts that
+value into a new field of aarch64_frame.
+
+gcc/
+	* config/aarch64/aarch64.h (aarch64_frame::bytes_below_saved_regs): New
+	field.
+	* config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize it,
+	and use it instead of crtl->outgoing_args_size.
+	(aarch64_get_separate_components): Use bytes_below_saved_regs instead
+	of outgoing_args_size.
+	(aarch64_process_components): Likewise.
+---
+ gcc/config/aarch64/aarch64.cc | 71 ++++++++++++++++++-----------------
+ gcc/config/aarch64/aarch64.h  |  5 +++
+ 2 files changed, 41 insertions(+), 35 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index e1f21230c15..94e1b686584 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -8217,6 +8217,8 @@ aarch64_layout_frame (void)
+   gcc_assert (crtl->is_leaf
+ 	      || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
+ 
++  frame.bytes_below_saved_regs = crtl->outgoing_args_size;
++
+   /* Now assign stack slots for the registers.  Start with the predicate
+      registers, since predicate LDR and STR have a relatively small
+      offset range.  These saves happen below the hard frame pointer.  */
+@@ -8321,18 +8323,18 @@ aarch64_layout_frame (void)
+ 
+   poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
+ 
+-  poly_int64 above_outgoing_args
++  poly_int64 saved_regs_and_above
+     = aligned_upper_bound (varargs_and_saved_regs_size
+ 			   + get_frame_size (),
+ 			   STACK_BOUNDARY / BITS_PER_UNIT);
+ 
+   frame.hard_fp_offset
+-    = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
++    = saved_regs_and_above - frame.below_hard_fp_saved_regs_size;
+ 
+   /* Both these values are already aligned.  */
+-  gcc_assert (multiple_p (crtl->outgoing_args_size,
++  gcc_assert (multiple_p (frame.bytes_below_saved_regs,
+ 			  STACK_BOUNDARY / BITS_PER_UNIT));
+-  frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
++  frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs;
+ 
+   frame.locals_offset = frame.saved_varargs_size;
+ 
+@@ -8376,7 +8378,7 @@ aarch64_layout_frame (void)
+   else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
+     max_push_offset = 256;
+ 
+-  HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
++  HOST_WIDE_INT const_size, const_below_saved_regs, const_fp_offset;
+   HOST_WIDE_INT const_saved_regs_size;
+   if (known_eq (frame.saved_regs_size, 0))
+     frame.initial_adjust = frame.frame_size;
+@@ -8384,31 +8386,31 @@ aarch64_layout_frame (void)
+ 	   && const_size < max_push_offset
+ 	   && known_eq (frame.hard_fp_offset, const_size))
+     {
+-      /* Simple, small frame with no outgoing arguments:
++      /* Simple, small frame with no data below the saved registers.
+ 
+ 	 stp reg1, reg2, [sp, -frame_size]!
+ 	 stp reg3, reg4, [sp, 16]  */
+       frame.callee_adjust = const_size;
+     }
+-  else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
++  else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs)
+ 	   && frame.saved_regs_size.is_constant (&const_saved_regs_size)
+-	   && const_outgoing_args_size + const_saved_regs_size < 512
+-	   /* We could handle this case even with outgoing args, provided
+-	      that the number of args left us with valid offsets for all
+-	      predicate and vector save slots.  It's such a rare case that
+-	      it hardly seems worth the effort though.  */
+-	   && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
++	   && const_below_saved_regs + const_saved_regs_size < 512
++	   /* We could handle this case even with data below the saved
++	      registers, provided that that data left us with valid offsets
++	      for all predicate and vector save slots.  It's such a rare
++	      case that it hardly seems worth the effort though.  */
++	   && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
+ 	   && !(cfun->calls_alloca
+ 		&& frame.hard_fp_offset.is_constant (&const_fp_offset)
+ 		&& const_fp_offset < max_push_offset))
+     {
+-      /* Frame with small outgoing arguments:
++      /* Frame with small area below the saved registers:
+ 
+ 	 sub sp, sp, frame_size
+-	 stp reg1, reg2, [sp, outgoing_args_size]
+-	 stp reg3, reg4, [sp, outgoing_args_size + 16]  */
++	 stp reg1, reg2, [sp, bytes_below_saved_regs]
++	 stp reg3, reg4, [sp, bytes_below_saved_regs + 16]  */
+       frame.initial_adjust = frame.frame_size;
+-      frame.callee_offset = const_outgoing_args_size;
++      frame.callee_offset = const_below_saved_regs;
+     }
+   else if (saves_below_hard_fp_p
+ 	   && known_eq (frame.saved_regs_size,
+@@ -8418,30 +8420,29 @@ aarch64_layout_frame (void)
+ 
+ 	 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
+ 	 save SVE registers relative to SP
+-	 sub sp, sp, outgoing_args_size  */
++	 sub sp, sp, bytes_below_saved_regs  */
+       frame.initial_adjust = (frame.hard_fp_offset
+ 			      + frame.below_hard_fp_saved_regs_size);
+-      frame.final_adjust = crtl->outgoing_args_size;
++      frame.final_adjust = frame.bytes_below_saved_regs;
+     }
+   else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
+ 	   && const_fp_offset < max_push_offset)
+     {
+-      /* Frame with large outgoing arguments or SVE saves, but with
+-	 a small local area:
++      /* Frame with large area below the saved registers, or with SVE saves,
++	 but with a small area above:
+ 
+ 	 stp reg1, reg2, [sp, -hard_fp_offset]!
+ 	 stp reg3, reg4, [sp, 16]
+ 	 [sub sp, sp, below_hard_fp_saved_regs_size]
+ 	 [save SVE registers relative to SP]
+-	 sub sp, sp, outgoing_args_size  */
++	 sub sp, sp, bytes_below_saved_regs  */
+       frame.callee_adjust = const_fp_offset;
+       frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
+-      frame.final_adjust = crtl->outgoing_args_size;
++      frame.final_adjust = frame.bytes_below_saved_regs;
+     }
+   else
+     {
+-      /* Frame with large local area and outgoing arguments or SVE saves,
+-	 using frame pointer:
++      /* General case:
+ 
+ 	 sub sp, sp, hard_fp_offset
+ 	 stp x29, x30, [sp, 0]
+@@ -8449,10 +8450,10 @@ aarch64_layout_frame (void)
+ 	 stp reg3, reg4, [sp, 16]
+ 	 [sub sp, sp, below_hard_fp_saved_regs_size]
+ 	 [save SVE registers relative to SP]
+-	 sub sp, sp, outgoing_args_size  */
++	 sub sp, sp, bytes_below_saved_regs  */
+       frame.initial_adjust = frame.hard_fp_offset;
+       frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
+-      frame.final_adjust = crtl->outgoing_args_size;
++      frame.final_adjust = frame.bytes_below_saved_regs;
+     }
+ 
+   /* Make sure the individual adjustments add up to the full frame size.  */
+@@ -9043,7 +9044,7 @@ aarch64_get_separate_components (void)
+ 	if (frame_pointer_needed)
+ 	  offset -= frame.below_hard_fp_saved_regs_size;
+ 	else
+-	  offset += crtl->outgoing_args_size;
++	  offset += frame.bytes_below_saved_regs;
+ 
+ 	/* Check that we can access the stack slot of the register with one
+ 	   direct load with no adjustments needed.  */
+@@ -9192,7 +9193,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
+       if (frame_pointer_needed)
+ 	offset -= frame.below_hard_fp_saved_regs_size;
+       else
+-	offset += crtl->outgoing_args_size;
++	offset += frame.bytes_below_saved_regs;
+ 
+       rtx addr = plus_constant (Pmode, ptr_reg, offset);
+       rtx mem = gen_frame_mem (mode, addr);
+@@ -9246,7 +9247,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
+       if (frame_pointer_needed)
+ 	offset2 -= frame.below_hard_fp_saved_regs_size;
+       else
+-	offset2 += crtl->outgoing_args_size;
++	offset2 += frame.bytes_below_saved_regs;
+       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
+       rtx mem2 = gen_frame_mem (mode, addr2);
+       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
+@@ -9320,10 +9321,10 @@ aarch64_stack_clash_protection_alloca_probe_range (void)
+    registers.  If POLY_SIZE is not large enough to require a probe this function
+    will only adjust the stack.  When allocating the stack space
+    FRAME_RELATED_P is then used to indicate if the allocation is frame related.
+-   FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
+-   arguments.  If we are then we ensure that any allocation larger than the ABI
+-   defined buffer needs a probe so that the invariant of having a 1KB buffer is
+-   maintained.
++   FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
++   the saved registers.  If we are then we ensure that any allocation
++   larger than the ABI defined buffer needs a probe so that the
++   invariant of having a 1KB buffer is maintained.
+ 
+    We emit barriers after each stack adjustment to prevent optimizations from
+    breaking the invariant that we never drop the stack more than a page.  This
+@@ -9532,7 +9533,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+   /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
+      be probed.  This maintains the requirement that each page is probed at
+      least once.  For initial probing we probe only if the allocation is
+-     more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
++     more than GUARD_SIZE - buffer, and below the saved registers we probe
+      if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
+      GUARD_SIZE.  This works that for any allocation that is large enough to
+      trigger a probe here, we'll have at least one, and if they're not large
+diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
+index 6834c3e9922..1e105e12db8 100644
+--- a/gcc/config/aarch64/aarch64.h
++++ b/gcc/config/aarch64/aarch64.h
+@@ -871,6 +871,11 @@ struct GTY (()) aarch64_frame
+   /* The size of the callee-save registers with a slot in REG_OFFSET.  */
+   poly_int64 saved_regs_size;
+ 
++  /* The number of bytes between the bottom of the static frame (the bottom
++     of the outgoing arguments) and the bottom of the register save area.
++     This value is always a multiple of STACK_BOUNDARY.  */
++  poly_int64 bytes_below_saved_regs;
++
+   /* The size of the callee-save registers with a slot in REG_OFFSET that
+      are saved below the hard frame pointer.  */
+   poly_int64 below_hard_fp_saved_regs_size;
+-- 
+2.34.1
+
+
+From 34081079ea4de0c98331843f574b5f6f94d7b234 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 12 Sep 2023 16:08:50 +0100
+Subject: [PATCH 05/19] aarch64: Add bytes_below_hard_fp to frame info
+
+Following on from the previous bytes_below_saved_regs patch, this one
+records the number of bytes that are below the hard frame pointer.
+This eventually replaces below_hard_fp_saved_regs_size.
+
+If a frame pointer is not needed, the epilogue adds final_adjust
+to the stack pointer before restoring registers:
+
+     aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
+
+Therefore, if the epilogue needs to restore the stack pointer from
+the hard frame pointer, the directly corresponding offset is:
+
+     -bytes_below_hard_fp + final_adjust
+
+i.e. go from the hard frame pointer to the bottom of the frame,
+then add the same amount as if we were using the stack pointer
+from the outset.
+
+gcc/
+	* config/aarch64/aarch64.h (aarch64_frame::bytes_below_hard_fp): New
+	field.
+	* config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize it.
+	(aarch64_expand_epilogue): Use it instead of
+	below_hard_fp_saved_regs_size.
+---
+ gcc/config/aarch64/aarch64.cc | 6 +++---
+ gcc/config/aarch64/aarch64.h  | 5 +++++
+ 2 files changed, 8 insertions(+), 3 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 94e1b686584..c7d84245fbf 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -8269,6 +8269,7 @@ aarch64_layout_frame (void)
+      of the callee save area.  */
+   bool saves_below_hard_fp_p = maybe_ne (offset, 0);
+   frame.below_hard_fp_saved_regs_size = offset;
++  frame.bytes_below_hard_fp = offset + frame.bytes_below_saved_regs;
+   if (frame.emit_frame_chain)
+     {
+       /* FP and LR are placed in the linkage record.  */
+@@ -9856,8 +9857,7 @@ aarch64_expand_epilogue (bool for_sibcall)
+   poly_int64 final_adjust = frame.final_adjust;
+   poly_int64 callee_offset = frame.callee_offset;
+   poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
+-  poly_int64 below_hard_fp_saved_regs_size
+-    = frame.below_hard_fp_saved_regs_size;
++  poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
+   unsigned reg1 = frame.wb_pop_candidate1;
+   unsigned reg2 = frame.wb_pop_candidate2;
+   unsigned int last_gpr = (frame.is_scs_enabled
+@@ -9915,7 +9915,7 @@ aarch64_expand_epilogue (bool for_sibcall)
+        is restored on the instruction doing the writeback.  */
+     aarch64_add_offset (Pmode, stack_pointer_rtx,
+ 			hard_frame_pointer_rtx,
+-			-callee_offset - below_hard_fp_saved_regs_size,
++			-bytes_below_hard_fp + final_adjust,
+ 			tmp1_rtx, tmp0_rtx, callee_adjust == 0);
+   else
+      /* The case where we need to re-use the register here is very rare, so
+diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
+index 1e105e12db8..de68ff7202f 100644
+--- a/gcc/config/aarch64/aarch64.h
++++ b/gcc/config/aarch64/aarch64.h
+@@ -880,6 +880,11 @@ struct GTY (()) aarch64_frame
+      are saved below the hard frame pointer.  */
+   poly_int64 below_hard_fp_saved_regs_size;
+ 
++  /* The number of bytes between the bottom of the static frame (the bottom
++     of the outgoing arguments) and the hard frame pointer.  This value is
++     always a multiple of STACK_BOUNDARY.  */
++  poly_int64 bytes_below_hard_fp;
++
+   /* Offset from the base of the frame (incomming SP) to the
+      top of the locals area.  This value is always a multiple of
+      STACK_BOUNDARY.  */
+-- 
+2.34.1
+
+
+From 187861af7c51db9eddc6f954b589c121b210fc74 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 12 Sep 2023 16:08:50 +0100
+Subject: [PATCH 06/19] aarch64: Tweak aarch64_save/restore_callee_saves
+
+aarch64_save_callee_saves and aarch64_restore_callee_saves took
+a parameter called start_offset that gives the offset of the
+bottom of the saved register area from the current stack pointer.
+However, it's more convenient for later patches if we use the
+bottom of the entire frame as the reference point, rather than
+the bottom of the saved registers.
+
+Doing that removes the need for the callee_offset field.
+Other than that, this is not a win on its own.  It only really
+makes sense in combination with the follow-on patches.
+
+gcc/
+	* config/aarch64/aarch64.h (aarch64_frame::callee_offset): Delete.
+	* config/aarch64/aarch64.cc (aarch64_layout_frame): Remove
+	callee_offset handling.
+	(aarch64_save_callee_saves): Replace the start_offset parameter
+	with a bytes_below_sp parameter.
+	(aarch64_restore_callee_saves): Likewise.
+	(aarch64_expand_prologue): Update accordingly.
+	(aarch64_expand_epilogue): Likewise.
+---
+ gcc/config/aarch64/aarch64.cc | 56 +++++++++++++++++------------------
+ gcc/config/aarch64/aarch64.h  |  4 ---
+ 2 files changed, 28 insertions(+), 32 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index c7d84245fbf..e79551af41d 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -8343,7 +8343,6 @@ aarch64_layout_frame (void)
+   frame.final_adjust = 0;
+   frame.callee_adjust = 0;
+   frame.sve_callee_adjust = 0;
+-  frame.callee_offset = 0;
+ 
+   frame.wb_pop_candidate1 = frame.wb_push_candidate1;
+   frame.wb_pop_candidate2 = frame.wb_push_candidate2;
+@@ -8411,7 +8410,6 @@ aarch64_layout_frame (void)
+ 	 stp reg1, reg2, [sp, bytes_below_saved_regs]
+ 	 stp reg3, reg4, [sp, bytes_below_saved_regs + 16]  */
+       frame.initial_adjust = frame.frame_size;
+-      frame.callee_offset = const_below_saved_regs;
+     }
+   else if (saves_below_hard_fp_p
+ 	   && known_eq (frame.saved_regs_size,
+@@ -8758,12 +8756,13 @@ aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
+ }
+ 
+ /* Emit code to save the callee-saved registers from register number START
+-   to LIMIT to the stack at the location starting at offset START_OFFSET,
+-   skipping any write-back candidates if SKIP_WB is true.  HARD_FP_VALID_P
+-   is true if the hard frame pointer has been set up.  */
++   to LIMIT to the stack.  The stack pointer is currently BYTES_BELOW_SP
++   bytes above the bottom of the static frame.  Skip any write-back
++   candidates if SKIP_WB is true.  HARD_FP_VALID_P is true if the hard
++   frame pointer has been set up.  */
+ 
+ static void
+-aarch64_save_callee_saves (poly_int64 start_offset,
++aarch64_save_callee_saves (poly_int64 bytes_below_sp,
+ 			   unsigned start, unsigned limit, bool skip_wb,
+ 			   bool hard_fp_valid_p)
+ {
+@@ -8791,7 +8790,9 @@ aarch64_save_callee_saves (poly_int64 start_offset,
+ 
+       machine_mode mode = aarch64_reg_save_mode (regno);
+       reg = gen_rtx_REG (mode, regno);
+-      offset = start_offset + frame.reg_offset[regno];
++      offset = (frame.reg_offset[regno]
++		+ frame.bytes_below_saved_regs
++		- bytes_below_sp);
+       rtx base_rtx = stack_pointer_rtx;
+       poly_int64 sp_offset = offset;
+ 
+@@ -8802,9 +8803,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
+       else if (GP_REGNUM_P (regno)
+ 	       && (!offset.is_constant (&const_offset) || const_offset >= 512))
+ 	{
+-	  gcc_assert (known_eq (start_offset, 0));
+-	  poly_int64 fp_offset
+-	    = frame.below_hard_fp_saved_regs_size;
++	  poly_int64 fp_offset = frame.bytes_below_hard_fp - bytes_below_sp;
+ 	  if (hard_fp_valid_p)
+ 	    base_rtx = hard_frame_pointer_rtx;
+ 	  else
+@@ -8868,12 +8867,13 @@ aarch64_save_callee_saves (poly_int64 start_offset,
+ }
+ 
+ /* Emit code to restore the callee registers from register number START
+-   up to and including LIMIT.  Restore from the stack offset START_OFFSET,
+-   skipping any write-back candidates if SKIP_WB is true.  Write the
+-   appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
++   up to and including LIMIT.  The stack pointer is currently BYTES_BELOW_SP
++   bytes above the bottom of the static frame.  Skip any write-back
++   candidates if SKIP_WB is true.  Write the appropriate REG_CFA_RESTORE
++   notes into CFI_OPS.  */
+ 
+ static void
+-aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
++aarch64_restore_callee_saves (poly_int64 bytes_below_sp, unsigned start,
+ 			      unsigned limit, bool skip_wb, rtx *cfi_ops)
+ {
+   aarch64_frame &frame = cfun->machine->frame;
+@@ -8899,7 +8899,9 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
+ 
+       machine_mode mode = aarch64_reg_save_mode (regno);
+       reg = gen_rtx_REG (mode, regno);
+-      offset = start_offset + frame.reg_offset[regno];
++      offset = (frame.reg_offset[regno]
++		+ frame.bytes_below_saved_regs
++		- bytes_below_sp);
+       rtx base_rtx = stack_pointer_rtx;
+       if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
+ 	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
+@@ -9675,8 +9677,6 @@ aarch64_expand_prologue (void)
+   HOST_WIDE_INT callee_adjust = frame.callee_adjust;
+   poly_int64 final_adjust = frame.final_adjust;
+   poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
+-  poly_int64 below_hard_fp_saved_regs_size
+-    = frame.below_hard_fp_saved_regs_size;
+   unsigned reg1 = frame.wb_push_candidate1;
+   unsigned reg2 = frame.wb_push_candidate2;
+   bool emit_frame_chain = frame.emit_frame_chain;
+@@ -9752,8 +9752,8 @@ aarch64_expand_prologue (void)
+ 			     - frame.hard_fp_offset);
+   gcc_assert (known_ge (chain_offset, 0));
+ 
+-  /* The offset of the bottom of the save area from the current SP.  */
+-  poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
++  /* The offset of the current SP from the bottom of the static frame.  */
++  poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;
+ 
+   if (emit_frame_chain)
+     {
+@@ -9761,7 +9761,7 @@ aarch64_expand_prologue (void)
+ 	{
+ 	  reg1 = R29_REGNUM;
+ 	  reg2 = R30_REGNUM;
+-	  aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
++	  aarch64_save_callee_saves (bytes_below_sp, reg1, reg2,
+ 				     false, false);
+ 	}
+       else
+@@ -9801,7 +9801,7 @@ aarch64_expand_prologue (void)
+       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
+     }
+ 
+-  aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
++  aarch64_save_callee_saves (bytes_below_sp, R0_REGNUM, R30_REGNUM,
+ 			     callee_adjust != 0 || emit_frame_chain,
+ 			     emit_frame_chain);
+   if (maybe_ne (sve_callee_adjust, 0))
+@@ -9811,16 +9811,17 @@ aarch64_expand_prologue (void)
+       aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
+ 					      sve_callee_adjust,
+ 					      !frame_pointer_needed, false);
+-      saved_regs_offset += sve_callee_adjust;
++      bytes_below_sp -= sve_callee_adjust;
+     }
+-  aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
++  aarch64_save_callee_saves (bytes_below_sp, P0_REGNUM, P15_REGNUM,
+ 			     false, emit_frame_chain);
+-  aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
++  aarch64_save_callee_saves (bytes_below_sp, V0_REGNUM, V31_REGNUM,
+ 			     callee_adjust != 0 || emit_frame_chain,
+ 			     emit_frame_chain);
+ 
+   /* We may need to probe the final adjustment if it is larger than the guard
+      that is assumed by the called.  */
++  gcc_assert (known_eq (bytes_below_sp, final_adjust));
+   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
+ 					  !frame_pointer_needed, true);
+ }
+@@ -9855,7 +9856,6 @@ aarch64_expand_epilogue (bool for_sibcall)
+   poly_int64 initial_adjust = frame.initial_adjust;
+   HOST_WIDE_INT callee_adjust = frame.callee_adjust;
+   poly_int64 final_adjust = frame.final_adjust;
+-  poly_int64 callee_offset = frame.callee_offset;
+   poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
+   poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
+   unsigned reg1 = frame.wb_pop_candidate1;
+@@ -9925,9 +9925,9 @@ aarch64_expand_epilogue (bool for_sibcall)
+ 
+   /* Restore the vector registers before the predicate registers,
+      so that we can use P4 as a temporary for big-endian SVE frames.  */
+-  aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
++  aarch64_restore_callee_saves (final_adjust, V0_REGNUM, V31_REGNUM,
+ 				callee_adjust != 0, &cfi_ops);
+-  aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
++  aarch64_restore_callee_saves (final_adjust, P0_REGNUM, P15_REGNUM,
+ 				false, &cfi_ops);
+   if (maybe_ne (sve_callee_adjust, 0))
+     aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
+@@ -9935,7 +9935,7 @@ aarch64_expand_epilogue (bool for_sibcall)
+   /* When shadow call stack is enabled, the scs_pop in the epilogue will
+      restore x30, we don't need to restore x30 again in the traditional
+      way.  */
+-  aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
++  aarch64_restore_callee_saves (final_adjust + sve_callee_adjust,
+ 				R0_REGNUM, last_gpr,
+ 				callee_adjust != 0, &cfi_ops);
+ 
+diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
+index de68ff7202f..94fca4b9471 100644
+--- a/gcc/config/aarch64/aarch64.h
++++ b/gcc/config/aarch64/aarch64.h
+@@ -907,10 +907,6 @@ struct GTY (()) aarch64_frame
+      It is zero when no push is used.  */
+   HOST_WIDE_INT callee_adjust;
+ 
+-  /* The offset from SP to the callee-save registers after initial_adjust.
+-     It may be non-zero if no push is used (ie. callee_adjust == 0).  */
+-  poly_int64 callee_offset;
+-
+   /* The size of the stack adjustment before saving or after restoring
+      SVE registers.  */
+   poly_int64 sve_callee_adjust;
+-- 
+2.34.1
+
+
+From 2b983f9064d808daf909bde1d4a13980934a7e6e Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 12 Sep 2023 16:08:51 +0100
+Subject: [PATCH 07/19] aarch64: Only calculate chain_offset if there is a
+ chain
+
+After previous patches, it is no longer necessary to calculate
+a chain_offset in cases where there is no chain record.
+
+gcc/
+	* config/aarch64/aarch64.cc (aarch64_expand_prologue): Move the
+	calculation of chain_offset into the emit_frame_chain block.
+---
+ gcc/config/aarch64/aarch64.cc | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index e79551af41d..d71a042d611 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -9747,16 +9747,16 @@ aarch64_expand_prologue (void)
+   if (callee_adjust != 0)
+     aarch64_push_regs (reg1, reg2, callee_adjust);
+ 
+-  /* The offset of the frame chain record (if any) from the current SP.  */
+-  poly_int64 chain_offset = (initial_adjust + callee_adjust
+-			     - frame.hard_fp_offset);
+-  gcc_assert (known_ge (chain_offset, 0));
+-
+   /* The offset of the current SP from the bottom of the static frame.  */
+   poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;
+ 
+   if (emit_frame_chain)
+     {
++      /* The offset of the frame chain record (if any) from the current SP.  */
++      poly_int64 chain_offset = (initial_adjust + callee_adjust
++				 - frame.hard_fp_offset);
++      gcc_assert (known_ge (chain_offset, 0));
++
+       if (callee_adjust == 0)
+ 	{
+ 	  reg1 = R29_REGNUM;
+-- 
+2.34.1
+
+
+From 0a0a824808d1dec51004fb5805c1a0ae2a35433f Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 12 Sep 2023 16:08:51 +0100
+Subject: [PATCH 08/19] aarch64: Rename locals_offset to bytes_above_locals
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+locals_offset was described as:
+
+  /* Offset from the base of the frame (incomming SP) to the
+     top of the locals area.  This value is always a multiple of
+     STACK_BOUNDARY.  */
+
+This is implicitly an “upside down” view of the frame: the incoming
+SP is at offset 0, and anything N bytes below the incoming SP is at
+offset N (rather than -N).
+
+However, reg_offset instead uses a “right way up” view; that is,
+it views offsets in address terms.  Something above X is at a
+positive offset from X and something below X is at a negative
+offset from X.
+
+Also, even on FRAME_GROWS_DOWNWARD targets like AArch64,
+target-independent code views offsets in address terms too:
+locals are allocated at negative offsets to virtual_stack_vars.
+
+It seems confusing to have *_offset fields of the same structure
+using different polarities like this.  This patch tries to avoid
+that by renaming locals_offset to bytes_above_locals.
+
+gcc/
+	* config/aarch64/aarch64.h (aarch64_frame::locals_offset): Rename to...
+	(aarch64_frame::bytes_above_locals): ...this.
+	* config/aarch64/aarch64.cc (aarch64_layout_frame)
+	(aarch64_initial_elimination_offset): Update accordingly.
+---
+ gcc/config/aarch64/aarch64.cc | 6 +++---
+ gcc/config/aarch64/aarch64.h  | 6 +++---
+ 2 files changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index d71a042d611..d4ec352ba98 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -8337,7 +8337,7 @@ aarch64_layout_frame (void)
+ 			  STACK_BOUNDARY / BITS_PER_UNIT));
+   frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs;
+ 
+-  frame.locals_offset = frame.saved_varargs_size;
++  frame.bytes_above_locals = frame.saved_varargs_size;
+ 
+   frame.initial_adjust = 0;
+   frame.final_adjust = 0;
+@@ -12578,13 +12578,13 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to)
+ 	return frame.hard_fp_offset;
+ 
+       if (from == FRAME_POINTER_REGNUM)
+-	return frame.hard_fp_offset - frame.locals_offset;
++	return frame.hard_fp_offset - frame.bytes_above_locals;
+     }
+ 
+   if (to == STACK_POINTER_REGNUM)
+     {
+       if (from == FRAME_POINTER_REGNUM)
+-	return frame.frame_size - frame.locals_offset;
++	return frame.frame_size - frame.bytes_above_locals;
+     }
+ 
+   return frame.frame_size;
+diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
+index 94fca4b9471..bf46e6124aa 100644
+--- a/gcc/config/aarch64/aarch64.h
++++ b/gcc/config/aarch64/aarch64.h
+@@ -885,10 +885,10 @@ struct GTY (()) aarch64_frame
+      always a multiple of STACK_BOUNDARY.  */
+   poly_int64 bytes_below_hard_fp;
+ 
+-  /* Offset from the base of the frame (incomming SP) to the
+-     top of the locals area.  This value is always a multiple of
++  /* The number of bytes between the top of the locals area and the top
++     of the frame (the incomming SP).  This value is always a multiple of
+      STACK_BOUNDARY.  */
+-  poly_int64 locals_offset;
++  poly_int64 bytes_above_locals;
+ 
+   /* Offset from the base of the frame (incomming SP) to the
+      hard_frame_pointer.  This value is always a multiple of
+-- 
+2.34.1
+
+
+From 3fbf0789202b30a67b12e1fb785c7130f098d665 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 12 Sep 2023 16:08:52 +0100
+Subject: [PATCH 09/19] aarch64: Rename hard_fp_offset to bytes_above_hard_fp
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Similarly to the previous locals_offset patch, hard_fp_offset
+was described as:
+
+  /* Offset from the base of the frame (incomming SP) to the
+     hard_frame_pointer.  This value is always a multiple of
+     STACK_BOUNDARY.  */
+  poly_int64 hard_fp_offset;
+
+which again took an “upside-down” view: higher offsets meant lower
+addresses.  This patch renames the field to bytes_above_hard_fp instead.
+
+gcc/
+	* config/aarch64/aarch64.h (aarch64_frame::hard_fp_offset): Rename
+	to...
+	(aarch64_frame::bytes_above_hard_fp): ...this.
+	* config/aarch64/aarch64.cc (aarch64_layout_frame)
+	(aarch64_expand_prologue): Update accordingly.
+	(aarch64_initial_elimination_offset): Likewise.
+---
+ gcc/config/aarch64/aarch64.cc | 26 +++++++++++++-------------
+ gcc/config/aarch64/aarch64.h  |  6 +++---
+ 2 files changed, 16 insertions(+), 16 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index d4ec352ba98..3c4052740e7 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -8329,7 +8329,7 @@ aarch64_layout_frame (void)
+ 			   + get_frame_size (),
+ 			   STACK_BOUNDARY / BITS_PER_UNIT);
+ 
+-  frame.hard_fp_offset
++  frame.bytes_above_hard_fp
+     = saved_regs_and_above - frame.below_hard_fp_saved_regs_size;
+ 
+   /* Both these values are already aligned.  */
+@@ -8378,13 +8378,13 @@ aarch64_layout_frame (void)
+   else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
+     max_push_offset = 256;
+ 
+-  HOST_WIDE_INT const_size, const_below_saved_regs, const_fp_offset;
++  HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
+   HOST_WIDE_INT const_saved_regs_size;
+   if (known_eq (frame.saved_regs_size, 0))
+     frame.initial_adjust = frame.frame_size;
+   else if (frame.frame_size.is_constant (&const_size)
+ 	   && const_size < max_push_offset
+-	   && known_eq (frame.hard_fp_offset, const_size))
++	   && known_eq (frame.bytes_above_hard_fp, const_size))
+     {
+       /* Simple, small frame with no data below the saved registers.
+ 
+@@ -8401,8 +8401,8 @@ aarch64_layout_frame (void)
+ 	      case that it hardly seems worth the effort though.  */
+ 	   && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
+ 	   && !(cfun->calls_alloca
+-		&& frame.hard_fp_offset.is_constant (&const_fp_offset)
+-		&& const_fp_offset < max_push_offset))
++		&& frame.bytes_above_hard_fp.is_constant (&const_above_fp)
++		&& const_above_fp < max_push_offset))
+     {
+       /* Frame with small area below the saved registers:
+ 
+@@ -8420,12 +8420,12 @@ aarch64_layout_frame (void)
+ 	 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
+ 	 save SVE registers relative to SP
+ 	 sub sp, sp, bytes_below_saved_regs  */
+-      frame.initial_adjust = (frame.hard_fp_offset
++      frame.initial_adjust = (frame.bytes_above_hard_fp
+ 			      + frame.below_hard_fp_saved_regs_size);
+       frame.final_adjust = frame.bytes_below_saved_regs;
+     }
+-  else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
+-	   && const_fp_offset < max_push_offset)
++  else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp)
++	   && const_above_fp < max_push_offset)
+     {
+       /* Frame with large area below the saved registers, or with SVE saves,
+ 	 but with a small area above:
+@@ -8435,7 +8435,7 @@ aarch64_layout_frame (void)
+ 	 [sub sp, sp, below_hard_fp_saved_regs_size]
+ 	 [save SVE registers relative to SP]
+ 	 sub sp, sp, bytes_below_saved_regs  */
+-      frame.callee_adjust = const_fp_offset;
++      frame.callee_adjust = const_above_fp;
+       frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
+       frame.final_adjust = frame.bytes_below_saved_regs;
+     }
+@@ -8450,7 +8450,7 @@ aarch64_layout_frame (void)
+ 	 [sub sp, sp, below_hard_fp_saved_regs_size]
+ 	 [save SVE registers relative to SP]
+ 	 sub sp, sp, bytes_below_saved_regs  */
+-      frame.initial_adjust = frame.hard_fp_offset;
++      frame.initial_adjust = frame.bytes_above_hard_fp;
+       frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
+       frame.final_adjust = frame.bytes_below_saved_regs;
+     }
+@@ -9754,7 +9754,7 @@ aarch64_expand_prologue (void)
+     {
+       /* The offset of the frame chain record (if any) from the current SP.  */
+       poly_int64 chain_offset = (initial_adjust + callee_adjust
+-				 - frame.hard_fp_offset);
++				 - frame.bytes_above_hard_fp);
+       gcc_assert (known_ge (chain_offset, 0));
+ 
+       if (callee_adjust == 0)
+@@ -12575,10 +12575,10 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to)
+   if (to == HARD_FRAME_POINTER_REGNUM)
+     {
+       if (from == ARG_POINTER_REGNUM)
+-	return frame.hard_fp_offset;
++	return frame.bytes_above_hard_fp;
+ 
+       if (from == FRAME_POINTER_REGNUM)
+-	return frame.hard_fp_offset - frame.bytes_above_locals;
++	return frame.bytes_above_hard_fp - frame.bytes_above_locals;
+     }
+ 
+   if (to == STACK_POINTER_REGNUM)
+diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
+index bf46e6124aa..dd1f403f939 100644
+--- a/gcc/config/aarch64/aarch64.h
++++ b/gcc/config/aarch64/aarch64.h
+@@ -890,10 +890,10 @@ struct GTY (()) aarch64_frame
+      STACK_BOUNDARY.  */
+   poly_int64 bytes_above_locals;
+ 
+-  /* Offset from the base of the frame (incomming SP) to the
+-     hard_frame_pointer.  This value is always a multiple of
++  /* The number of bytes between the hard_frame_pointer and the top of
++     the frame (the incomming SP).  This value is always a multiple of
+      STACK_BOUNDARY.  */
+-  poly_int64 hard_fp_offset;
++  poly_int64 bytes_above_hard_fp;
+ 
+   /* The size of the frame.  This value is the offset from base of the
+      frame (incomming SP) to the stack_pointer.  This value is always
+-- 
+2.34.1
+
+
+From aac8b31379ac3bbd14fc6427dce23f56e54e8485 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 12 Sep 2023 16:08:52 +0100
+Subject: [PATCH 10/19] aarch64: Tweak frame_size comment
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This patch fixes another case in which a value was described with
+an “upside-down” view.
+
+gcc/
+	* config/aarch64/aarch64.h (aarch64_frame::frame_size): Tweak comment.
+---
+ gcc/config/aarch64/aarch64.h | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
+index dd1f403f939..700524ae22b 100644
+--- a/gcc/config/aarch64/aarch64.h
++++ b/gcc/config/aarch64/aarch64.h
+@@ -895,8 +895,8 @@ struct GTY (()) aarch64_frame
+      STACK_BOUNDARY.  */
+   poly_int64 bytes_above_hard_fp;
+ 
+-  /* The size of the frame.  This value is the offset from base of the
+-     frame (incomming SP) to the stack_pointer.  This value is always
++  /* The size of the frame, i.e. the number of bytes between the bottom
++     of the outgoing arguments and the incoming SP.  This value is always
+      a multiple of STACK_BOUNDARY.  */
+   poly_int64 frame_size;
+ 
+-- 
+2.34.1
+
+
+From 8d5506a8aeb8dd7e8b209a3663b07688478f76b9 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 12 Sep 2023 16:08:53 +0100
+Subject: [PATCH 11/19] aarch64: Measure reg_offset from the bottom of the
+ frame
+
+reg_offset was measured from the bottom of the saved register area.
+This made perfect sense with the original layout, since the bottom
+of the saved register area was also the hard frame pointer address.
+It became slightly less obvious with SVE, since we save SVE
+registers below the hard frame pointer, but it still made sense.
+
+However, if we want to allow different frame layouts, it's more
+convenient and obvious to measure reg_offset from the bottom of
+the frame.  After previous patches, it's also a slight simplification
+in its own right.
+
+gcc/
+	* config/aarch64/aarch64.h (aarch64_frame): Add comment above
+	reg_offset.
+	* config/aarch64/aarch64.cc (aarch64_layout_frame): Walk offsets
+	from the bottom of the frame, rather than the bottom of the saved
+	register area.  Measure reg_offset from the bottom of the frame
+	rather than the bottom of the saved register area.
+	(aarch64_save_callee_saves): Update accordingly.
+	(aarch64_restore_callee_saves): Likewise.
+	(aarch64_get_separate_components): Likewise.
+	(aarch64_process_components): Likewise.
+---
+ gcc/config/aarch64/aarch64.cc | 53 ++++++++++++++++-------------------
+ gcc/config/aarch64/aarch64.h  |  3 ++
+ 2 files changed, 27 insertions(+), 29 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 3c4052740e7..97dd077844b 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -8139,7 +8139,6 @@ aarch64_needs_frame_chain (void)
+ static void
+ aarch64_layout_frame (void)
+ {
+-  poly_int64 offset = 0;
+   int regno, last_fp_reg = INVALID_REGNUM;
+   machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
+   poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
+@@ -8217,7 +8216,9 @@ aarch64_layout_frame (void)
+   gcc_assert (crtl->is_leaf
+ 	      || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
+ 
+-  frame.bytes_below_saved_regs = crtl->outgoing_args_size;
++  poly_int64 offset = crtl->outgoing_args_size;
++  gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
++  frame.bytes_below_saved_regs = offset;
+ 
+   /* Now assign stack slots for the registers.  Start with the predicate
+      registers, since predicate LDR and STR have a relatively small
+@@ -8229,7 +8230,8 @@ aarch64_layout_frame (void)
+ 	offset += BYTES_PER_SVE_PRED;
+       }
+ 
+-  if (maybe_ne (offset, 0))
++  poly_int64 saved_prs_size = offset - frame.bytes_below_saved_regs;
++  if (maybe_ne (saved_prs_size, 0))
+     {
+       /* If we have any vector registers to save above the predicate registers,
+ 	 the offset of the vector register save slots need to be a multiple
+@@ -8247,10 +8249,10 @@ aarch64_layout_frame (void)
+ 	offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
+       else
+ 	{
+-	  if (known_le (offset, vector_save_size))
+-	    offset = vector_save_size;
+-	  else if (known_le (offset, vector_save_size * 2))
+-	    offset = vector_save_size * 2;
++	  if (known_le (saved_prs_size, vector_save_size))
++	    offset = frame.bytes_below_saved_regs + vector_save_size;
++	  else if (known_le (saved_prs_size, vector_save_size * 2))
++	    offset = frame.bytes_below_saved_regs + vector_save_size * 2;
+ 	  else
+ 	    gcc_unreachable ();
+ 	}
+@@ -8267,9 +8269,10 @@ aarch64_layout_frame (void)
+ 
+   /* OFFSET is now the offset of the hard frame pointer from the bottom
+      of the callee save area.  */
+-  bool saves_below_hard_fp_p = maybe_ne (offset, 0);
+-  frame.below_hard_fp_saved_regs_size = offset;
+-  frame.bytes_below_hard_fp = offset + frame.bytes_below_saved_regs;
++  frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
++  bool saves_below_hard_fp_p
++    = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
++  frame.bytes_below_hard_fp = offset;
+   if (frame.emit_frame_chain)
+     {
+       /* FP and LR are placed in the linkage record.  */
+@@ -8320,9 +8323,10 @@ aarch64_layout_frame (void)
+ 
+   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
+ 
+-  frame.saved_regs_size = offset;
++  frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
+ 
+-  poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
++  poly_int64 varargs_and_saved_regs_size
++    = frame.saved_regs_size + frame.saved_varargs_size;
+ 
+   poly_int64 saved_regs_and_above
+     = aligned_upper_bound (varargs_and_saved_regs_size
+@@ -8790,9 +8794,7 @@ aarch64_save_callee_saves (poly_int64 bytes_below_sp,
+ 
+       machine_mode mode = aarch64_reg_save_mode (regno);
+       reg = gen_rtx_REG (mode, regno);
+-      offset = (frame.reg_offset[regno]
+-		+ frame.bytes_below_saved_regs
+-		- bytes_below_sp);
++      offset = frame.reg_offset[regno] - bytes_below_sp;
+       rtx base_rtx = stack_pointer_rtx;
+       poly_int64 sp_offset = offset;
+ 
+@@ -8899,9 +8901,7 @@ aarch64_restore_callee_saves (poly_int64 bytes_below_sp, unsigned start,
+ 
+       machine_mode mode = aarch64_reg_save_mode (regno);
+       reg = gen_rtx_REG (mode, regno);
+-      offset = (frame.reg_offset[regno]
+-		+ frame.bytes_below_saved_regs
+-		- bytes_below_sp);
++      offset = frame.reg_offset[regno] - bytes_below_sp;
+       rtx base_rtx = stack_pointer_rtx;
+       if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
+ 	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
+@@ -9040,14 +9040,12 @@ aarch64_get_separate_components (void)
+ 	   it as a stack probe for -fstack-clash-protection.  */
+ 	if (flag_stack_clash_protection
+ 	    && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
+-	    && known_eq (offset, 0))
++	    && known_eq (offset, frame.bytes_below_saved_regs))
+ 	  continue;
+ 
+ 	/* Get the offset relative to the register we'll use.  */
+ 	if (frame_pointer_needed)
+-	  offset -= frame.below_hard_fp_saved_regs_size;
+-	else
+-	  offset += frame.bytes_below_saved_regs;
++	  offset -= frame.bytes_below_hard_fp;
+ 
+ 	/* Check that we can access the stack slot of the register with one
+ 	   direct load with no adjustments needed.  */
+@@ -9194,9 +9192,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
+       rtx reg = gen_rtx_REG (mode, regno);
+       poly_int64 offset = frame.reg_offset[regno];
+       if (frame_pointer_needed)
+-	offset -= frame.below_hard_fp_saved_regs_size;
+-      else
+-	offset += frame.bytes_below_saved_regs;
++	offset -= frame.bytes_below_hard_fp;
+ 
+       rtx addr = plus_constant (Pmode, ptr_reg, offset);
+       rtx mem = gen_frame_mem (mode, addr);
+@@ -9248,9 +9244,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
+       /* REGNO2 can be saved/restored in a pair with REGNO.  */
+       rtx reg2 = gen_rtx_REG (mode, regno2);
+       if (frame_pointer_needed)
+-	offset2 -= frame.below_hard_fp_saved_regs_size;
+-      else
+-	offset2 += frame.bytes_below_saved_regs;
++	offset2 -= frame.bytes_below_hard_fp;
+       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
+       rtx mem2 = gen_frame_mem (mode, addr2);
+       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
+@@ -9366,7 +9360,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+   if (final_adjustment_p
+       && known_eq (frame.below_hard_fp_saved_regs_size, 0))
+     {
+-      poly_int64 lr_offset = frame.reg_offset[LR_REGNUM];
++      poly_int64 lr_offset = (frame.reg_offset[LR_REGNUM]
++			      - frame.bytes_below_saved_regs);
+       if (known_ge (lr_offset, 0))
+ 	min_probe_threshold -= lr_offset.to_constant ();
+       else
+diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
+index 700524ae22b..b6135837073 100644
+--- a/gcc/config/aarch64/aarch64.h
++++ b/gcc/config/aarch64/aarch64.h
+@@ -860,6 +860,9 @@ extern enum aarch64_processor aarch64_tune;
+ #ifdef HAVE_POLY_INT_H
+ struct GTY (()) aarch64_frame
+ {
++  /* The offset from the bottom of the static frame (the bottom of the
++     outgoing arguments) of each register save slot, or -2 if no save is
++     needed.  */
+   poly_int64 reg_offset[LAST_SAVED_REGNUM + 1];
+ 
+   /* The number of extra stack bytes taken up by register varargs.
+-- 
+2.34.1
+
+
+From b47766614df3b9df878262efb2ad73aaac108363 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 12 Sep 2023 16:08:53 +0100
+Subject: [PATCH 12/19] aarch64: Simplify top of frame allocation
+
+After previous patches, it no longer really makes sense to allocate
+the top of the frame in terms of varargs_and_saved_regs_size and
+saved_regs_and_above.
+
+gcc/
+	* config/aarch64/aarch64.cc (aarch64_layout_frame): Simplify
+	the allocation of the top of the frame.
+---
+ gcc/config/aarch64/aarch64.cc | 23 ++++++++---------------
+ 1 file changed, 8 insertions(+), 15 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 97dd077844b..81935852d5b 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -8325,23 +8325,16 @@ aarch64_layout_frame (void)
+ 
+   frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
+ 
+-  poly_int64 varargs_and_saved_regs_size
+-    = frame.saved_regs_size + frame.saved_varargs_size;
+-
+-  poly_int64 saved_regs_and_above
+-    = aligned_upper_bound (varargs_and_saved_regs_size
+-			   + get_frame_size (),
+-			   STACK_BOUNDARY / BITS_PER_UNIT);
+-
+-  frame.bytes_above_hard_fp
+-    = saved_regs_and_above - frame.below_hard_fp_saved_regs_size;
++  offset += get_frame_size ();
++  offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
++  auto top_of_locals = offset;
+ 
+-  /* Both these values are already aligned.  */
+-  gcc_assert (multiple_p (frame.bytes_below_saved_regs,
+-			  STACK_BOUNDARY / BITS_PER_UNIT));
+-  frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs;
++  offset += frame.saved_varargs_size;
++  gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
++  frame.frame_size = offset;
+ 
+-  frame.bytes_above_locals = frame.saved_varargs_size;
++  frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp;
++  frame.bytes_above_locals = frame.frame_size - top_of_locals;
+ 
+   frame.initial_adjust = 0;
+   frame.final_adjust = 0;
+-- 
+2.34.1
+
+
+From 08f71b4bb28fb74d20e8d2927a557e8119ce9f4d Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 12 Sep 2023 16:08:54 +0100
+Subject: [PATCH 13/19] aarch64: Minor initial adjustment tweak
+
+This patch just changes a calculation of initial_adjust
+to one that makes it slightly more obvious that the total
+adjustment is frame.frame_size.
+
+gcc/
+	* config/aarch64/aarch64.cc (aarch64_layout_frame): Tweak
+	calculation of initial_adjust for frames in which all saves
+	are SVE saves.
+---
+ gcc/config/aarch64/aarch64.cc | 5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 81935852d5b..4d9fcf3d162 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -8414,11 +8414,10 @@ aarch64_layout_frame (void)
+     {
+       /* Frame in which all saves are SVE saves:
+ 
+-	 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
++	 sub sp, sp, frame_size - bytes_below_saved_regs
+ 	 save SVE registers relative to SP
+ 	 sub sp, sp, bytes_below_saved_regs  */
+-      frame.initial_adjust = (frame.bytes_above_hard_fp
+-			      + frame.below_hard_fp_saved_regs_size);
++      frame.initial_adjust = frame.frame_size - frame.bytes_below_saved_regs;
+       frame.final_adjust = frame.bytes_below_saved_regs;
+     }
+   else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp)
+-- 
+2.34.1
+
+
+From f22315d5c19e8310e4dc880fd509678fd291fca8 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 12 Sep 2023 16:08:54 +0100
+Subject: [PATCH 14/19] aarch64: Tweak stack clash boundary condition
+
+The AArch64 ABI says that, when stack clash protection is used,
+there can be a maximum of 1KiB of unprobed space at sp on entry
+to a function.  Therefore, we need to probe when allocating
+>= guard_size - 1KiB of data (>= rather than >).  This is what
+GCC does.
+
+If an allocation is exactly guard_size bytes, it is enough to allocate
+those bytes and probe once at offset 1024.  It isn't possible to use a
+single probe at any other offset: higher would complicate later code,
+by leaving more unprobed space than usual, while lower would risk
+leaving an entire page unprobed.  For simplicity, the code probes all
+allocations at offset 1024.
+
+Some register saves also act as probes.  If we need to allocate
+more space below the last such register save probe, we need to
+probe the allocation if it is > 1KiB.  Again, this allocation is
+then sometimes (but not always) probed at offset 1024.  This sort of
+allocation is currently only used for outgoing arguments, which are
+rarely this big.
+
+However, the code also probed if this final outgoing-arguments
+allocation was == 1KiB, rather than just > 1KiB.  This isn't
+necessary, since the register save then probes at offset 1024
+as required.  Continuing to probe allocations of exactly 1KiB
+would complicate later patches.
+
+gcc/
+	* config/aarch64/aarch64.cc (aarch64_allocate_and_probe_stack_space):
+	Don't probe final allocations that are exactly 1KiB in size (after
+	unprobed space above the final allocation has been deducted).
+
+gcc/testsuite/
+	* gcc.target/aarch64/stack-check-prologue-17.c: New test.
+---
+ gcc/config/aarch64/aarch64.cc                 |  4 +-
+ .../aarch64/stack-check-prologue-17.c         | 55 +++++++++++++++++++
+ 2 files changed, 58 insertions(+), 1 deletion(-)
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 4d9fcf3d162..34c1d8614cd 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -9333,9 +9333,11 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+   HOST_WIDE_INT guard_size
+     = 1 << param_stack_clash_protection_guard_size;
+   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
++  HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT;
++  gcc_assert (multiple_p (poly_size, byte_sp_alignment));
+   HOST_WIDE_INT min_probe_threshold
+     = (final_adjustment_p
+-       ? guard_used_by_caller
++       ? guard_used_by_caller + byte_sp_alignment
+        : guard_size - guard_used_by_caller);
+   /* When doing the final adjustment for the outgoing arguments, take into
+      account any unprobed space there is above the current SP.  There are
+diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
+new file mode 100644
+index 00000000000..0d8a25d73a2
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
+@@ -0,0 +1,55 @@
++/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12" } */
++/* { dg-final { check-function-bodies "**" "" } } */
++
++void f(int, ...);
++void g();
++
++/*
++** test1:
++**	...
++**	str	x30, \[sp\]
++**	sub	sp, sp, #1024
++**	cbnz	w0, .*
++**	bl	g
++**	...
++*/
++int test1(int z) {
++  __uint128_t x = 0;
++  int y[0x400];
++  if (z)
++    {
++      f(0, 0, 0, 0, 0, 0, 0, &y,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
++    }
++  g();
++  return 1;
++}
++
++/*
++** test2:
++**	...
++**	str	x30, \[sp\]
++**	sub	sp, sp, #1040
++**	str	xzr, \[sp\]
++**	cbnz	w0, .*
++**	bl	g
++**	...
++*/
++int test2(int z) {
++  __uint128_t x = 0;
++  int y[0x400];
++  if (z)
++    {
++      f(0, 0, 0, 0, 0, 0, 0, &y,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x);
++    }
++  g();
++  return 1;
++}
+-- 
+2.34.1
+
+
+From 15e18831bf98fd25af098b970ebf0c9a6200a34b Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 12 Sep 2023 16:08:55 +0100
+Subject: [PATCH 15/19] aarch64: Put LR save probe in first 16 bytes
+
+-fstack-clash-protection uses the save of LR as a probe for the next
+allocation.  The next allocation could be:
+
+* another part of the static frame, e.g. when allocating SVE save slots
+  or outgoing arguments
+
+* an alloca in the same function
+
+* an allocation made by a callee function
+
+However, when -fomit-frame-pointer is used, the LR save slot is placed
+above the other GPR save slots.  It could therefore be up to 80 bytes
+above the base of the GPR save area (which is also the hard fp address).
+
+aarch64_allocate_and_probe_stack_space took this into account when
+deciding how much subsequent space could be allocated without needing
+a probe.  However, it interacted badly with:
+
+      /* If doing a small final adjustment, we always probe at offset 0.
+	 This is done to avoid issues when LR is not at position 0 or when
+	 the final adjustment is smaller than the probing offset.  */
+      else if (final_adjustment_p && rounded_size == 0)
+	residual_probe_offset = 0;
+
+which forces any allocation that is smaller than the guard page size
+to be probed at offset 0 rather than the usual offset 1024.  It was
+therefore possible to construct cases in which we had:
+
+* a probe using LR at SP + 80 bytes (or some other value >= 16)
+* an allocation of the guard page size - 16 bytes
+* a probe at SP + 0
+
+which allocates guard page size + 64 consecutive unprobed bytes.
+
+This patch requires the LR probe to be in the first 16 bytes of the
+save area when stack clash protection is active.  Doing it
+unconditionally would cause code-quality regressions.
+
+Putting LR before other registers prevents push/pop allocation
+when shadow call stacks are enabled, since LR is restored
+separately from the other callee-saved registers.
+
+The new comment doesn't say that the probe register is required
+to be LR, since a later patch removes that restriction.
+
+gcc/
+	* config/aarch64/aarch64.cc (aarch64_layout_frame): Ensure that
+	the LR save slot is in the first 16 bytes of the register save area.
+	Only form STP/LDP push/pop candidates if both registers are valid.
+	(aarch64_allocate_and_probe_stack_space): Remove workaround for
+	when LR was not in the first 16 bytes.
+
+gcc/testsuite/
+	* gcc.target/aarch64/stack-check-prologue-18.c: New test.
+	* gcc.target/aarch64/stack-check-prologue-19.c: Likewise.
+	* gcc.target/aarch64/stack-check-prologue-20.c: Likewise.
+---
+ gcc/config/aarch64/aarch64.cc                 |  72 ++++++-------
+ .../aarch64/stack-check-prologue-18.c         | 100 ++++++++++++++++++
+ .../aarch64/stack-check-prologue-19.c         | 100 ++++++++++++++++++
+ .../aarch64/stack-check-prologue-20.c         |   3 +
+ 4 files changed, 233 insertions(+), 42 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 34c1d8614cd..16433fb70f4 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -8273,26 +8273,34 @@ aarch64_layout_frame (void)
+   bool saves_below_hard_fp_p
+     = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
+   frame.bytes_below_hard_fp = offset;
++
++  auto allocate_gpr_slot = [&](unsigned int regno)
++    {
++      frame.reg_offset[regno] = offset;
++      if (frame.wb_push_candidate1 == INVALID_REGNUM)
++	frame.wb_push_candidate1 = regno;
++      else if (frame.wb_push_candidate2 == INVALID_REGNUM)
++	frame.wb_push_candidate2 = regno;
++      offset += UNITS_PER_WORD;
++    };
++
+   if (frame.emit_frame_chain)
+     {
+       /* FP and LR are placed in the linkage record.  */
+-      frame.reg_offset[R29_REGNUM] = offset;
+-      frame.wb_push_candidate1 = R29_REGNUM;
+-      frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
+-      frame.wb_push_candidate2 = R30_REGNUM;
+-      offset += 2 * UNITS_PER_WORD;
++      allocate_gpr_slot (R29_REGNUM);
++      allocate_gpr_slot (R30_REGNUM);
+     }
++  else if (flag_stack_clash_protection
++	   && known_eq (frame.reg_offset[R30_REGNUM], SLOT_REQUIRED))
++    /* Put the LR save slot first, since it makes a good choice of probe
++       for stack clash purposes.  The idea is that the link register usually
++       has to be saved before a call anyway, and so we lose little by
++       stopping it from being individually shrink-wrapped.  */
++    allocate_gpr_slot (R30_REGNUM);
+ 
+   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
+     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
+-      {
+-	frame.reg_offset[regno] = offset;
+-	if (frame.wb_push_candidate1 == INVALID_REGNUM)
+-	  frame.wb_push_candidate1 = regno;
+-	else if (frame.wb_push_candidate2 == INVALID_REGNUM)
+-	  frame.wb_push_candidate2 = regno;
+-	offset += UNITS_PER_WORD;
+-      }
++      allocate_gpr_slot (regno);
+ 
+   poly_int64 max_int_offset = offset;
+   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
+@@ -8370,10 +8378,13 @@ aarch64_layout_frame (void)
+      max_push_offset to 0, because no registers are popped at this time,
+      so callee_adjust cannot be adjusted.  */
+   HOST_WIDE_INT max_push_offset = 0;
+-  if (frame.wb_pop_candidate2 != INVALID_REGNUM)
+-    max_push_offset = 512;
+-  else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
+-    max_push_offset = 256;
++  if (frame.wb_pop_candidate1 != INVALID_REGNUM)
++    {
++      if (frame.wb_pop_candidate2 != INVALID_REGNUM)
++	max_push_offset = 512;
++      else
++	max_push_offset = 256;
++    }
+ 
+   HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
+   HOST_WIDE_INT const_saved_regs_size;
+@@ -9339,29 +9350,6 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+     = (final_adjustment_p
+        ? guard_used_by_caller + byte_sp_alignment
+        : guard_size - guard_used_by_caller);
+-  /* When doing the final adjustment for the outgoing arguments, take into
+-     account any unprobed space there is above the current SP.  There are
+-     two cases:
+-
+-     - When saving SVE registers below the hard frame pointer, we force
+-       the lowest save to take place in the prologue before doing the final
+-       adjustment (i.e. we don't allow the save to be shrink-wrapped).
+-       This acts as a probe at SP, so there is no unprobed space.
+-
+-     - When there are no SVE register saves, we use the store of the link
+-       register as a probe.  We can't assume that LR was saved at position 0
+-       though, so treat any space below it as unprobed.  */
+-  if (final_adjustment_p
+-      && known_eq (frame.below_hard_fp_saved_regs_size, 0))
+-    {
+-      poly_int64 lr_offset = (frame.reg_offset[LR_REGNUM]
+-			      - frame.bytes_below_saved_regs);
+-      if (known_ge (lr_offset, 0))
+-	min_probe_threshold -= lr_offset.to_constant ();
+-      else
+-	gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
+-    }
+-
+   poly_int64 frame_size = frame.frame_size;
+ 
+   /* We should always have a positive probe threshold.  */
+@@ -9541,8 +9529,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+       if (final_adjustment_p && rounded_size != 0)
+ 	min_probe_threshold = 0;
+       /* If doing a small final adjustment, we always probe at offset 0.
+-	 This is done to avoid issues when LR is not at position 0 or when
+-	 the final adjustment is smaller than the probing offset.  */
++	 This is done to avoid issues when the final adjustment is smaller
++	 than the probing offset.  */
+       else if (final_adjustment_p && rounded_size == 0)
+ 	residual_probe_offset = 0;
+ 
+diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
+new file mode 100644
+index 00000000000..82447d20fff
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
+@@ -0,0 +1,100 @@
++/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12" } */
++/* { dg-final { check-function-bodies "**" "" } } */
++
++void f(int, ...);
++void g();
++
++/*
++** test1:
++**	...
++**	str	x30, \[sp\]
++**	sub	sp, sp, #4064
++**	str	xzr, \[sp\]
++**	cbnz	w0, .*
++**	bl	g
++**	...
++**	str	x26, \[sp, #?4128\]
++**	...
++*/
++int test1(int z) {
++  __uint128_t x = 0;
++  int y[0x400];
++  if (z)
++    {
++      asm volatile ("" :::
++		    "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
++      f(0, 0, 0, 0, 0, 0, 0, &y,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x);
++    }
++  g();
++  return 1;
++}
++
++/*
++** test2:
++**	...
++**	str	x30, \[sp\]
++**	sub	sp, sp, #1040
++**	str	xzr, \[sp\]
++**	cbnz	w0, .*
++**	bl	g
++**	...
++*/
++int test2(int z) {
++  __uint128_t x = 0;
++  int y[0x400];
++  if (z)
++    {
++      asm volatile ("" :::
++		    "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
++      f(0, 0, 0, 0, 0, 0, 0, &y,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x);
++    }
++  g();
++  return 1;
++}
++
++/*
++** test3:
++**	...
++**	str	x30, \[sp\]
++**	sub	sp, sp, #1024
++**	cbnz	w0, .*
++**	bl	g
++**	...
++*/
++int test3(int z) {
++  __uint128_t x = 0;
++  int y[0x400];
++  if (z)
++    {
++      asm volatile ("" :::
++		    "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
++      f(0, 0, 0, 0, 0, 0, 0, &y,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
++    }
++  g();
++  return 1;
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
+new file mode 100644
+index 00000000000..73ac3e4e4eb
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
+@@ -0,0 +1,100 @@
++/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12 -fsanitize=shadow-call-stack -ffixed-x18" } */
++/* { dg-final { check-function-bodies "**" "" } } */
++
++void f(int, ...);
++void g();
++
++/*
++** test1:
++**	...
++**	str	x30, \[sp\]
++**	sub	sp, sp, #4064
++**	str	xzr, \[sp\]
++**	cbnz	w0, .*
++**	bl	g
++**	...
++**	str	x26, \[sp, #?4128\]
++**	...
++*/
++int test1(int z) {
++  __uint128_t x = 0;
++  int y[0x400];
++  if (z)
++    {
++      asm volatile ("" :::
++		    "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
++      f(0, 0, 0, 0, 0, 0, 0, &y,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x);
++    }
++  g();
++  return 1;
++}
++
++/*
++** test2:
++**	...
++**	str	x30, \[sp\]
++**	sub	sp, sp, #1040
++**	str	xzr, \[sp\]
++**	cbnz	w0, .*
++**	bl	g
++**	...
++*/
++int test2(int z) {
++  __uint128_t x = 0;
++  int y[0x400];
++  if (z)
++    {
++      asm volatile ("" :::
++		    "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
++      f(0, 0, 0, 0, 0, 0, 0, &y,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x);
++    }
++  g();
++  return 1;
++}
++
++/*
++** test3:
++**	...
++**	str	x30, \[sp\]
++**	sub	sp, sp, #1024
++**	cbnz	w0, .*
++**	bl	g
++**	...
++*/
++int test3(int z) {
++  __uint128_t x = 0;
++  int y[0x400];
++  if (z)
++    {
++      asm volatile ("" :::
++		    "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
++      f(0, 0, 0, 0, 0, 0, 0, &y,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
++	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
++    }
++  g();
++  return 1;
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c
+new file mode 100644
+index 00000000000..690aae8dfd5
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c
+@@ -0,0 +1,3 @@
++/* { dg-options "-O2 -fstack-protector-all -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12 -fsanitize=shadow-call-stack -ffixed-x18" } */
++
++#include "stack-check-prologue-19.c"
+-- 
+2.34.1
+
+
+From c4f0e121faa36342f1d21919e54a05ad841c4f86 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 12 Sep 2023 16:08:55 +0100
+Subject: [PATCH 16/19] aarch64: Simplify probe of final frame allocation
+
+Previous patches ensured that the final frame allocation only needs
+a probe when the size is strictly greater than 1KiB.  It's therefore
+safe to use the normal 1024 probe offset in all cases.
+
+The main motivation for doing this is to simplify the code and
+remove the number of special cases.
+
+gcc/
+	* config/aarch64/aarch64.cc (aarch64_allocate_and_probe_stack_space):
+	Always probe the residual allocation at offset 1024, asserting
+	that that is in range.
+
+gcc/testsuite/
+	* gcc.target/aarch64/stack-check-prologue-17.c: Expect the probe
+	to be at offset 1024 rather than offset 0.
+	* gcc.target/aarch64/stack-check-prologue-18.c: Likewise.
+	* gcc.target/aarch64/stack-check-prologue-19.c: Likewise.
+---
+ gcc/config/aarch64/aarch64.cc                        | 12 ++++--------
+ .../gcc.target/aarch64/stack-check-prologue-17.c     |  2 +-
+ .../gcc.target/aarch64/stack-check-prologue-18.c     |  4 ++--
+ .../gcc.target/aarch64/stack-check-prologue-19.c     |  4 ++--
+ 4 files changed, 9 insertions(+), 13 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 16433fb70f4..8abf3d7a1e2 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -9523,16 +9523,12 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+      are still safe.  */
+   if (residual)
+     {
+-      HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
++      gcc_assert (guard_used_by_caller + byte_sp_alignment <= size);
++
+       /* If we're doing final adjustments, and we've done any full page
+ 	 allocations then any residual needs to be probed.  */
+       if (final_adjustment_p && rounded_size != 0)
+ 	min_probe_threshold = 0;
+-      /* If doing a small final adjustment, we always probe at offset 0.
+-	 This is done to avoid issues when the final adjustment is smaller
+-	 than the probing offset.  */
+-      else if (final_adjustment_p && rounded_size == 0)
+-	residual_probe_offset = 0;
+ 
+       aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
+       if (residual >= min_probe_threshold)
+@@ -9543,8 +9539,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+ 		     HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
+ 		     "\n", residual);
+ 
+-	    emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+-					     residual_probe_offset));
++	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
++					   guard_used_by_caller));
+ 	  emit_insn (gen_blockage ());
+ 	}
+     }
+diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
+index 0d8a25d73a2..f0ec1389771 100644
+--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
+@@ -33,7 +33,7 @@ int test1(int z) {
+ **	...
+ **	str	x30, \[sp\]
+ **	sub	sp, sp, #1040
+-**	str	xzr, \[sp\]
++**	str	xzr, \[sp, #?1024\]
+ **	cbnz	w0, .*
+ **	bl	g
+ **	...
+diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
+index 82447d20fff..6383bec5ebc 100644
+--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
+@@ -9,7 +9,7 @@ void g();
+ **	...
+ **	str	x30, \[sp\]
+ **	sub	sp, sp, #4064
+-**	str	xzr, \[sp\]
++**	str	xzr, \[sp, #?1024\]
+ **	cbnz	w0, .*
+ **	bl	g
+ **	...
+@@ -50,7 +50,7 @@ int test1(int z) {
+ **	...
+ **	str	x30, \[sp\]
+ **	sub	sp, sp, #1040
+-**	str	xzr, \[sp\]
++**	str	xzr, \[sp, #?1024\]
+ **	cbnz	w0, .*
+ **	bl	g
+ **	...
+diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
+index 73ac3e4e4eb..562039b5e9b 100644
+--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
+@@ -9,7 +9,7 @@ void g();
+ **	...
+ **	str	x30, \[sp\]
+ **	sub	sp, sp, #4064
+-**	str	xzr, \[sp\]
++**	str	xzr, \[sp, #?1024\]
+ **	cbnz	w0, .*
+ **	bl	g
+ **	...
+@@ -50,7 +50,7 @@ int test1(int z) {
+ **	...
+ **	str	x30, \[sp\]
+ **	sub	sp, sp, #1040
+-**	str	xzr, \[sp\]
++**	str	xzr, \[sp, #?1024\]
+ **	cbnz	w0, .*
+ **	bl	g
+ **	...
+-- 
+2.34.1
+
+
+From 6f0ab0a9f46a17b68349ff6035aa776bf65f0575 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 12 Sep 2023 16:08:56 +0100
+Subject: [PATCH 17/19] aarch64: Explicitly record probe registers in frame
+ info
+
+The stack frame is currently divided into three areas:
+
+A: the area above the hard frame pointer
+B: the SVE saves below the hard frame pointer
+C: the outgoing arguments
+
+If the stack frame is allocated in one chunk, the allocation needs a
+probe if the frame size is >= guard_size - 1KiB.  In addition, if the
+function is not a leaf function, it must probe an address no more than
+1KiB above the outgoing SP.  We ensured the second condition by
+
+(1) using single-chunk allocations for non-leaf functions only if
+    the link register save slot is within 512 bytes of the bottom
+    of the frame; and
+
+(2) using the link register save as a probe (meaning, for instance,
+    that it can't be individually shrink wrapped)
+
+If instead the stack is allocated in multiple chunks, then:
+
+* an allocation involving only the outgoing arguments (C above) requires
+  a probe if the allocation size is > 1KiB
+
+* any other allocation requires a probe if the allocation size
+  is >= guard_size - 1KiB
+
+* second and subsequent allocations require the previous allocation
+  to probe at the bottom of the allocated area, regardless of the size
+  of that previous allocation
+
+The final point means that, unlike for single allocations,
+it can be necessary to have both a non-SVE register probe and
+an SVE register probe.  For example:
+
+* allocate A, probe using a non-SVE register save
+* allocate B, probe using an SVE register save
+* allocate C
+
+The non-SVE register used in this case was again the link register.
+It was previously used even if the link register save slot was some
+bytes above the bottom of the non-SVE register saves, but an earlier
+patch avoided that by putting the link register save slot first.
+
+As a belt-and-braces fix, this patch explicitly records which
+probe registers we're using and allows the non-SVE probe to be
+whichever register comes first (as for SVE).
+
+The patch also avoids unnecessary probes in sve/pcs/stack_clash_3.c.
+
+gcc/
+	* config/aarch64/aarch64.h (aarch64_frame::sve_save_and_probe)
+	(aarch64_frame::hard_fp_save_and_probe): New fields.
+	* config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize them.
+	Rather than asserting that a leaf function saves LR, instead assert
+	that a leaf function saves something.
+	(aarch64_get_separate_components): Prevent the chosen probe
+	registers from being individually shrink-wrapped.
+	(aarch64_allocate_and_probe_stack_space): Remove workaround for
+	probe registers that aren't at the bottom of the previous allocation.
+
+gcc/testsuite/
+	* gcc.target/aarch64/sve/pcs/stack_clash_3.c: Avoid redundant probes.
+---
+ gcc/config/aarch64/aarch64.cc                 | 68 +++++++++++++++----
+ gcc/config/aarch64/aarch64.h                  |  8 +++
+ .../aarch64/sve/pcs/stack_clash_3.c           |  6 +-
+ 3 files changed, 64 insertions(+), 18 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 8abf3d7a1e2..a8d907df884 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -8210,15 +8210,11 @@ aarch64_layout_frame (void)
+ 	&& !crtl->abi->clobbers_full_reg_p (regno))
+       frame.reg_offset[regno] = SLOT_REQUIRED;
+ 
+-  /* With stack-clash, LR must be saved in non-leaf functions.  The saving of
+-     LR counts as an implicit probe which allows us to maintain the invariant
+-     described in the comment at expand_prologue.  */
+-  gcc_assert (crtl->is_leaf
+-	      || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
+ 
+   poly_int64 offset = crtl->outgoing_args_size;
+   gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
+   frame.bytes_below_saved_regs = offset;
++  frame.sve_save_and_probe = INVALID_REGNUM;
+ 
+   /* Now assign stack slots for the registers.  Start with the predicate
+      registers, since predicate LDR and STR have a relatively small
+@@ -8226,6 +8222,8 @@ aarch64_layout_frame (void)
+   for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
+     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
+       {
++	if (frame.sve_save_and_probe == INVALID_REGNUM)
++	  frame.sve_save_and_probe = regno;
+ 	frame.reg_offset[regno] = offset;
+ 	offset += BYTES_PER_SVE_PRED;
+       }
+@@ -8263,6 +8261,8 @@ aarch64_layout_frame (void)
+     for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
+       if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
+ 	{
++	  if (frame.sve_save_and_probe == INVALID_REGNUM)
++	    frame.sve_save_and_probe = regno;
+ 	  frame.reg_offset[regno] = offset;
+ 	  offset += vector_save_size;
+ 	}
+@@ -8272,10 +8272,18 @@ aarch64_layout_frame (void)
+   frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
+   bool saves_below_hard_fp_p
+     = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
++  gcc_assert (!saves_below_hard_fp_p
++	      || (frame.sve_save_and_probe != INVALID_REGNUM
++		  && known_eq (frame.reg_offset[frame.sve_save_and_probe],
++			       frame.bytes_below_saved_regs)));
++
+   frame.bytes_below_hard_fp = offset;
++  frame.hard_fp_save_and_probe = INVALID_REGNUM;
+ 
+   auto allocate_gpr_slot = [&](unsigned int regno)
+     {
++      if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
++	frame.hard_fp_save_and_probe = regno;
+       frame.reg_offset[regno] = offset;
+       if (frame.wb_push_candidate1 == INVALID_REGNUM)
+ 	frame.wb_push_candidate1 = regno;
+@@ -8309,6 +8317,8 @@ aarch64_layout_frame (void)
+   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
+     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
+       {
++	if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
++	  frame.hard_fp_save_and_probe = regno;
+ 	/* If there is an alignment gap between integer and fp callee-saves,
+ 	   allocate the last fp register to it if possible.  */
+ 	if (regno == last_fp_reg
+@@ -8332,6 +8342,17 @@ aarch64_layout_frame (void)
+   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
+ 
+   frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
++  gcc_assert (known_eq (frame.saved_regs_size,
++			frame.below_hard_fp_saved_regs_size)
++	      || (frame.hard_fp_save_and_probe != INVALID_REGNUM
++		  && known_eq (frame.reg_offset[frame.hard_fp_save_and_probe],
++			       frame.bytes_below_hard_fp)));
++
++  /* With stack-clash, a register must be saved in non-leaf functions.
++     The saving of the bottommost register counts as an implicit probe,
++     which allows us to maintain the invariant described in the comment
++     at expand_prologue.  */
++  gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0));
+ 
+   offset += get_frame_size ();
+   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
+@@ -8462,6 +8483,25 @@ aarch64_layout_frame (void)
+       frame.final_adjust = frame.bytes_below_saved_regs;
+     }
+ 
++  /* The frame is allocated in pieces, with each non-final piece
++     including a register save at offset 0 that acts as a probe for
++     the following piece.  In addition, the save of the bottommost register
++     acts as a probe for callees and allocas.  Roll back any probes that
++     aren't needed.
++
++     A probe isn't needed if it is associated with the final allocation
++     (including callees and allocas) that happens before the epilogue is
++     executed.  */
++  if (crtl->is_leaf
++      && !cfun->calls_alloca
++      && known_eq (frame.final_adjust, 0))
++    {
++      if (maybe_ne (frame.sve_callee_adjust, 0))
++	frame.sve_save_and_probe = INVALID_REGNUM;
++      else
++	frame.hard_fp_save_and_probe = INVALID_REGNUM;
++    }
++
+   /* Make sure the individual adjustments add up to the full frame size.  */
+   gcc_assert (known_eq (frame.initial_adjust
+ 			+ frame.callee_adjust
+@@ -9039,13 +9079,6 @@ aarch64_get_separate_components (void)
+ 
+ 	poly_int64 offset = frame.reg_offset[regno];
+ 
+-	/* If the register is saved in the first SVE save slot, we use
+-	   it as a stack probe for -fstack-clash-protection.  */
+-	if (flag_stack_clash_protection
+-	    && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
+-	    && known_eq (offset, frame.bytes_below_saved_regs))
+-	  continue;
+-
+ 	/* Get the offset relative to the register we'll use.  */
+ 	if (frame_pointer_needed)
+ 	  offset -= frame.bytes_below_hard_fp;
+@@ -9080,6 +9113,13 @@ aarch64_get_separate_components (void)
+ 
+   bitmap_clear_bit (components, LR_REGNUM);
+   bitmap_clear_bit (components, SP_REGNUM);
++  if (flag_stack_clash_protection)
++    {
++      if (frame.sve_save_and_probe != INVALID_REGNUM)
++	bitmap_clear_bit (components, frame.sve_save_and_probe);
++      if (frame.hard_fp_save_and_probe != INVALID_REGNUM)
++	bitmap_clear_bit (components, frame.hard_fp_save_and_probe);
++    }
+ 
+   return components;
+ }
+@@ -9616,8 +9656,8 @@ aarch64_epilogue_uses (int regno)
+    When probing is needed, we emit a probe at the start of the prologue
+    and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
+ 
+-   We have to track how much space has been allocated and the only stores
+-   to the stack we track as implicit probes are the FP/LR stores.
++   We can also use register saves as probes.  These are stored in
++   sve_save_and_probe and hard_fp_save_and_probe.
+ 
+    For outgoing arguments we probe if the size is larger than 1KB, such that
+    the ABI specified buffer is maintained for the next callee.
+diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
+index b6135837073..46d4693e206 100644
+--- a/gcc/config/aarch64/aarch64.h
++++ b/gcc/config/aarch64/aarch64.h
+@@ -957,6 +957,14 @@ struct GTY (()) aarch64_frame
+      This is the register they should use.  */
+   unsigned spare_pred_reg;
+ 
++  /* An SVE register that is saved below the hard frame pointer and that acts
++     as a probe for later allocations, or INVALID_REGNUM if none.  */
++  unsigned sve_save_and_probe;
++
++  /* A register that is saved at the hard frame pointer and that acts
++     as a probe for later allocations, or INVALID_REGNUM if none.  */
++  unsigned hard_fp_save_and_probe;
++
+   bool laid_out;
+ 
+   /* True if shadow call stack should be enabled for the current function.  */
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
+index 3e01ec36c3a..3530a0d504b 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
+@@ -11,11 +11,10 @@
+ **	mov	x11, sp
+ **	...
+ **	sub	sp, sp, x13
+-**	str	p4, \[sp\]
+ **	cbz	w0, [^\n]*
++**	str	p4, \[sp\]
+ **	...
+ **	ptrue	p0\.b, all
+-**	ldr	p4, \[sp\]
+ **	addvl	sp, sp, #1
+ **	ldr	x24, \[sp\], 32
+ **	ret
+@@ -39,13 +38,12 @@ test_1 (int n)
+ **	mov	x11, sp
+ **	...
+ **	sub	sp, sp, x13
+-**	str	p4, \[sp\]
+ **	cbz	w0, [^\n]*
++**	str	p4, \[sp\]
+ **	str	p5, \[sp, #1, mul vl\]
+ **	str	p6, \[sp, #2, mul vl\]
+ **	...
+ **	ptrue	p0\.b, all
+-**	ldr	p4, \[sp\]
+ **	addvl	sp, sp, #1
+ **	ldr	x24, \[sp\], 32
+ **	ret
+-- 
+2.34.1
+
+
+From 8254e1b9cd500e0c278465a3657543477e9d1250 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 12 Sep 2023 16:08:56 +0100
+Subject: [PATCH 18/19] aarch64: Remove below_hard_fp_saved_regs_size
+
+After previous patches, it's no longer necessary to store
+saved_regs_size and below_hard_fp_saved_regs_size in the frame info.
+All measurements instead use the top or bottom of the frame as
+reference points.
+
+gcc/
+	* config/aarch64/aarch64.h (aarch64_frame::saved_regs_size)
+	(aarch64_frame::below_hard_fp_saved_regs_size): Delete.
+	* config/aarch64/aarch64.cc (aarch64_layout_frame): Update accordingly.
+---
+ gcc/config/aarch64/aarch64.cc | 45 ++++++++++++++++-------------------
+ gcc/config/aarch64/aarch64.h  |  7 ------
+ 2 files changed, 21 insertions(+), 31 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index a8d907df884..ac3d3b336a3 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -8269,9 +8269,8 @@ aarch64_layout_frame (void)
+ 
+   /* OFFSET is now the offset of the hard frame pointer from the bottom
+      of the callee save area.  */
+-  frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
+-  bool saves_below_hard_fp_p
+-    = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
++  auto below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
++  bool saves_below_hard_fp_p = maybe_ne (below_hard_fp_saved_regs_size, 0);
+   gcc_assert (!saves_below_hard_fp_p
+ 	      || (frame.sve_save_and_probe != INVALID_REGNUM
+ 		  && known_eq (frame.reg_offset[frame.sve_save_and_probe],
+@@ -8341,9 +8340,8 @@ aarch64_layout_frame (void)
+ 
+   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
+ 
+-  frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
+-  gcc_assert (known_eq (frame.saved_regs_size,
+-			frame.below_hard_fp_saved_regs_size)
++  auto saved_regs_size = offset - frame.bytes_below_saved_regs;
++  gcc_assert (known_eq (saved_regs_size, below_hard_fp_saved_regs_size)
+ 	      || (frame.hard_fp_save_and_probe != INVALID_REGNUM
+ 		  && known_eq (frame.reg_offset[frame.hard_fp_save_and_probe],
+ 			       frame.bytes_below_hard_fp)));
+@@ -8352,7 +8350,7 @@ aarch64_layout_frame (void)
+      The saving of the bottommost register counts as an implicit probe,
+      which allows us to maintain the invariant described in the comment
+      at expand_prologue.  */
+-  gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0));
++  gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
+ 
+   offset += get_frame_size ();
+   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
+@@ -8409,7 +8407,7 @@ aarch64_layout_frame (void)
+ 
+   HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
+   HOST_WIDE_INT const_saved_regs_size;
+-  if (known_eq (frame.saved_regs_size, 0))
++  if (known_eq (saved_regs_size, 0))
+     frame.initial_adjust = frame.frame_size;
+   else if (frame.frame_size.is_constant (&const_size)
+ 	   && const_size < max_push_offset
+@@ -8422,7 +8420,7 @@ aarch64_layout_frame (void)
+       frame.callee_adjust = const_size;
+     }
+   else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs)
+-	   && frame.saved_regs_size.is_constant (&const_saved_regs_size)
++	   && saved_regs_size.is_constant (&const_saved_regs_size)
+ 	   && const_below_saved_regs + const_saved_regs_size < 512
+ 	   /* We could handle this case even with data below the saved
+ 	      registers, provided that that data left us with valid offsets
+@@ -8441,8 +8439,7 @@ aarch64_layout_frame (void)
+       frame.initial_adjust = frame.frame_size;
+     }
+   else if (saves_below_hard_fp_p
+-	   && known_eq (frame.saved_regs_size,
+-			frame.below_hard_fp_saved_regs_size))
++	   && known_eq (saved_regs_size, below_hard_fp_saved_regs_size))
+     {
+       /* Frame in which all saves are SVE saves:
+ 
+@@ -8464,7 +8461,7 @@ aarch64_layout_frame (void)
+ 	 [save SVE registers relative to SP]
+ 	 sub sp, sp, bytes_below_saved_regs  */
+       frame.callee_adjust = const_above_fp;
+-      frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
++      frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
+       frame.final_adjust = frame.bytes_below_saved_regs;
+     }
+   else
+@@ -8479,7 +8476,7 @@ aarch64_layout_frame (void)
+ 	 [save SVE registers relative to SP]
+ 	 sub sp, sp, bytes_below_saved_regs  */
+       frame.initial_adjust = frame.bytes_above_hard_fp;
+-      frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
++      frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
+       frame.final_adjust = frame.bytes_below_saved_regs;
+     }
+ 
+@@ -9621,17 +9618,17 @@ aarch64_epilogue_uses (int regno)
+ 	|  local variables              | <-- frame_pointer_rtx
+ 	|                               |
+ 	+-------------------------------+
+-	|  padding                      | \
+-	+-------------------------------+  |
+-	|  callee-saved registers       |  | frame.saved_regs_size
+-	+-------------------------------+  |
+-	|  LR'                          |  |
+-	+-------------------------------+  |
+-	|  FP'                          |  |
+-	+-------------------------------+  |<- hard_frame_pointer_rtx (aligned)
+-	|  SVE vector registers         |  | \
+-	+-------------------------------+  |  | below_hard_fp_saved_regs_size
+-	|  SVE predicate registers      | /  /
++	|  padding                      |
++	+-------------------------------+
++	|  callee-saved registers       |
++	+-------------------------------+
++	|  LR'                          |
++	+-------------------------------+
++	|  FP'                          |
++	+-------------------------------+ <-- hard_frame_pointer_rtx (aligned)
++	|  SVE vector registers         |
++	+-------------------------------+
++	|  SVE predicate registers      |
+ 	+-------------------------------+
+ 	|  dynamic allocation           |
+ 	+-------------------------------+
+diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
+index 46d4693e206..01f7751bc78 100644
+--- a/gcc/config/aarch64/aarch64.h
++++ b/gcc/config/aarch64/aarch64.h
+@@ -871,18 +871,11 @@ struct GTY (()) aarch64_frame
+      STACK_BOUNDARY.  */
+   HOST_WIDE_INT saved_varargs_size;
+ 
+-  /* The size of the callee-save registers with a slot in REG_OFFSET.  */
+-  poly_int64 saved_regs_size;
+-
+   /* The number of bytes between the bottom of the static frame (the bottom
+      of the outgoing arguments) and the bottom of the register save area.
+      This value is always a multiple of STACK_BOUNDARY.  */
+   poly_int64 bytes_below_saved_regs;
+ 
+-  /* The size of the callee-save registers with a slot in REG_OFFSET that
+-     are saved below the hard frame pointer.  */
+-  poly_int64 below_hard_fp_saved_regs_size;
+-
+   /* The number of bytes between the bottom of the static frame (the bottom
+      of the outgoing arguments) and the hard frame pointer.  This value is
+      always a multiple of STACK_BOUNDARY.  */
+-- 
+2.34.1
+
+
+From 75c37e031408262263442f5b4cdb83d3777b6422 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 12 Sep 2023 16:08:57 +0100
+Subject: [PATCH 19/19] aarch64: Make stack smash canary protect saved
+ registers
+
+AArch64 normally puts the saved registers near the bottom of the frame,
+immediately above any dynamic allocations.  But this means that a
+stack-smash attack on those dynamic allocations could overwrite the
+saved registers without needing to reach as far as the stack smash
+canary.
+
+The same thing could also happen for variable-sized arguments that are
+passed by value, since those are allocated before a call and popped on
+return.
+
+This patch avoids that by putting the locals (and thus the canary) below
+the saved registers when stack smash protection is active.
+
+The patch fixes CVE-2023-4039.
+
+gcc/
+	* config/aarch64/aarch64.cc (aarch64_save_regs_above_locals_p):
+	New function.
+	(aarch64_layout_frame): Use it to decide whether locals should
+	go above or below the saved registers.
+	(aarch64_expand_prologue): Update stack layout comment.
+	Emit a stack tie after the final adjustment.
+
+gcc/testsuite/
+	* gcc.target/aarch64/stack-protector-8.c: New test.
+	* gcc.target/aarch64/stack-protector-9.c: Likewise.
+---
+ gcc/config/aarch64/aarch64.cc                 | 46 +++++++--
+ .../gcc.target/aarch64/stack-protector-8.c    | 95 +++++++++++++++++++
+ .../gcc.target/aarch64/stack-protector-9.c    | 33 +++++++
+ 3 files changed, 168 insertions(+), 6 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index ac3d3b336a3..96c3f48fdc4 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -8133,6 +8133,20 @@ aarch64_needs_frame_chain (void)
+   return aarch64_use_frame_pointer;
+ }
+ 
++/* Return true if the current function should save registers above
++   the locals area, rather than below it.  */
++
++static bool
++aarch64_save_regs_above_locals_p ()
++{
++  /* When using stack smash protection, make sure that the canary slot
++     comes between the locals and the saved registers.  Otherwise,
++     it would be possible for a carefully sized smash attack to change
++     the saved registers (particularly LR and FP) without reaching the
++     canary.  */
++  return crtl->stack_protect_guard;
++}
++
+ /* Mark the registers that need to be saved by the callee and calculate
+    the size of the callee-saved registers area and frame record (both FP
+    and LR may be omitted).  */
+@@ -8144,6 +8158,7 @@ aarch64_layout_frame (void)
+   poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
+   bool frame_related_fp_reg_p = false;
+   aarch64_frame &frame = cfun->machine->frame;
++  poly_int64 top_of_locals = -1;
+ 
+   frame.emit_frame_chain = aarch64_needs_frame_chain ();
+ 
+@@ -8210,9 +8225,16 @@ aarch64_layout_frame (void)
+ 	&& !crtl->abi->clobbers_full_reg_p (regno))
+       frame.reg_offset[regno] = SLOT_REQUIRED;
+ 
++  bool regs_at_top_p = aarch64_save_regs_above_locals_p ();
+ 
+   poly_int64 offset = crtl->outgoing_args_size;
+   gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
++  if (regs_at_top_p)
++    {
++      offset += get_frame_size ();
++      offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
++      top_of_locals = offset;
++    }
+   frame.bytes_below_saved_regs = offset;
+   frame.sve_save_and_probe = INVALID_REGNUM;
+ 
+@@ -8352,15 +8374,18 @@ aarch64_layout_frame (void)
+      at expand_prologue.  */
+   gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
+ 
+-  offset += get_frame_size ();
+-  offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
+-  auto top_of_locals = offset;
+-
++  if (!regs_at_top_p)
++    {
++      offset += get_frame_size ();
++      offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
++      top_of_locals = offset;
++    }
+   offset += frame.saved_varargs_size;
+   gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
+   frame.frame_size = offset;
+ 
+   frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp;
++  gcc_assert (known_ge (top_of_locals, 0));
+   frame.bytes_above_locals = frame.frame_size - top_of_locals;
+ 
+   frame.initial_adjust = 0;
+@@ -9615,10 +9640,10 @@ aarch64_epilogue_uses (int regno)
+ 	|  for register varargs         |
+ 	|                               |
+ 	+-------------------------------+
+-	|  local variables              | <-- frame_pointer_rtx
++	|  local variables (1)          | <-- frame_pointer_rtx
+ 	|                               |
+ 	+-------------------------------+
+-	|  padding                      |
++	|  padding (1)                  |
+ 	+-------------------------------+
+ 	|  callee-saved registers       |
+ 	+-------------------------------+
+@@ -9630,6 +9655,10 @@ aarch64_epilogue_uses (int regno)
+ 	+-------------------------------+
+ 	|  SVE predicate registers      |
+ 	+-------------------------------+
++	|  local variables (2)          |
++	+-------------------------------+
++	|  padding (2)                  |
++	+-------------------------------+
+ 	|  dynamic allocation           |
+ 	+-------------------------------+
+ 	|  padding                      |
+@@ -9639,6 +9668,9 @@ aarch64_epilogue_uses (int regno)
+ 	+-------------------------------+
+ 	|                               | <-- stack_pointer_rtx (aligned)
+ 
++   The regions marked (1) and (2) are mutually exclusive.  (2) is used
++   when aarch64_save_regs_above_locals_p is true.
++
+    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
+    but leave frame_pointer_rtx and hard_frame_pointer_rtx
+    unchanged.
+@@ -9834,6 +9866,8 @@ aarch64_expand_prologue (void)
+   gcc_assert (known_eq (bytes_below_sp, final_adjust));
+   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
+ 					  !frame_pointer_needed, true);
++  if (emit_frame_chain && maybe_ne (final_adjust, 0))
++    emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
+ }
+ 
+ /* Return TRUE if we can use a simple_return insn.
+diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
+new file mode 100644
+index 00000000000..e71d820e365
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
+@@ -0,0 +1,95 @@
++/* { dg-options " -O -fstack-protector-strong -mstack-protector-guard=sysreg -mstack-protector-guard-reg=tpidr2_el0 -mstack-protector-guard-offset=16" } */
++/* { dg-final { check-function-bodies "**" "" } } */
++
++void g(void *);
++__SVBool_t *h(void *);
++
++/*
++** test1:
++**	sub	sp, sp, #288
++**	stp	x29, x30, \[sp, #?272\]
++**	add	x29, sp, #?272
++**	mrs	(x[0-9]+), tpidr2_el0
++**	ldr	(x[0-9]+), \[\1, #?16\]
++**	str	\2, \[sp, #?264\]
++**	mov	\2, #?0
++**	add	x0, sp, #?8
++**	bl	g
++**	...
++**	mrs	.*
++**	...
++**	bne	.*
++**	...
++**	ldp	x29, x30, \[sp, #?272\]
++**	add	sp, sp, #?288
++**	ret
++**	bl	__stack_chk_fail
++*/
++int test1() {
++  int y[0x40];
++  g(y);
++  return 1;
++}
++
++/*
++** test2:
++**	stp	x29, x30, \[sp, #?-16\]!
++**	mov	x29, sp
++**	sub	sp, sp, #1040
++**	mrs	(x[0-9]+), tpidr2_el0
++**	ldr	(x[0-9]+), \[\1, #?16\]
++**	str	\2, \[sp, #?1032\]
++**	mov	\2, #?0
++**	add	x0, sp, #?8
++**	bl	g
++**	...
++**	mrs	.*
++**	...
++**	bne	.*
++**	...
++**	add	sp, sp, #?1040
++**	ldp	x29, x30, \[sp\], #?16
++**	ret
++**	bl	__stack_chk_fail
++*/
++int test2() {
++  int y[0x100];
++  g(y);
++  return 1;
++}
++
++#pragma GCC target "+sve"
++
++/*
++** test3:
++**	stp	x29, x30, \[sp, #?-16\]!
++**	mov	x29, sp
++**	addvl	sp, sp, #-18
++**	...
++**	str	p4, \[sp\]
++**	...
++**	sub	sp, sp, #272
++**	mrs	(x[0-9]+), tpidr2_el0
++**	ldr	(x[0-9]+), \[\1, #?16\]
++**	str	\2, \[sp, #?264\]
++**	mov	\2, #?0
++**	add	x0, sp, #?8
++**	bl	h
++**	...
++**	mrs	.*
++**	...
++**	bne	.*
++**	...
++**	add	sp, sp, #?272
++**	...
++**	ldr	p4, \[sp\]
++**	...
++**	addvl	sp, sp, #18
++**	ldp	x29, x30, \[sp\], #?16
++**	ret
++**	bl	__stack_chk_fail
++*/
++__SVBool_t test3() {
++  int y[0x40];
++  return *h(y);
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
+new file mode 100644
+index 00000000000..58f322aa480
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
+@@ -0,0 +1,33 @@
++/* { dg-options "-O2 -mcpu=neoverse-v1 -fstack-protector-all" } */
++/* { dg-final { check-function-bodies "**" "" } } */
++
++/*
++** main:
++**	...
++**	stp	x29, x30, \[sp, #?-[0-9]+\]!
++**	...
++**	sub	sp, sp, #[0-9]+
++**	...
++**	str	x[0-9]+, \[x29, #?-8\]
++**	...
++*/
++int f(const char *);
++void g(void *);
++int main(int argc, char* argv[])
++{
++  int a;
++  int b;
++  char c[2+f(argv[1])];
++  int d[0x100];
++  char y;
++
++  y=42; a=4; b=10;
++  c[0] = 'h'; c[1] = '\0';
++
++  c[f(argv[2])] = '\0';
++
++  __builtin_printf("%d %d\n%s\n", a, b, c);
++  g(d);
++
++  return 0;
++}
+-- 
+2.34.1
+