Message ID | 20230912172439.2336327-1-ross.burton@arm.com |
---|---|
State | New |
Headers | show |
Series | [mickledore] gcc: Fix -fstack-protector issue on aarch64 | expand |
FYI: one of LGE's proprietary components triggers an ICE with this applied. I'll try to find a minimal reproducer later; this is just for other people who might hit the same issue: error: unrecognizable insn: 2923 | } | ^ (insn 416 286 290 17 (parallel [ (set (mem/c:SI (plus:DI (reg/f:DI 29 x29) (const_int -260 [0xfffffffffffffefc])) [1 redacted.pixel_format+0 S4 A32]) (const_int 0 [0])) (set (mem/c:SI (plus:DI (reg/f:DI 29 x29) (const_int -256 [0xffffffffffffff00])) [1 redacted.pixel_value+0 S4 A128]) (reg/v:SI 22 x22 [orig:141 color ] [141])) ]) "TOPDIR/BUILD/work/redacted/redacted/redacted/git/redacted.c":2903:45 -1 (expr_list:REG_DEAD (reg/v:SI 22 x22 [orig:141 color ] [141]) (nil))) during RTL pass: cprop_hardreg TOPDIR/BUILD/work/redacted/redacted/redacted/git/redacted.c:2923:1: internal compiler error: in extract_insn, at recog.cc:2791 0x191624a internal_error(char const*, ...) ???:0 0x6bee26 fancy_abort(char const*, int, char const*) ???:0 0x697469 _fatal_insn(char const*, rtx_def const*, char const*, int, char const*) ???:0 0x697485 _fatal_insn_not_found(rtx_def const*, char const*, int, char const*) ???:0 0xbef198 extract_constrain_insn(rtx_insn*) ???:0 On Tue, Sep 12, 2023 at 7:24 PM Ross Burton <ross.burton@arm.com> wrote: > From: Ross Burton <ross.burton@arm.com> > > This series of patches fixes deficiencies in GCC's -fstack-protector > implementation for AArch64 when using dynamically allocated stack space. > This is CVE-2023-4039. See: > > > https://developer.arm.com/Arm%20Security%20Center/GCC%20Stack%20Protector%20Vulnerability%20AArch64 > > https://github.com/metaredteam/external-disclosures/security/advisories/GHSA-x7ch-h5rf-w2mf > > for more details. 
> > Signed-off-by: Ross Burton <ross.burton@arm.com> > --- > meta/recipes-devtools/gcc/gcc-12.3.inc | 1 + > .../gcc/gcc/CVE-2023-4039.patch | 3093 +++++++++++++++++ > 2 files changed, 3094 insertions(+) > create mode 100644 meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch > > diff --git a/meta/recipes-devtools/gcc/gcc-12.3.inc > b/meta/recipes-devtools/gcc/gcc-12.3.inc > index 4ec03f925c8..5896f26e1af 100644 > --- a/meta/recipes-devtools/gcc/gcc-12.3.inc > +++ b/meta/recipes-devtools/gcc/gcc-12.3.inc > @@ -63,6 +63,7 @@ SRC_URI = "${BASEURI} \ > file://0026-rust-recursion-limit.patch \ > file://prefix-map-realpath.patch \ > file://hardcoded-paths.patch \ > + file://CVE-2023-4039.patch \ > " > SRC_URI[sha256sum] = > "949a5d4f99e786421a93b532b22ffab5578de7321369975b91aec97adfda8c3b" > > diff --git a/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch > b/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch > new file mode 100644 > index 00000000000..8cb52849cd3 > --- /dev/null > +++ b/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch > @@ -0,0 +1,3093 @@ > +From: Richard Sandiford <richard.sandiford@arm.com> > +Subject: [PATCH 00/19] aarch64: Fix -fstack-protector issue > +Date: Tue, 12 Sep 2023 16:25:10 +0100 > + > +This series of patches fixes deficiencies in GCC's -fstack-protector > +implementation for AArch64 when using dynamically allocated stack space. > +This is CVE-2023-4039. See: > + > + > https://developer.arm.com/Arm%20Security%20Center/GCC%20Stack%20Protector%20Vulnerability%20AArch64 > + > https://github.com/metaredteam/external-disclosures/security/advisories/GHSA-x7ch-h5rf-w2mf > + > +for more details. > + > +The fix is to put the saved registers above the locals area when > +-fstack-protector is used. > + > +The series also fixes a stack-clash problem that I found while working > +on the CVE. In unpatched sources, the stack-clash problem would only > +trigger for unrealistic numbers of arguments (8K 64-bit arguments, or an > +equivalent). 
But it would be a more significant issue with the new > +-fstack-protector frame layout. It's therefore important that both > +problems are fixed together. > + > +Some reorganisation of the code seemed necessary to fix the problems in a > +cleanish way. The series is therefore quite long, but only a handful of > +patches should have any effect on code generation. > + > +See the individual patches for a detailed description. > + > +Tested on aarch64-linux-gnu. Pushed to trunk and to all active branches. > +I've also pushed backports to GCC 7+ to vendors/ARM/heads/CVE-2023-4039. > + > +CVE: CVE-2023-4039 > +Upstream-Status: Backport > +Signed-off-by: Ross Burton <ross.burton@arm.com> > + > + > +From 62fbb215cc817e9f2c1ca80282a64f4ee30806bc Mon Sep 17 00:00:00 2001 > +From: Richard Sandiford <richard.sandiford@arm.com> > +Date: Tue, 12 Sep 2023 16:08:48 +0100 > +Subject: [PATCH 01/19] aarch64: Use local frame vars in shrink-wrapping > code > + > +aarch64_layout_frame uses a shorthand for referring to > +cfun->machine->frame: > + > + aarch64_frame &frame = cfun->machine->frame; > + > +This patch does the same for some other heavy users of the structure. > +No functional change intended. > + > +gcc/ > + * config/aarch64/aarch64.cc (aarch64_save_callee_saves): Use > + a local shorthand for cfun->machine->frame. > + (aarch64_restore_callee_saves, aarch64_get_separate_components): > + (aarch64_process_components): Likewise. > + (aarch64_allocate_and_probe_stack_space): Likewise. > + (aarch64_expand_prologue, aarch64_expand_epilogue): Likewise. > + (aarch64_layout_frame): Use existing shorthand for one more case. 
> +--- > + gcc/config/aarch64/aarch64.cc | 123 ++++++++++++++++++---------------- > + 1 file changed, 64 insertions(+), 59 deletions(-) > + > +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > +index 226dc9dffd4..ae42ffdedbe 100644 > +--- a/gcc/config/aarch64/aarch64.cc > ++++ b/gcc/config/aarch64/aarch64.cc > +@@ -8351,7 +8351,7 @@ aarch64_layout_frame (void) > + frame.is_scs_enabled > + = (!crtl->calls_eh_return > + && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK) > +- && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)); > ++ && known_ge (frame.reg_offset[LR_REGNUM], 0)); > + > + /* When shadow call stack is enabled, the scs_pop in the epilogue will > + restore x30, and we don't need to pop x30 again in the traditional > +@@ -8763,6 +8763,7 @@ aarch64_save_callee_saves (poly_int64 start_offset, > + unsigned start, unsigned limit, bool skip_wb, > + bool hard_fp_valid_p) > + { > ++ aarch64_frame &frame = cfun->machine->frame; > + rtx_insn *insn; > + unsigned regno; > + unsigned regno2; > +@@ -8777,8 +8778,8 @@ aarch64_save_callee_saves (poly_int64 start_offset, > + bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno); > + > + if (skip_wb > +- && (regno == cfun->machine->frame.wb_push_candidate1 > +- || regno == cfun->machine->frame.wb_push_candidate2)) > ++ && (regno == frame.wb_push_candidate1 > ++ || regno == frame.wb_push_candidate2)) > + continue; > + > + if (cfun->machine->reg_is_wrapped_separately[regno]) > +@@ -8786,7 +8787,7 @@ aarch64_save_callee_saves (poly_int64 start_offset, > + > + machine_mode mode = aarch64_reg_save_mode (regno); > + reg = gen_rtx_REG (mode, regno); > +- offset = start_offset + cfun->machine->frame.reg_offset[regno]; > ++ offset = start_offset + frame.reg_offset[regno]; > + rtx base_rtx = stack_pointer_rtx; > + poly_int64 sp_offset = offset; > + > +@@ -8799,7 +8800,7 @@ aarch64_save_callee_saves (poly_int64 start_offset, > + { > + gcc_assert (known_eq (start_offset, 0)); > + poly_int64 
fp_offset > +- = cfun->machine->frame.below_hard_fp_saved_regs_size; > ++ = frame.below_hard_fp_saved_regs_size; > + if (hard_fp_valid_p) > + base_rtx = hard_frame_pointer_rtx; > + else > +@@ -8821,8 +8822,7 @@ aarch64_save_callee_saves (poly_int64 start_offset, > + && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= > limit > + && !cfun->machine->reg_is_wrapped_separately[regno2] > + && known_eq (GET_MODE_SIZE (mode), > +- cfun->machine->frame.reg_offset[regno2] > +- - cfun->machine->frame.reg_offset[regno])) > ++ frame.reg_offset[regno2] - frame.reg_offset[regno])) > + { > + rtx reg2 = gen_rtx_REG (mode, regno2); > + rtx mem2; > +@@ -8872,6 +8872,7 @@ static void > + aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, > + unsigned limit, bool skip_wb, rtx *cfi_ops) > + { > ++ aarch64_frame &frame = cfun->machine->frame; > + unsigned regno; > + unsigned regno2; > + poly_int64 offset; > +@@ -8888,13 +8889,13 @@ aarch64_restore_callee_saves (poly_int64 > start_offset, unsigned start, > + rtx reg, mem; > + > + if (skip_wb > +- && (regno == cfun->machine->frame.wb_pop_candidate1 > +- || regno == cfun->machine->frame.wb_pop_candidate2)) > ++ && (regno == frame.wb_pop_candidate1 > ++ || regno == frame.wb_pop_candidate2)) > + continue; > + > + machine_mode mode = aarch64_reg_save_mode (regno); > + reg = gen_rtx_REG (mode, regno); > +- offset = start_offset + cfun->machine->frame.reg_offset[regno]; > ++ offset = start_offset + frame.reg_offset[regno]; > + rtx base_rtx = stack_pointer_rtx; > + if (mode == VNx2DImode && BYTES_BIG_ENDIAN) > + aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg, > +@@ -8905,8 +8906,7 @@ aarch64_restore_callee_saves (poly_int64 > start_offset, unsigned start, > + && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= > limit > + && !cfun->machine->reg_is_wrapped_separately[regno2] > + && known_eq (GET_MODE_SIZE (mode), > +- cfun->machine->frame.reg_offset[regno2] > +- - 
cfun->machine->frame.reg_offset[regno])) > ++ frame.reg_offset[regno2] - frame.reg_offset[regno])) > + { > + rtx reg2 = gen_rtx_REG (mode, regno2); > + rtx mem2; > +@@ -9011,6 +9011,7 @@ offset_12bit_unsigned_scaled_p (machine_mode mode, > poly_int64 offset) > + static sbitmap > + aarch64_get_separate_components (void) > + { > ++ aarch64_frame &frame = cfun->machine->frame; > + sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1); > + bitmap_clear (components); > + > +@@ -9027,18 +9028,18 @@ aarch64_get_separate_components (void) > + if (mode == VNx2DImode && BYTES_BIG_ENDIAN) > + continue; > + > +- poly_int64 offset = cfun->machine->frame.reg_offset[regno]; > ++ poly_int64 offset = frame.reg_offset[regno]; > + > + /* If the register is saved in the first SVE save slot, we use > + it as a stack probe for -fstack-clash-protection. */ > + if (flag_stack_clash_protection > +- && maybe_ne > (cfun->machine->frame.below_hard_fp_saved_regs_size, 0) > ++ && maybe_ne (frame.below_hard_fp_saved_regs_size, 0) > + && known_eq (offset, 0)) > + continue; > + > + /* Get the offset relative to the register we'll use. */ > + if (frame_pointer_needed) > +- offset -= cfun->machine->frame.below_hard_fp_saved_regs_size; > ++ offset -= frame.below_hard_fp_saved_regs_size; > + else > + offset += crtl->outgoing_args_size; > + > +@@ -9057,11 +9058,11 @@ aarch64_get_separate_components (void) > + /* If the spare predicate register used by big-endian SVE code > + is call-preserved, it must be saved in the main prologue > + before any saves that use it. 
*/ > +- if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM) > +- bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg); > ++ if (frame.spare_pred_reg != INVALID_REGNUM) > ++ bitmap_clear_bit (components, frame.spare_pred_reg); > + > +- unsigned reg1 = cfun->machine->frame.wb_push_candidate1; > +- unsigned reg2 = cfun->machine->frame.wb_push_candidate2; > ++ unsigned reg1 = frame.wb_push_candidate1; > ++ unsigned reg2 = frame.wb_push_candidate2; > + /* If registers have been chosen to be stored/restored with > + writeback don't interfere with them to avoid having to output > explicit > + stack adjustment instructions. */ > +@@ -9170,6 +9171,7 @@ aarch64_get_next_set_bit (sbitmap bmp, unsigned int > start) > + static void > + aarch64_process_components (sbitmap components, bool prologue_p) > + { > ++ aarch64_frame &frame = cfun->machine->frame; > + rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed > + ? HARD_FRAME_POINTER_REGNUM > + : STACK_POINTER_REGNUM); > +@@ -9184,9 +9186,9 @@ aarch64_process_components (sbitmap components, > bool prologue_p) > + machine_mode mode = aarch64_reg_save_mode (regno); > + > + rtx reg = gen_rtx_REG (mode, regno); > +- poly_int64 offset = cfun->machine->frame.reg_offset[regno]; > ++ poly_int64 offset = frame.reg_offset[regno]; > + if (frame_pointer_needed) > +- offset -= cfun->machine->frame.below_hard_fp_saved_regs_size; > ++ offset -= frame.below_hard_fp_saved_regs_size; > + else > + offset += crtl->outgoing_args_size; > + > +@@ -9211,14 +9213,14 @@ aarch64_process_components (sbitmap components, > bool prologue_p) > + break; > + } > + > +- poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2]; > ++ poly_int64 offset2 = frame.reg_offset[regno2]; > + /* The next register is not of the same class or its offset is not > + mergeable with the current one into a pair. 
*/ > + if (aarch64_sve_mode_p (mode) > + || !satisfies_constraint_Ump (mem) > + || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2) > + || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno)) > +- || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]), > ++ || maybe_ne ((offset2 - frame.reg_offset[regno]), > + GET_MODE_SIZE (mode))) > + { > + insn = emit_insn (set); > +@@ -9240,7 +9242,7 @@ aarch64_process_components (sbitmap components, > bool prologue_p) > + /* REGNO2 can be saved/restored in a pair with REGNO. */ > + rtx reg2 = gen_rtx_REG (mode, regno2); > + if (frame_pointer_needed) > +- offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size; > ++ offset2 -= frame.below_hard_fp_saved_regs_size; > + else > + offset2 += crtl->outgoing_args_size; > + rtx addr2 = plus_constant (Pmode, ptr_reg, offset2); > +@@ -9335,6 +9337,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, > rtx temp2, > + bool frame_related_p, > + bool final_adjustment_p) > + { > ++ aarch64_frame &frame = cfun->machine->frame; > + HOST_WIDE_INT guard_size > + = 1 << param_stack_clash_protection_guard_size; > + HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD; > +@@ -9355,25 +9358,25 @@ aarch64_allocate_and_probe_stack_space (rtx > temp1, rtx temp2, > + register as a probe. We can't assume that LR was saved at > position 0 > + though, so treat any space below it as unprobed. 
*/ > + if (final_adjustment_p > +- && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, > 0)) > ++ && known_eq (frame.below_hard_fp_saved_regs_size, 0)) > + { > +- poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM]; > ++ poly_int64 lr_offset = frame.reg_offset[LR_REGNUM]; > + if (known_ge (lr_offset, 0)) > + min_probe_threshold -= lr_offset.to_constant (); > + else > + gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, > 0)); > + } > + > +- poly_int64 frame_size = cfun->machine->frame.frame_size; > ++ poly_int64 frame_size = frame.frame_size; > + > + /* We should always have a positive probe threshold. */ > + gcc_assert (min_probe_threshold > 0); > + > + if (flag_stack_clash_protection && !final_adjustment_p) > + { > +- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust; > +- poly_int64 sve_callee_adjust = > cfun->machine->frame.sve_callee_adjust; > +- poly_int64 final_adjust = cfun->machine->frame.final_adjust; > ++ poly_int64 initial_adjust = frame.initial_adjust; > ++ poly_int64 sve_callee_adjust = frame.sve_callee_adjust; > ++ poly_int64 final_adjust = frame.final_adjust; > + > + if (known_eq (frame_size, 0)) > + { > +@@ -9662,17 +9665,18 @@ aarch64_epilogue_uses (int regno) > + void > + aarch64_expand_prologue (void) > + { > +- poly_int64 frame_size = cfun->machine->frame.frame_size; > +- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust; > +- HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust; > +- poly_int64 final_adjust = cfun->machine->frame.final_adjust; > +- poly_int64 callee_offset = cfun->machine->frame.callee_offset; > +- poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; > ++ aarch64_frame &frame = cfun->machine->frame; > ++ poly_int64 frame_size = frame.frame_size; > ++ poly_int64 initial_adjust = frame.initial_adjust; > ++ HOST_WIDE_INT callee_adjust = frame.callee_adjust; > ++ poly_int64 final_adjust = frame.final_adjust; > ++ poly_int64 
callee_offset = frame.callee_offset; > ++ poly_int64 sve_callee_adjust = frame.sve_callee_adjust; > + poly_int64 below_hard_fp_saved_regs_size > +- = cfun->machine->frame.below_hard_fp_saved_regs_size; > +- unsigned reg1 = cfun->machine->frame.wb_push_candidate1; > +- unsigned reg2 = cfun->machine->frame.wb_push_candidate2; > +- bool emit_frame_chain = cfun->machine->frame.emit_frame_chain; > ++ = frame.below_hard_fp_saved_regs_size; > ++ unsigned reg1 = frame.wb_push_candidate1; > ++ unsigned reg2 = frame.wb_push_candidate2; > ++ bool emit_frame_chain = frame.emit_frame_chain; > + rtx_insn *insn; > + > + if (flag_stack_clash_protection && known_eq (callee_adjust, 0)) > +@@ -9703,7 +9707,7 @@ aarch64_expand_prologue (void) > + } > + > + /* Push return address to shadow call stack. */ > +- if (cfun->machine->frame.is_scs_enabled) > ++ if (frame.is_scs_enabled) > + emit_insn (gen_scs_push ()); > + > + if (flag_stack_usage_info) > +@@ -9742,7 +9746,7 @@ aarch64_expand_prologue (void) > + > + /* The offset of the frame chain record (if any) from the current SP. > */ > + poly_int64 chain_offset = (initial_adjust + callee_adjust > +- - cfun->machine->frame.hard_fp_offset); > ++ - frame.hard_fp_offset); > + gcc_assert (known_ge (chain_offset, 0)); > + > + /* The offset of the bottom of the save area from the current SP. 
*/ > +@@ -9845,16 +9849,17 @@ aarch64_use_return_insn_p (void) > + void > + aarch64_expand_epilogue (bool for_sibcall) > + { > +- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust; > +- HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust; > +- poly_int64 final_adjust = cfun->machine->frame.final_adjust; > +- poly_int64 callee_offset = cfun->machine->frame.callee_offset; > +- poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; > ++ aarch64_frame &frame = cfun->machine->frame; > ++ poly_int64 initial_adjust = frame.initial_adjust; > ++ HOST_WIDE_INT callee_adjust = frame.callee_adjust; > ++ poly_int64 final_adjust = frame.final_adjust; > ++ poly_int64 callee_offset = frame.callee_offset; > ++ poly_int64 sve_callee_adjust = frame.sve_callee_adjust; > + poly_int64 below_hard_fp_saved_regs_size > +- = cfun->machine->frame.below_hard_fp_saved_regs_size; > +- unsigned reg1 = cfun->machine->frame.wb_pop_candidate1; > +- unsigned reg2 = cfun->machine->frame.wb_pop_candidate2; > +- unsigned int last_gpr = (cfun->machine->frame.is_scs_enabled > ++ = frame.below_hard_fp_saved_regs_size; > ++ unsigned reg1 = frame.wb_pop_candidate1; > ++ unsigned reg2 = frame.wb_pop_candidate2; > ++ unsigned int last_gpr = (frame.is_scs_enabled > + ? R29_REGNUM : R30_REGNUM); > + rtx cfi_ops = NULL; > + rtx_insn *insn; > +@@ -9888,7 +9893,7 @@ aarch64_expand_epilogue (bool for_sibcall) > + /* We need to add memory barrier to prevent read from deallocated > stack. */ > + bool need_barrier_p > + = maybe_ne (get_frame_size () > +- + cfun->machine->frame.saved_varargs_size, 0); > ++ + frame.saved_varargs_size, 0); > + > + /* Emit a barrier to prevent loads from a deallocated stack. */ > + if (maybe_gt (final_adjust, crtl->outgoing_args_size) > +@@ -9969,7 +9974,7 @@ aarch64_expand_epilogue (bool for_sibcall) > + } > + > + /* Pop return address from shadow call stack. 
*/ > +- if (cfun->machine->frame.is_scs_enabled) > ++ if (frame.is_scs_enabled) > + { > + machine_mode mode = aarch64_reg_save_mode (R30_REGNUM); > + rtx reg = gen_rtx_REG (mode, R30_REGNUM); > +@@ -12564,24 +12569,24 @@ aarch64_can_eliminate (const int from > ATTRIBUTE_UNUSED, const int to) > + poly_int64 > + aarch64_initial_elimination_offset (unsigned from, unsigned to) > + { > ++ aarch64_frame &frame = cfun->machine->frame; > ++ > + if (to == HARD_FRAME_POINTER_REGNUM) > + { > + if (from == ARG_POINTER_REGNUM) > +- return cfun->machine->frame.hard_fp_offset; > ++ return frame.hard_fp_offset; > + > + if (from == FRAME_POINTER_REGNUM) > +- return cfun->machine->frame.hard_fp_offset > +- - cfun->machine->frame.locals_offset; > ++ return frame.hard_fp_offset - frame.locals_offset; > + } > + > + if (to == STACK_POINTER_REGNUM) > + { > + if (from == FRAME_POINTER_REGNUM) > +- return cfun->machine->frame.frame_size > +- - cfun->machine->frame.locals_offset; > ++ return frame.frame_size - frame.locals_offset; > + } > + > +- return cfun->machine->frame.frame_size; > ++ return frame.frame_size; > + } > + > + > +-- > +2.34.1 > + > + > +From 12a8889de169f892d2e927584c00d20b8b7e456f Mon Sep 17 00:00:00 2001 > +From: Richard Sandiford <richard.sandiford@arm.com> > +Date: Tue, 12 Sep 2023 16:08:49 +0100 > +Subject: [PATCH 02/19] aarch64: Avoid a use of callee_offset > + > +When we emit the frame chain, i.e. when we reach Here in this statement > +of aarch64_expand_prologue: > + > + if (emit_frame_chain) > + { > + // Here > + ... > + } > + > +the stack is in one of two states: > + > +- We've allocated up to the frame chain, but no more. > + > +- We've allocated the whole frame, and the frame chain is within easy > + reach of the new SP. > + > +The offset of the frame chain from the current SP is available > +in aarch64_frame as callee_offset. It is also available as the > +chain_offset local variable, where the latter is calculated from other > +data. 
(However, chain_offset is not always equal to callee_offset when > +!emit_frame_chain, so chain_offset isn't redundant.) > + > +In c600df9a4060da3c6121ff4d0b93f179eafd69d1 I switched to using > +chain_offset for the initialisation of the hard frame pointer: > + > + aarch64_add_offset (Pmode, hard_frame_pointer_rtx, > +- stack_pointer_rtx, callee_offset, > ++ stack_pointer_rtx, chain_offset, > + tmp1_rtx, tmp0_rtx, frame_pointer_needed); > + > +But the later REG_CFA_ADJUST_CFA handling still used callee_offset. > + > +I think the difference is harmless, but it's more logical for the > +CFA note to be in sync, and it's more convenient for later patches > +if it uses chain_offset. > + > +gcc/ > + * config/aarch64/aarch64.cc (aarch64_expand_prologue): Use > + chain_offset rather than callee_offset. > +--- > + gcc/config/aarch64/aarch64.cc | 4 +--- > + 1 file changed, 1 insertion(+), 3 deletions(-) > + > +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > +index ae42ffdedbe..79253322fd7 100644 > +--- a/gcc/config/aarch64/aarch64.cc > ++++ b/gcc/config/aarch64/aarch64.cc > +@@ -9670,7 +9670,6 @@ aarch64_expand_prologue (void) > + poly_int64 initial_adjust = frame.initial_adjust; > + HOST_WIDE_INT callee_adjust = frame.callee_adjust; > + poly_int64 final_adjust = frame.final_adjust; > +- poly_int64 callee_offset = frame.callee_offset; > + poly_int64 sve_callee_adjust = frame.sve_callee_adjust; > + poly_int64 below_hard_fp_saved_regs_size > + = frame.below_hard_fp_saved_regs_size; > +@@ -9783,8 +9782,7 @@ aarch64_expand_prologue (void) > + implicit. 
*/ > + if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX)) > + { > +- rtx src = plus_constant (Pmode, stack_pointer_rtx, > +- callee_offset); > ++ rtx src = plus_constant (Pmode, stack_pointer_rtx, > chain_offset); > + add_reg_note (insn, REG_CFA_ADJUST_CFA, > + gen_rtx_SET (hard_frame_pointer_rtx, src)); > + } > +-- > +2.34.1 > + > + > +From 03d5e89e7f3be53fd7142556e8e0a2774c653dca Mon Sep 17 00:00:00 2001 > +From: Richard Sandiford <richard.sandiford@arm.com> > +Date: Tue, 12 Sep 2023 16:08:49 +0100 > +Subject: [PATCH 03/19] aarch64: Explicitly handle frames with no saved > + registers > + > +If a frame has no saved registers, it can be allocated in one go. > +There is no need to treat the areas below and above the saved > +registers as separate. > + > +And if we allocate the frame in one go, it should be allocated > +as the initial_adjust rather than the final_adjust. This allows the > +frame size to grow to guard_size - guard_used_by_caller before a stack > +probe is needed. (A frame with no register saves is necessarily a > +leaf frame.) > + > +This is a no-op as things stand, since a leaf function will have > +no outgoing arguments, and so all the frame will be above where > +the saved registers normally go. > + > +gcc/ > + * config/aarch64/aarch64.cc (aarch64_layout_frame): Explicitly > + allocate the frame in one go if there are no saved registers. 
> +--- > + gcc/config/aarch64/aarch64.cc | 8 +++++--- > + 1 file changed, 5 insertions(+), 3 deletions(-) > + > +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > +index 79253322fd7..e1f21230c15 100644 > +--- a/gcc/config/aarch64/aarch64.cc > ++++ b/gcc/config/aarch64/aarch64.cc > +@@ -8378,9 +8378,11 @@ aarch64_layout_frame (void) > + > + HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset; > + HOST_WIDE_INT const_saved_regs_size; > +- if (frame.frame_size.is_constant (&const_size) > +- && const_size < max_push_offset > +- && known_eq (frame.hard_fp_offset, const_size)) > ++ if (known_eq (frame.saved_regs_size, 0)) > ++ frame.initial_adjust = frame.frame_size; > ++ else if (frame.frame_size.is_constant (&const_size) > ++ && const_size < max_push_offset > ++ && known_eq (frame.hard_fp_offset, const_size)) > + { > + /* Simple, small frame with no outgoing arguments: > + > +-- > +2.34.1 > + > + > +From 49c2eb7616756c323b7f6b18d8616ec945eb1263 Mon Sep 17 00:00:00 2001 > +From: Richard Sandiford <richard.sandiford@arm.com> > +Date: Tue, 12 Sep 2023 16:08:49 +0100 > +Subject: [PATCH 04/19] aarch64: Add bytes_below_saved_regs to frame info > + > +The frame layout code currently hard-codes the assumption that > +the number of bytes below the saved registers is equal to the > +size of the outgoing arguments. This patch abstracts that > +value into a new field of aarch64_frame. > + > +gcc/ > + * config/aarch64/aarch64.h > (aarch64_frame::bytes_below_saved_regs): New > + field. > + * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize it, > + and use it instead of crtl->outgoing_args_size. > + (aarch64_get_separate_components): Use bytes_below_saved_regs > instead > + of outgoing_args_size. > + (aarch64_process_components): Likewise. 
> +--- > + gcc/config/aarch64/aarch64.cc | 71 ++++++++++++++++++----------------- > + gcc/config/aarch64/aarch64.h | 5 +++ > + 2 files changed, 41 insertions(+), 35 deletions(-) > + > +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > +index e1f21230c15..94e1b686584 100644 > +--- a/gcc/config/aarch64/aarch64.cc > ++++ b/gcc/config/aarch64/aarch64.cc > +@@ -8217,6 +8217,8 @@ aarch64_layout_frame (void) > + gcc_assert (crtl->is_leaf > + || maybe_ne (frame.reg_offset[R30_REGNUM], > SLOT_NOT_REQUIRED)); > + > ++ frame.bytes_below_saved_regs = crtl->outgoing_args_size; > ++ > + /* Now assign stack slots for the registers. Start with the predicate > + registers, since predicate LDR and STR have a relatively small > + offset range. These saves happen below the hard frame pointer. */ > +@@ -8321,18 +8323,18 @@ aarch64_layout_frame (void) > + > + poly_int64 varargs_and_saved_regs_size = offset + > frame.saved_varargs_size; > + > +- poly_int64 above_outgoing_args > ++ poly_int64 saved_regs_and_above > + = aligned_upper_bound (varargs_and_saved_regs_size > + + get_frame_size (), > + STACK_BOUNDARY / BITS_PER_UNIT); > + > + frame.hard_fp_offset > +- = above_outgoing_args - frame.below_hard_fp_saved_regs_size; > ++ = saved_regs_and_above - frame.below_hard_fp_saved_regs_size; > + > + /* Both these values are already aligned. 
*/ > +- gcc_assert (multiple_p (crtl->outgoing_args_size, > ++ gcc_assert (multiple_p (frame.bytes_below_saved_regs, > + STACK_BOUNDARY / BITS_PER_UNIT)); > +- frame.frame_size = above_outgoing_args + crtl->outgoing_args_size; > ++ frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs; > + > + frame.locals_offset = frame.saved_varargs_size; > + > +@@ -8376,7 +8378,7 @@ aarch64_layout_frame (void) > + else if (frame.wb_pop_candidate1 != INVALID_REGNUM) > + max_push_offset = 256; > + > +- HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset; > ++ HOST_WIDE_INT const_size, const_below_saved_regs, const_fp_offset; > + HOST_WIDE_INT const_saved_regs_size; > + if (known_eq (frame.saved_regs_size, 0)) > + frame.initial_adjust = frame.frame_size; > +@@ -8384,31 +8386,31 @@ aarch64_layout_frame (void) > + && const_size < max_push_offset > + && known_eq (frame.hard_fp_offset, const_size)) > + { > +- /* Simple, small frame with no outgoing arguments: > ++ /* Simple, small frame with no data below the saved registers. > + > + stp reg1, reg2, [sp, -frame_size]! > + stp reg3, reg4, [sp, 16] */ > + frame.callee_adjust = const_size; > + } > +- else if (crtl->outgoing_args_size.is_constant > (&const_outgoing_args_size) > ++ else if (frame.bytes_below_saved_regs.is_constant > (&const_below_saved_regs) > + && frame.saved_regs_size.is_constant (&const_saved_regs_size) > +- && const_outgoing_args_size + const_saved_regs_size < 512 > +- /* We could handle this case even with outgoing args, provided > +- that the number of args left us with valid offsets for all > +- predicate and vector save slots. It's such a rare case that > +- it hardly seems worth the effort though. 
*/ > +- && (!saves_below_hard_fp_p || const_outgoing_args_size == 0) > ++ && const_below_saved_regs + const_saved_regs_size < 512 > ++ /* We could handle this case even with data below the saved > ++ registers, provided that that data left us with valid offsets > ++ for all predicate and vector save slots. It's such a rare > ++ case that it hardly seems worth the effort though. */ > ++ && (!saves_below_hard_fp_p || const_below_saved_regs == 0) > + && !(cfun->calls_alloca > + && frame.hard_fp_offset.is_constant (&const_fp_offset) > + && const_fp_offset < max_push_offset)) > + { > +- /* Frame with small outgoing arguments: > ++ /* Frame with small area below the saved registers: > + > + sub sp, sp, frame_size > +- stp reg1, reg2, [sp, outgoing_args_size] > +- stp reg3, reg4, [sp, outgoing_args_size + 16] */ > ++ stp reg1, reg2, [sp, bytes_below_saved_regs] > ++ stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */ > + frame.initial_adjust = frame.frame_size; > +- frame.callee_offset = const_outgoing_args_size; > ++ frame.callee_offset = const_below_saved_regs; > + } > + else if (saves_below_hard_fp_p > + && known_eq (frame.saved_regs_size, > +@@ -8418,30 +8420,29 @@ aarch64_layout_frame (void) > + > + sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size > + save SVE registers relative to SP > +- sub sp, sp, outgoing_args_size */ > ++ sub sp, sp, bytes_below_saved_regs */ > + frame.initial_adjust = (frame.hard_fp_offset > + + frame.below_hard_fp_saved_regs_size); > +- frame.final_adjust = crtl->outgoing_args_size; > ++ frame.final_adjust = frame.bytes_below_saved_regs; > + } > + else if (frame.hard_fp_offset.is_constant (&const_fp_offset) > + && const_fp_offset < max_push_offset) > + { > +- /* Frame with large outgoing arguments or SVE saves, but with > +- a small local area: > ++ /* Frame with large area below the saved registers, or with SVE > saves, > ++ but with a small area above: > + > + stp reg1, reg2, [sp, -hard_fp_offset]! 
> + stp reg3, reg4, [sp, 16] > + [sub sp, sp, below_hard_fp_saved_regs_size] > + [save SVE registers relative to SP] > +- sub sp, sp, outgoing_args_size */ > ++ sub sp, sp, bytes_below_saved_regs */ > + frame.callee_adjust = const_fp_offset; > + frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; > +- frame.final_adjust = crtl->outgoing_args_size; > ++ frame.final_adjust = frame.bytes_below_saved_regs; > + } > + else > + { > +- /* Frame with large local area and outgoing arguments or SVE saves, > +- using frame pointer: > ++ /* General case: > + > + sub sp, sp, hard_fp_offset > + stp x29, x30, [sp, 0] > +@@ -8449,10 +8450,10 @@ aarch64_layout_frame (void) > + stp reg3, reg4, [sp, 16] > + [sub sp, sp, below_hard_fp_saved_regs_size] > + [save SVE registers relative to SP] > +- sub sp, sp, outgoing_args_size */ > ++ sub sp, sp, bytes_below_saved_regs */ > + frame.initial_adjust = frame.hard_fp_offset; > + frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; > +- frame.final_adjust = crtl->outgoing_args_size; > ++ frame.final_adjust = frame.bytes_below_saved_regs; > + } > + > + /* Make sure the individual adjustments add up to the full frame > size. */ > +@@ -9043,7 +9044,7 @@ aarch64_get_separate_components (void) > + if (frame_pointer_needed) > + offset -= frame.below_hard_fp_saved_regs_size; > + else > +- offset += crtl->outgoing_args_size; > ++ offset += frame.bytes_below_saved_regs; > + > + /* Check that we can access the stack slot of the register with one > + direct load with no adjustments needed. 
*/ > +@@ -9192,7 +9193,7 @@ aarch64_process_components (sbitmap components, > bool prologue_p) > + if (frame_pointer_needed) > + offset -= frame.below_hard_fp_saved_regs_size; > + else > +- offset += crtl->outgoing_args_size; > ++ offset += frame.bytes_below_saved_regs; > + > + rtx addr = plus_constant (Pmode, ptr_reg, offset); > + rtx mem = gen_frame_mem (mode, addr); > +@@ -9246,7 +9247,7 @@ aarch64_process_components (sbitmap components, > bool prologue_p) > + if (frame_pointer_needed) > + offset2 -= frame.below_hard_fp_saved_regs_size; > + else > +- offset2 += crtl->outgoing_args_size; > ++ offset2 += frame.bytes_below_saved_regs; > + rtx addr2 = plus_constant (Pmode, ptr_reg, offset2); > + rtx mem2 = gen_frame_mem (mode, addr2); > + rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2) > +@@ -9320,10 +9321,10 @@ aarch64_stack_clash_protection_alloca_probe_range > (void) > + registers. If POLY_SIZE is not large enough to require a probe this > function > + will only adjust the stack. When allocating the stack space > + FRAME_RELATED_P is then used to indicate if the allocation is frame > related. > +- FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing > +- arguments. If we are then we ensure that any allocation larger than > the ABI > +- defined buffer needs a probe so that the invariant of having a 1KB > buffer is > +- maintained. > ++ FINAL_ADJUSTMENT_P indicates whether we are allocating the area below > ++ the saved registers. If we are then we ensure that any allocation > ++ larger than the ABI defined buffer needs a probe so that the > ++ invariant of having a 1KB buffer is maintained. > + > + We emit barriers after each stack adjustment to prevent optimizations > from > + breaking the invariant that we never drop the stack more than a > page. This > +@@ -9532,7 +9533,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, > rtx temp2, > + /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD > have to > + be probed. 
This maintains the requirement that each page is probed > at > + least once. For initial probing we probe only if the allocation is > +- more than GUARD_SIZE - buffer, and for the outgoing arguments we > probe > ++ more than GUARD_SIZE - buffer, and below the saved registers we > probe > + if the amount is larger than buffer. GUARD_SIZE - buffer + buffer > == > + GUARD_SIZE. This works that for any allocation that is large > enough to > + trigger a probe here, we'll have at least one, and if they're not > large > +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > +index 6834c3e9922..1e105e12db8 100644 > +--- a/gcc/config/aarch64/aarch64.h > ++++ b/gcc/config/aarch64/aarch64.h > +@@ -871,6 +871,11 @@ struct GTY (()) aarch64_frame > + /* The size of the callee-save registers with a slot in REG_OFFSET. */ > + poly_int64 saved_regs_size; > + > ++ /* The number of bytes between the bottom of the static frame (the > bottom > ++ of the outgoing arguments) and the bottom of the register save area. > ++ This value is always a multiple of STACK_BOUNDARY. */ > ++ poly_int64 bytes_below_saved_regs; > ++ > + /* The size of the callee-save registers with a slot in REG_OFFSET that > + are saved below the hard frame pointer. */ > + poly_int64 below_hard_fp_saved_regs_size; > +-- > +2.34.1 > + > + > +From 34081079ea4de0c98331843f574b5f6f94d7b234 Mon Sep 17 00:00:00 2001 > +From: Richard Sandiford <richard.sandiford@arm.com> > +Date: Tue, 12 Sep 2023 16:08:50 +0100 > +Subject: [PATCH 05/19] aarch64: Add bytes_below_hard_fp to frame info > + > +Following on from the previous bytes_below_saved_regs patch, this one > +records the number of bytes that are below the hard frame pointer. > +This eventually replaces below_hard_fp_saved_regs_size. 
> + > +If a frame pointer is not needed, the epilogue adds final_adjust > +to the stack pointer before restoring registers: > + > + aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true); > + > +Therefore, if the epilogue needs to restore the stack pointer from > +the hard frame pointer, the directly corresponding offset is: > + > + -bytes_below_hard_fp + final_adjust > + > +i.e. go from the hard frame pointer to the bottom of the frame, > +then add the same amount as if we were using the stack pointer > +from the outset. > + > +gcc/ > + * config/aarch64/aarch64.h (aarch64_frame::bytes_below_hard_fp): > New > + field. > + * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize it. > + (aarch64_expand_epilogue): Use it instead of > + below_hard_fp_saved_regs_size. > +--- > + gcc/config/aarch64/aarch64.cc | 6 +++--- > + gcc/config/aarch64/aarch64.h | 5 +++++ > + 2 files changed, 8 insertions(+), 3 deletions(-) > + > +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > +index 94e1b686584..c7d84245fbf 100644 > +--- a/gcc/config/aarch64/aarch64.cc > ++++ b/gcc/config/aarch64/aarch64.cc > +@@ -8269,6 +8269,7 @@ aarch64_layout_frame (void) > + of the callee save area. */ > + bool saves_below_hard_fp_p = maybe_ne (offset, 0); > + frame.below_hard_fp_saved_regs_size = offset; > ++ frame.bytes_below_hard_fp = offset + frame.bytes_below_saved_regs; > + if (frame.emit_frame_chain) > + { > + /* FP and LR are placed in the linkage record. 
*/ > +@@ -9856,8 +9857,7 @@ aarch64_expand_epilogue (bool for_sibcall) > + poly_int64 final_adjust = frame.final_adjust; > + poly_int64 callee_offset = frame.callee_offset; > + poly_int64 sve_callee_adjust = frame.sve_callee_adjust; > +- poly_int64 below_hard_fp_saved_regs_size > +- = frame.below_hard_fp_saved_regs_size; > ++ poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp; > + unsigned reg1 = frame.wb_pop_candidate1; > + unsigned reg2 = frame.wb_pop_candidate2; > + unsigned int last_gpr = (frame.is_scs_enabled > +@@ -9915,7 +9915,7 @@ aarch64_expand_epilogue (bool for_sibcall) > + is restored on the instruction doing the writeback. */ > + aarch64_add_offset (Pmode, stack_pointer_rtx, > + hard_frame_pointer_rtx, > +- -callee_offset - below_hard_fp_saved_regs_size, > ++ -bytes_below_hard_fp + final_adjust, > + tmp1_rtx, tmp0_rtx, callee_adjust == 0); > + else > + /* The case where we need to re-use the register here is very rare, > so > +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > +index 1e105e12db8..de68ff7202f 100644 > +--- a/gcc/config/aarch64/aarch64.h > ++++ b/gcc/config/aarch64/aarch64.h > +@@ -880,6 +880,11 @@ struct GTY (()) aarch64_frame > + are saved below the hard frame pointer. */ > + poly_int64 below_hard_fp_saved_regs_size; > + > ++ /* The number of bytes between the bottom of the static frame (the > bottom > ++ of the outgoing arguments) and the hard frame pointer. This value > is > ++ always a multiple of STACK_BOUNDARY. */ > ++ poly_int64 bytes_below_hard_fp; > ++ > + /* Offset from the base of the frame (incomming SP) to the > + top of the locals area. This value is always a multiple of > + STACK_BOUNDARY. 
*/ > +-- > +2.34.1 > + > + > +From 187861af7c51db9eddc6f954b589c121b210fc74 Mon Sep 17 00:00:00 2001 > +From: Richard Sandiford <richard.sandiford@arm.com> > +Date: Tue, 12 Sep 2023 16:08:50 +0100 > +Subject: [PATCH 06/19] aarch64: Tweak aarch64_save/restore_callee_saves > + > +aarch64_save_callee_saves and aarch64_restore_callee_saves took > +a parameter called start_offset that gives the offset of the > +bottom of the saved register area from the current stack pointer. > +However, it's more convenient for later patches if we use the > +bottom of the entire frame as the reference point, rather than > +the bottom of the saved registers. > + > +Doing that removes the need for the callee_offset field. > +Other than that, this is not a win on its own. It only really > +makes sense in combination with the follow-on patches. > + > +gcc/ > + * config/aarch64/aarch64.h (aarch64_frame::callee_offset): Delete. > + * config/aarch64/aarch64.cc (aarch64_layout_frame): Remove > + callee_offset handling. > + (aarch64_save_callee_saves): Replace the start_offset parameter > + with a bytes_below_sp parameter. > + (aarch64_restore_callee_saves): Likewise. > + (aarch64_expand_prologue): Update accordingly. > + (aarch64_expand_epilogue): Likewise. 
> +--- > + gcc/config/aarch64/aarch64.cc | 56 +++++++++++++++++------------------ > + gcc/config/aarch64/aarch64.h | 4 --- > + 2 files changed, 28 insertions(+), 32 deletions(-) > + > +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > +index c7d84245fbf..e79551af41d 100644 > +--- a/gcc/config/aarch64/aarch64.cc > ++++ b/gcc/config/aarch64/aarch64.cc > +@@ -8343,7 +8343,6 @@ aarch64_layout_frame (void) > + frame.final_adjust = 0; > + frame.callee_adjust = 0; > + frame.sve_callee_adjust = 0; > +- frame.callee_offset = 0; > + > + frame.wb_pop_candidate1 = frame.wb_push_candidate1; > + frame.wb_pop_candidate2 = frame.wb_push_candidate2; > +@@ -8411,7 +8410,6 @@ aarch64_layout_frame (void) > + stp reg1, reg2, [sp, bytes_below_saved_regs] > + stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */ > + frame.initial_adjust = frame.frame_size; > +- frame.callee_offset = const_below_saved_regs; > + } > + else if (saves_below_hard_fp_p > + && known_eq (frame.saved_regs_size, > +@@ -8758,12 +8756,13 @@ aarch64_add_cfa_expression (rtx_insn *insn, rtx > reg, > + } > + > + /* Emit code to save the callee-saved registers from register number > START > +- to LIMIT to the stack at the location starting at offset START_OFFSET, > +- skipping any write-back candidates if SKIP_WB is true. > HARD_FP_VALID_P > +- is true if the hard frame pointer has been set up. */ > ++ to LIMIT to the stack. The stack pointer is currently BYTES_BELOW_SP > ++ bytes above the bottom of the static frame. Skip any write-back > ++ candidates if SKIP_WB is true. HARD_FP_VALID_P is true if the hard > ++ frame pointer has been set up. 
*/ > + > + static void > +-aarch64_save_callee_saves (poly_int64 start_offset, > ++aarch64_save_callee_saves (poly_int64 bytes_below_sp, > + unsigned start, unsigned limit, bool skip_wb, > + bool hard_fp_valid_p) > + { > +@@ -8791,7 +8790,9 @@ aarch64_save_callee_saves (poly_int64 start_offset, > + > + machine_mode mode = aarch64_reg_save_mode (regno); > + reg = gen_rtx_REG (mode, regno); > +- offset = start_offset + frame.reg_offset[regno]; > ++ offset = (frame.reg_offset[regno] > ++ + frame.bytes_below_saved_regs > ++ - bytes_below_sp); > + rtx base_rtx = stack_pointer_rtx; > + poly_int64 sp_offset = offset; > + > +@@ -8802,9 +8803,7 @@ aarch64_save_callee_saves (poly_int64 start_offset, > + else if (GP_REGNUM_P (regno) > + && (!offset.is_constant (&const_offset) || const_offset >= > 512)) > + { > +- gcc_assert (known_eq (start_offset, 0)); > +- poly_int64 fp_offset > +- = frame.below_hard_fp_saved_regs_size; > ++ poly_int64 fp_offset = frame.bytes_below_hard_fp - > bytes_below_sp; > + if (hard_fp_valid_p) > + base_rtx = hard_frame_pointer_rtx; > + else > +@@ -8868,12 +8867,13 @@ aarch64_save_callee_saves (poly_int64 > start_offset, > + } > + > + /* Emit code to restore the callee registers from register number START > +- up to and including LIMIT. Restore from the stack offset > START_OFFSET, > +- skipping any write-back candidates if SKIP_WB is true. Write the > +- appropriate REG_CFA_RESTORE notes into CFI_OPS. */ > ++ up to and including LIMIT. The stack pointer is currently > BYTES_BELOW_SP > ++ bytes above the bottom of the static frame. Skip any write-back > ++ candidates if SKIP_WB is true. Write the appropriate REG_CFA_RESTORE > ++ notes into CFI_OPS. 
*/ > + > + static void > +-aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, > ++aarch64_restore_callee_saves (poly_int64 bytes_below_sp, unsigned start, > + unsigned limit, bool skip_wb, rtx *cfi_ops) > + { > + aarch64_frame &frame = cfun->machine->frame; > +@@ -8899,7 +8899,9 @@ aarch64_restore_callee_saves (poly_int64 > start_offset, unsigned start, > + > + machine_mode mode = aarch64_reg_save_mode (regno); > + reg = gen_rtx_REG (mode, regno); > +- offset = start_offset + frame.reg_offset[regno]; > ++ offset = (frame.reg_offset[regno] > ++ + frame.bytes_below_saved_regs > ++ - bytes_below_sp); > + rtx base_rtx = stack_pointer_rtx; > + if (mode == VNx2DImode && BYTES_BIG_ENDIAN) > + aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg, > +@@ -9675,8 +9677,6 @@ aarch64_expand_prologue (void) > + HOST_WIDE_INT callee_adjust = frame.callee_adjust; > + poly_int64 final_adjust = frame.final_adjust; > + poly_int64 sve_callee_adjust = frame.sve_callee_adjust; > +- poly_int64 below_hard_fp_saved_regs_size > +- = frame.below_hard_fp_saved_regs_size; > + unsigned reg1 = frame.wb_push_candidate1; > + unsigned reg2 = frame.wb_push_candidate2; > + bool emit_frame_chain = frame.emit_frame_chain; > +@@ -9752,8 +9752,8 @@ aarch64_expand_prologue (void) > + - frame.hard_fp_offset); > + gcc_assert (known_ge (chain_offset, 0)); > + > +- /* The offset of the bottom of the save area from the current SP. */ > +- poly_int64 saved_regs_offset = chain_offset - > below_hard_fp_saved_regs_size; > ++ /* The offset of the current SP from the bottom of the static frame. 
> */ > ++ poly_int64 bytes_below_sp = frame_size - initial_adjust - > callee_adjust; > + > + if (emit_frame_chain) > + { > +@@ -9761,7 +9761,7 @@ aarch64_expand_prologue (void) > + { > + reg1 = R29_REGNUM; > + reg2 = R30_REGNUM; > +- aarch64_save_callee_saves (saved_regs_offset, reg1, reg2, > ++ aarch64_save_callee_saves (bytes_below_sp, reg1, reg2, > + false, false); > + } > + else > +@@ -9801,7 +9801,7 @@ aarch64_expand_prologue (void) > + emit_insn (gen_stack_tie (stack_pointer_rtx, > hard_frame_pointer_rtx)); > + } > + > +- aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM, > ++ aarch64_save_callee_saves (bytes_below_sp, R0_REGNUM, R30_REGNUM, > + callee_adjust != 0 || emit_frame_chain, > + emit_frame_chain); > + if (maybe_ne (sve_callee_adjust, 0)) > +@@ -9811,16 +9811,17 @@ aarch64_expand_prologue (void) > + aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, > + sve_callee_adjust, > + !frame_pointer_needed, > false); > +- saved_regs_offset += sve_callee_adjust; > ++ bytes_below_sp -= sve_callee_adjust; > + } > +- aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM, > ++ aarch64_save_callee_saves (bytes_below_sp, P0_REGNUM, P15_REGNUM, > + false, emit_frame_chain); > +- aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM, > ++ aarch64_save_callee_saves (bytes_below_sp, V0_REGNUM, V31_REGNUM, > + callee_adjust != 0 || emit_frame_chain, > + emit_frame_chain); > + > + /* We may need to probe the final adjustment if it is larger than the > guard > + that is assumed by the called. 
*/ > ++ gcc_assert (known_eq (bytes_below_sp, final_adjust)); > + aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, > final_adjust, > + !frame_pointer_needed, true); > + } > +@@ -9855,7 +9856,6 @@ aarch64_expand_epilogue (bool for_sibcall) > + poly_int64 initial_adjust = frame.initial_adjust; > + HOST_WIDE_INT callee_adjust = frame.callee_adjust; > + poly_int64 final_adjust = frame.final_adjust; > +- poly_int64 callee_offset = frame.callee_offset; > + poly_int64 sve_callee_adjust = frame.sve_callee_adjust; > + poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp; > + unsigned reg1 = frame.wb_pop_candidate1; > +@@ -9925,9 +9925,9 @@ aarch64_expand_epilogue (bool for_sibcall) > + > + /* Restore the vector registers before the predicate registers, > + so that we can use P4 as a temporary for big-endian SVE frames. */ > +- aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM, > ++ aarch64_restore_callee_saves (final_adjust, V0_REGNUM, V31_REGNUM, > + callee_adjust != 0, &cfi_ops); > +- aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM, > ++ aarch64_restore_callee_saves (final_adjust, P0_REGNUM, P15_REGNUM, > + false, &cfi_ops); > + if (maybe_ne (sve_callee_adjust, 0)) > + aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true); > +@@ -9935,7 +9935,7 @@ aarch64_expand_epilogue (bool for_sibcall) > + /* When shadow call stack is enabled, the scs_pop in the epilogue will > + restore x30, we don't need to restore x30 again in the traditional > + way. 
*/ > +- aarch64_restore_callee_saves (callee_offset - sve_callee_adjust, > ++ aarch64_restore_callee_saves (final_adjust + sve_callee_adjust, > + R0_REGNUM, last_gpr, > + callee_adjust != 0, &cfi_ops); > + > +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > +index de68ff7202f..94fca4b9471 100644 > +--- a/gcc/config/aarch64/aarch64.h > ++++ b/gcc/config/aarch64/aarch64.h > +@@ -907,10 +907,6 @@ struct GTY (()) aarch64_frame > + It is zero when no push is used. */ > + HOST_WIDE_INT callee_adjust; > + > +- /* The offset from SP to the callee-save registers after > initial_adjust. > +- It may be non-zero if no push is used (ie. callee_adjust == 0). */ > +- poly_int64 callee_offset; > +- > + /* The size of the stack adjustment before saving or after restoring > + SVE registers. */ > + poly_int64 sve_callee_adjust; > +-- > +2.34.1 > + > + > +From 2b983f9064d808daf909bde1d4a13980934a7e6e Mon Sep 17 00:00:00 2001 > +From: Richard Sandiford <richard.sandiford@arm.com> > +Date: Tue, 12 Sep 2023 16:08:51 +0100 > +Subject: [PATCH 07/19] aarch64: Only calculate chain_offset if there is a > + chain > + > +After previous patches, it is no longer necessary to calculate > +a chain_offset in cases where there is no chain record. > + > +gcc/ > + * config/aarch64/aarch64.cc (aarch64_expand_prologue): Move the > + calculation of chain_offset into the emit_frame_chain block. > +--- > + gcc/config/aarch64/aarch64.cc | 10 +++++----- > + 1 file changed, 5 insertions(+), 5 deletions(-) > + > +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > +index e79551af41d..d71a042d611 100644 > +--- a/gcc/config/aarch64/aarch64.cc > ++++ b/gcc/config/aarch64/aarch64.cc > +@@ -9747,16 +9747,16 @@ aarch64_expand_prologue (void) > + if (callee_adjust != 0) > + aarch64_push_regs (reg1, reg2, callee_adjust); > + > +- /* The offset of the frame chain record (if any) from the current SP. 
> */ > +- poly_int64 chain_offset = (initial_adjust + callee_adjust > +- - frame.hard_fp_offset); > +- gcc_assert (known_ge (chain_offset, 0)); > +- > + /* The offset of the current SP from the bottom of the static frame. > */ > + poly_int64 bytes_below_sp = frame_size - initial_adjust - > callee_adjust; > + > + if (emit_frame_chain) > + { > ++ /* The offset of the frame chain record (if any) from the current > SP. */ > ++ poly_int64 chain_offset = (initial_adjust + callee_adjust > ++ - frame.hard_fp_offset); > ++ gcc_assert (known_ge (chain_offset, 0)); > ++ > + if (callee_adjust == 0) > + { > + reg1 = R29_REGNUM; > +-- > +2.34.1 > + > + > +From 0a0a824808d1dec51004fb5805c1a0ae2a35433f Mon Sep 17 00:00:00 2001 > +From: Richard Sandiford <richard.sandiford@arm.com> > +Date: Tue, 12 Sep 2023 16:08:51 +0100 > +Subject: [PATCH 08/19] aarch64: Rename locals_offset to bytes_above_locals > +MIME-Version: 1.0 > +Content-Type: text/plain; charset=UTF-8 > +Content-Transfer-Encoding: 8bit > + > +locals_offset was described as: > + > + /* Offset from the base of the frame (incomming SP) to the > + top of the locals area. This value is always a multiple of > + STACK_BOUNDARY. */ > + > +This is implicitly an “upside down” view of the frame: the incoming > +SP is at offset 0, and anything N bytes below the incoming SP is at > +offset N (rather than -N). > + > +However, reg_offset instead uses a “right way up” view; that is, > +it views offsets in address terms. Something above X is at a > +positive offset from X and something below X is at a negative > +offset from X. > + > +Also, even on FRAME_GROWS_DOWNWARD targets like AArch64, > +target-independent code views offsets in address terms too: > +locals are allocated at negative offsets to virtual_stack_vars. > + > +It seems confusing to have *_offset fields of the same structure > +using different polarities like this. This patch tries to avoid > +that by renaming locals_offset to bytes_above_locals. 
> + > +gcc/ > + * config/aarch64/aarch64.h (aarch64_frame::locals_offset): Rename to... > + (aarch64_frame::bytes_above_locals): ...this. > + * config/aarch64/aarch64.cc (aarch64_layout_frame) > + (aarch64_initial_elimination_offset): Update accordingly. > +--- > + gcc/config/aarch64/aarch64.cc | 6 +++--- > + gcc/config/aarch64/aarch64.h | 6 +++--- > + 2 files changed, 6 insertions(+), 6 deletions(-) > + > +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > +index d71a042d611..d4ec352ba98 100644 > +--- a/gcc/config/aarch64/aarch64.cc > ++++ b/gcc/config/aarch64/aarch64.cc > +@@ -8337,7 +8337,7 @@ aarch64_layout_frame (void) > + STACK_BOUNDARY / BITS_PER_UNIT)); > + frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs; > + > +- frame.locals_offset = frame.saved_varargs_size; > ++ frame.bytes_above_locals = frame.saved_varargs_size; > + > + frame.initial_adjust = 0; > + frame.final_adjust = 0; > +@@ -12578,13 +12578,13 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to) > + return frame.hard_fp_offset; > + > + if (from == FRAME_POINTER_REGNUM) > +- return frame.hard_fp_offset - frame.locals_offset; > ++ return frame.hard_fp_offset - frame.bytes_above_locals; > + } > + > + if (to == STACK_POINTER_REGNUM) > + { > + if (from == FRAME_POINTER_REGNUM) > +- return frame.frame_size - frame.locals_offset; > ++ return frame.frame_size - frame.bytes_above_locals; > + } > + > + return frame.frame_size; > +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > +index 94fca4b9471..bf46e6124aa 100644 > +--- a/gcc/config/aarch64/aarch64.h > ++++ b/gcc/config/aarch64/aarch64.h > +@@ -885,10 +885,10 @@ struct GTY (()) aarch64_frame > + always a multiple of STACK_BOUNDARY. */ > + poly_int64 bytes_below_hard_fp; > + > +- /* Offset from the base of the frame (incomming SP) to the > +- top of the locals area.
This value is always a multiple of > ++ /* The number of bytes between the top of the locals area and the top > ++ of the frame (the incomming SP). This value is always a multiple of > + STACK_BOUNDARY. */ > +- poly_int64 locals_offset; > ++ poly_int64 bytes_above_locals; > + > + /* Offset from the base of the frame (incomming SP) to the > + hard_frame_pointer. This value is always a multiple of > +-- > +2.34.1 > + > + > +From 3fbf0789202b30a67b12e1fb785c7130f098d665 Mon Sep 17 00:00:00 2001 > +From: Richard Sandiford <richard.sandiford@arm.com> > +Date: Tue, 12 Sep 2023 16:08:52 +0100 > +Subject: [PATCH 09/19] aarch64: Rename hard_fp_offset to > bytes_above_hard_fp > +MIME-Version: 1.0 > +Content-Type: text/plain; charset=UTF-8 > +Content-Transfer-Encoding: 8bit > + > +Similarly to the previous locals_offset patch, hard_fp_offset > +was described as: > + > + /* Offset from the base of the frame (incomming SP) to the > + hard_frame_pointer. This value is always a multiple of > + STACK_BOUNDARY. */ > + poly_int64 hard_fp_offset; > + > +which again took an “upside-down” view: higher offsets meant lower > +addresses. This patch renames the field to bytes_above_hard_fp instead. > + > +gcc/ > + * config/aarch64/aarch64.h (aarch64_frame::hard_fp_offset): Rename > + to... > + (aarch64_frame::bytes_above_hard_fp): ...this. > + * config/aarch64/aarch64.cc (aarch64_layout_frame) > + (aarch64_expand_prologue): Update accordingly. > + (aarch64_initial_elimination_offset): Likewise. 
> +--- > + gcc/config/aarch64/aarch64.cc | 26 +++++++++++++------------- > + gcc/config/aarch64/aarch64.h | 6 +++--- > + 2 files changed, 16 insertions(+), 16 deletions(-) > + > +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > +index d4ec352ba98..3c4052740e7 100644 > +--- a/gcc/config/aarch64/aarch64.cc > ++++ b/gcc/config/aarch64/aarch64.cc > +@@ -8329,7 +8329,7 @@ aarch64_layout_frame (void) > + + get_frame_size (), > + STACK_BOUNDARY / BITS_PER_UNIT); > + > +- frame.hard_fp_offset > ++ frame.bytes_above_hard_fp > + = saved_regs_and_above - frame.below_hard_fp_saved_regs_size; > + > + /* Both these values are already aligned. */ > +@@ -8378,13 +8378,13 @@ aarch64_layout_frame (void) > + else if (frame.wb_pop_candidate1 != INVALID_REGNUM) > + max_push_offset = 256; > + > +- HOST_WIDE_INT const_size, const_below_saved_regs, const_fp_offset; > ++ HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp; > + HOST_WIDE_INT const_saved_regs_size; > + if (known_eq (frame.saved_regs_size, 0)) > + frame.initial_adjust = frame.frame_size; > + else if (frame.frame_size.is_constant (&const_size) > + && const_size < max_push_offset > +- && known_eq (frame.hard_fp_offset, const_size)) > ++ && known_eq (frame.bytes_above_hard_fp, const_size)) > + { > + /* Simple, small frame with no data below the saved registers. > + > +@@ -8401,8 +8401,8 @@ aarch64_layout_frame (void) > + case that it hardly seems worth the effort though. 
*/ > + && (!saves_below_hard_fp_p || const_below_saved_regs == 0) > + && !(cfun->calls_alloca > +- && frame.hard_fp_offset.is_constant (&const_fp_offset) > +- && const_fp_offset < max_push_offset)) > ++ && frame.bytes_above_hard_fp.is_constant (&const_above_fp) > ++ && const_above_fp < max_push_offset)) > + { > + /* Frame with small area below the saved registers: > + > +@@ -8420,12 +8420,12 @@ aarch64_layout_frame (void) > + sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size > + save SVE registers relative to SP > + sub sp, sp, bytes_below_saved_regs */ > +- frame.initial_adjust = (frame.hard_fp_offset > ++ frame.initial_adjust = (frame.bytes_above_hard_fp > + + frame.below_hard_fp_saved_regs_size); > + frame.final_adjust = frame.bytes_below_saved_regs; > + } > +- else if (frame.hard_fp_offset.is_constant (&const_fp_offset) > +- && const_fp_offset < max_push_offset) > ++ else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp) > ++ && const_above_fp < max_push_offset) > + { > + /* Frame with large area below the saved registers, or with SVE > saves, > + but with a small area above: > +@@ -8435,7 +8435,7 @@ aarch64_layout_frame (void) > + [sub sp, sp, below_hard_fp_saved_regs_size] > + [save SVE registers relative to SP] > + sub sp, sp, bytes_below_saved_regs */ > +- frame.callee_adjust = const_fp_offset; > ++ frame.callee_adjust = const_above_fp; > + frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; > + frame.final_adjust = frame.bytes_below_saved_regs; > + } > +@@ -8450,7 +8450,7 @@ aarch64_layout_frame (void) > + [sub sp, sp, below_hard_fp_saved_regs_size] > + [save SVE registers relative to SP] > + sub sp, sp, bytes_below_saved_regs */ > +- frame.initial_adjust = frame.hard_fp_offset; > ++ frame.initial_adjust = frame.bytes_above_hard_fp; > + frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; > + frame.final_adjust = frame.bytes_below_saved_regs; > + } > +@@ -9754,7 +9754,7 @@ aarch64_expand_prologue (void) > + { > + /* 
The offset of the frame chain record (if any) from the current > SP. */ > + poly_int64 chain_offset = (initial_adjust + callee_adjust > +- - frame.hard_fp_offset); > ++ - frame.bytes_above_hard_fp); > + gcc_assert (known_ge (chain_offset, 0)); > + > + if (callee_adjust == 0) > +@@ -12575,10 +12575,10 @@ aarch64_initial_elimination_offset (unsigned > from, unsigned to) > + if (to == HARD_FRAME_POINTER_REGNUM) > + { > + if (from == ARG_POINTER_REGNUM) > +- return frame.hard_fp_offset; > ++ return frame.bytes_above_hard_fp; > + > + if (from == FRAME_POINTER_REGNUM) > +- return frame.hard_fp_offset - frame.bytes_above_locals; > ++ return frame.bytes_above_hard_fp - frame.bytes_above_locals; > + } > + > + if (to == STACK_POINTER_REGNUM) > +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > +index bf46e6124aa..dd1f403f939 100644 > +--- a/gcc/config/aarch64/aarch64.h > ++++ b/gcc/config/aarch64/aarch64.h > +@@ -890,10 +890,10 @@ struct GTY (()) aarch64_frame > + STACK_BOUNDARY. */ > + poly_int64 bytes_above_locals; > + > +- /* Offset from the base of the frame (incomming SP) to the > +- hard_frame_pointer. This value is always a multiple of > ++ /* The number of bytes between the hard_frame_pointer and the top of > ++ the frame (the incomming SP). This value is always a multiple of > + STACK_BOUNDARY. */ > +- poly_int64 hard_fp_offset; > ++ poly_int64 bytes_above_hard_fp; > + > + /* The size of the frame. This value is the offset from base of the > + frame (incomming SP) to the stack_pointer. 
This value is always > +-- > +2.34.1 > + > + > +From aac8b31379ac3bbd14fc6427dce23f56e54e8485 Mon Sep 17 00:00:00 2001 > +From: Richard Sandiford <richard.sandiford@arm.com> > +Date: Tue, 12 Sep 2023 16:08:52 +0100 > +Subject: [PATCH 10/19] aarch64: Tweak frame_size comment > +MIME-Version: 1.0 > +Content-Type: text/plain; charset=UTF-8 > +Content-Transfer-Encoding: 8bit > + > +This patch fixes another case in which a value was described with > +an “upside-down” view. > + > +gcc/ > + * config/aarch64/aarch64.h (aarch64_frame::frame_size): Tweak comment. > +--- > + gcc/config/aarch64/aarch64.h | 4 ++-- > + 1 file changed, 2 insertions(+), 2 deletions(-) > + > +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > +index dd1f403f939..700524ae22b 100644 > +--- a/gcc/config/aarch64/aarch64.h > ++++ b/gcc/config/aarch64/aarch64.h > +@@ -895,8 +895,8 @@ struct GTY (()) aarch64_frame > + STACK_BOUNDARY. */ > + poly_int64 bytes_above_hard_fp; > + > +- /* The size of the frame. This value is the offset from base of the > +- frame (incomming SP) to the stack_pointer. This value is always > ++ /* The size of the frame, i.e. the number of bytes between the bottom > ++ of the outgoing arguments and the incoming SP. This value is always > + a multiple of STACK_BOUNDARY. */ > + poly_int64 frame_size; > + > +-- > +2.34.1 > + > + > +From 8d5506a8aeb8dd7e8b209a3663b07688478f76b9 Mon Sep 17 00:00:00 2001 > +From: Richard Sandiford <richard.sandiford@arm.com> > +Date: Tue, 12 Sep 2023 16:08:53 +0100 > +Subject: [PATCH 11/19] aarch64: Measure reg_offset from the bottom of the > + frame > + > +reg_offset was measured from the bottom of the saved register area. > +This made perfect sense with the original layout, since the bottom > +of the saved register area was also the hard frame pointer address. > +It became slightly less obvious with SVE, since we save SVE > +registers below the hard frame pointer, but it still made sense.
> + > +However, if we want to allow different frame layouts, it's more > +convenient and obvious to measure reg_offset from the bottom of > +the frame. After previous patches, it's also a slight simplification > +in its own right. > + > +gcc/ > + * config/aarch64/aarch64.h (aarch64_frame): Add comment above > + reg_offset. > + * config/aarch64/aarch64.cc (aarch64_layout_frame): Walk offsets > + from the bottom of the frame, rather than the bottom of the saved > + register area. Measure reg_offset from the bottom of the frame > + rather than the bottom of the saved register area. > + (aarch64_save_callee_saves): Update accordingly. > + (aarch64_restore_callee_saves): Likewise. > + (aarch64_get_separate_components): Likewise. > + (aarch64_process_components): Likewise. > +--- > + gcc/config/aarch64/aarch64.cc | 53 ++++++++++++++++------------------- > + gcc/config/aarch64/aarch64.h | 3 ++ > + 2 files changed, 27 insertions(+), 29 deletions(-) > + > +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > +index 3c4052740e7..97dd077844b 100644 > +--- a/gcc/config/aarch64/aarch64.cc > ++++ b/gcc/config/aarch64/aarch64.cc > +@@ -8139,7 +8139,6 @@ aarch64_needs_frame_chain (void) > + static void > + aarch64_layout_frame (void) > + { > +- poly_int64 offset = 0; > + int regno, last_fp_reg = INVALID_REGNUM; > + machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM); > + poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode); > +@@ -8217,7 +8216,9 @@ aarch64_layout_frame (void) > + gcc_assert (crtl->is_leaf > + || maybe_ne (frame.reg_offset[R30_REGNUM], > SLOT_NOT_REQUIRED)); > + > +- frame.bytes_below_saved_regs = crtl->outgoing_args_size; > ++ poly_int64 offset = crtl->outgoing_args_size; > ++ gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT)); > ++ frame.bytes_below_saved_regs = offset; > + > + /* Now assign stack slots for the registers. 
Start with the predicate > + registers, since predicate LDR and STR have a relatively small > +@@ -8229,7 +8230,8 @@ aarch64_layout_frame (void) > + offset += BYTES_PER_SVE_PRED; > + } > + > +- if (maybe_ne (offset, 0)) > ++ poly_int64 saved_prs_size = offset - frame.bytes_below_saved_regs; > ++ if (maybe_ne (saved_prs_size, 0)) > + { > + /* If we have any vector registers to save above the predicate > registers, > + the offset of the vector register save slots need to be a multiple > +@@ -8247,10 +8249,10 @@ aarch64_layout_frame (void) > + offset = aligned_upper_bound (offset, STACK_BOUNDARY / > BITS_PER_UNIT); > + else > + { > +- if (known_le (offset, vector_save_size)) > +- offset = vector_save_size; > +- else if (known_le (offset, vector_save_size * 2)) > +- offset = vector_save_size * 2; > ++ if (known_le (saved_prs_size, vector_save_size)) > ++ offset = frame.bytes_below_saved_regs + vector_save_size; > ++ else if (known_le (saved_prs_size, vector_save_size * 2)) > ++ offset = frame.bytes_below_saved_regs + vector_save_size * 2; > + else > + gcc_unreachable (); > + } > +@@ -8267,9 +8269,10 @@ aarch64_layout_frame (void) > + > + /* OFFSET is now the offset of the hard frame pointer from the bottom > + of the callee save area. */ > +- bool saves_below_hard_fp_p = maybe_ne (offset, 0); > +- frame.below_hard_fp_saved_regs_size = offset; > +- frame.bytes_below_hard_fp = offset + frame.bytes_below_saved_regs; > ++ frame.below_hard_fp_saved_regs_size = offset - > frame.bytes_below_saved_regs; > ++ bool saves_below_hard_fp_p > ++ = maybe_ne (frame.below_hard_fp_saved_regs_size, 0); > ++ frame.bytes_below_hard_fp = offset; > + if (frame.emit_frame_chain) > + { > + /* FP and LR are placed in the linkage record. 
*/ > +@@ -8320,9 +8323,10 @@ aarch64_layout_frame (void) > + > + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); > + > +- frame.saved_regs_size = offset; > ++ frame.saved_regs_size = offset - frame.bytes_below_saved_regs; > + > +- poly_int64 varargs_and_saved_regs_size = offset + > frame.saved_varargs_size; > ++ poly_int64 varargs_and_saved_regs_size > ++ = frame.saved_regs_size + frame.saved_varargs_size; > + > + poly_int64 saved_regs_and_above > + = aligned_upper_bound (varargs_and_saved_regs_size > +@@ -8790,9 +8794,7 @@ aarch64_save_callee_saves (poly_int64 > bytes_below_sp, > + > + machine_mode mode = aarch64_reg_save_mode (regno); > + reg = gen_rtx_REG (mode, regno); > +- offset = (frame.reg_offset[regno] > +- + frame.bytes_below_saved_regs > +- - bytes_below_sp); > ++ offset = frame.reg_offset[regno] - bytes_below_sp; > + rtx base_rtx = stack_pointer_rtx; > + poly_int64 sp_offset = offset; > + > +@@ -8899,9 +8901,7 @@ aarch64_restore_callee_saves (poly_int64 > bytes_below_sp, unsigned start, > + > + machine_mode mode = aarch64_reg_save_mode (regno); > + reg = gen_rtx_REG (mode, regno); > +- offset = (frame.reg_offset[regno] > +- + frame.bytes_below_saved_regs > +- - bytes_below_sp); > ++ offset = frame.reg_offset[regno] - bytes_below_sp; > + rtx base_rtx = stack_pointer_rtx; > + if (mode == VNx2DImode && BYTES_BIG_ENDIAN) > + aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg, > +@@ -9040,14 +9040,12 @@ aarch64_get_separate_components (void) > + it as a stack probe for -fstack-clash-protection. */ > + if (flag_stack_clash_protection > + && maybe_ne (frame.below_hard_fp_saved_regs_size, 0) > +- && known_eq (offset, 0)) > ++ && known_eq (offset, frame.bytes_below_saved_regs)) > + continue; > + > + /* Get the offset relative to the register we'll use. 
*/ > + if (frame_pointer_needed) > +- offset -= frame.below_hard_fp_saved_regs_size; > +- else > +- offset += frame.bytes_below_saved_regs; > ++ offset -= frame.bytes_below_hard_fp; > + > + /* Check that we can access the stack slot of the register with one > + direct load with no adjustments needed. */ > +@@ -9194,9 +9192,7 @@ aarch64_process_components (sbitmap components, > bool prologue_p) > + rtx reg = gen_rtx_REG (mode, regno); > + poly_int64 offset = frame.reg_offset[regno]; > + if (frame_pointer_needed) > +- offset -= frame.below_hard_fp_saved_regs_size; > +- else > +- offset += frame.bytes_below_saved_regs; > ++ offset -= frame.bytes_below_hard_fp; > + > + rtx addr = plus_constant (Pmode, ptr_reg, offset); > + rtx mem = gen_frame_mem (mode, addr); > +@@ -9248,9 +9244,7 @@ aarch64_process_components (sbitmap components, > bool prologue_p) > + /* REGNO2 can be saved/restored in a pair with REGNO. */ > + rtx reg2 = gen_rtx_REG (mode, regno2); > + if (frame_pointer_needed) > +- offset2 -= frame.below_hard_fp_saved_regs_size; > +- else > +- offset2 += frame.bytes_below_saved_regs; > ++ offset2 -= frame.bytes_below_hard_fp; > + rtx addr2 = plus_constant (Pmode, ptr_reg, offset2); > + rtx mem2 = gen_frame_mem (mode, addr2); > + rtx set2 = prologue_p ? 
gen_rtx_SET (mem2, reg2) > +@@ -9366,7 +9360,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, > rtx temp2, > + if (final_adjustment_p > + && known_eq (frame.below_hard_fp_saved_regs_size, 0)) > + { > +- poly_int64 lr_offset = frame.reg_offset[LR_REGNUM]; > ++ poly_int64 lr_offset = (frame.reg_offset[LR_REGNUM] > ++ - frame.bytes_below_saved_regs); > + if (known_ge (lr_offset, 0)) > + min_probe_threshold -= lr_offset.to_constant (); > + else > +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > +index 700524ae22b..b6135837073 100644 > +--- a/gcc/config/aarch64/aarch64.h > ++++ b/gcc/config/aarch64/aarch64.h > +@@ -860,6 +860,9 @@ extern enum aarch64_processor aarch64_tune; > + #ifdef HAVE_POLY_INT_H > + struct GTY (()) aarch64_frame > + { > ++ /* The offset from the bottom of the static frame (the bottom of the > ++ outgoing arguments) of each register save slot, or -2 if no save is > ++ needed. */ > + poly_int64 reg_offset[LAST_SAVED_REGNUM + 1]; > + > + /* The number of extra stack bytes taken up by register varargs. > +-- > +2.34.1 > + > + > +From b47766614df3b9df878262efb2ad73aaac108363 Mon Sep 17 00:00:00 2001 > +From: Richard Sandiford <richard.sandiford@arm.com> > +Date: Tue, 12 Sep 2023 16:08:53 +0100 > +Subject: [PATCH 12/19] aarch64: Simplify top of frame allocation > + > +After previous patches, it no longer really makes sense to allocate > +the top of the frame in terms of varargs_and_saved_regs_size and > +saved_regs_and_above. > + > +gcc/ > + * config/aarch64/aarch64.cc (aarch64_layout_frame): Simplify > + the allocation of the top of the frame. 
> +--- > + gcc/config/aarch64/aarch64.cc | 23 ++++++++--------------- > + 1 file changed, 8 insertions(+), 15 deletions(-) > + > +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > +index 97dd077844b..81935852d5b 100644 > +--- a/gcc/config/aarch64/aarch64.cc > ++++ b/gcc/config/aarch64/aarch64.cc > +@@ -8325,23 +8325,16 @@ aarch64_layout_frame (void) > + > + frame.saved_regs_size = offset - frame.bytes_below_saved_regs; > + > +- poly_int64 varargs_and_saved_regs_size > +- = frame.saved_regs_size + frame.saved_varargs_size; > +- > +- poly_int64 saved_regs_and_above > +- = aligned_upper_bound (varargs_and_saved_regs_size > +- + get_frame_size (), > +- STACK_BOUNDARY / BITS_PER_UNIT); > +- > +- frame.bytes_above_hard_fp > +- = saved_regs_and_above - frame.below_hard_fp_saved_regs_size; > ++ offset += get_frame_size (); > ++ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); > ++ auto top_of_locals = offset; > + > +- /* Both these values are already aligned. 
*/ > +- gcc_assert (multiple_p (frame.bytes_below_saved_regs, > +- STACK_BOUNDARY / BITS_PER_UNIT)); > +- frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs; > ++ offset += frame.saved_varargs_size; > ++ gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT)); > ++ frame.frame_size = offset; > + > +- frame.bytes_above_locals = frame.saved_varargs_size; > ++ frame.bytes_above_hard_fp = frame.frame_size - > frame.bytes_below_hard_fp; > ++ frame.bytes_above_locals = frame.frame_size - top_of_locals; > + > + frame.initial_adjust = 0; > + frame.final_adjust = 0; > +-- > +2.34.1 > + > + > +From 08f71b4bb28fb74d20e8d2927a557e8119ce9f4d Mon Sep 17 00:00:00 2001 > +From: Richard Sandiford <richard.sandiford@arm.com> > +Date: Tue, 12 Sep 2023 16:08:54 +0100 > +Subject: [PATCH 13/19] aarch64: Minor initial adjustment tweak > + > +This patch just changes a calculation of initial_adjust > +to one that makes it slightly more obvious that the total > +adjustment is frame.frame_size. > + > +gcc/ > + * config/aarch64/aarch64.cc (aarch64_layout_frame): Tweak > + calculation of initial_adjust for frames in which all saves > + are SVE saves. 
> +--- > + gcc/config/aarch64/aarch64.cc | 5 ++--- > + 1 file changed, 2 insertions(+), 3 deletions(-) > + > +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > +index 81935852d5b..4d9fcf3d162 100644 > +--- a/gcc/config/aarch64/aarch64.cc > ++++ b/gcc/config/aarch64/aarch64.cc > +@@ -8414,11 +8414,10 @@ aarch64_layout_frame (void) > + { > + /* Frame in which all saves are SVE saves: > + > +- sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size > ++ sub sp, sp, frame_size - bytes_below_saved_regs > + save SVE registers relative to SP > + sub sp, sp, bytes_below_saved_regs */ > +- frame.initial_adjust = (frame.bytes_above_hard_fp > +- + frame.below_hard_fp_saved_regs_size); > ++ frame.initial_adjust = frame.frame_size - > frame.bytes_below_saved_regs; > + frame.final_adjust = frame.bytes_below_saved_regs; > + } > + else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp) > +-- > +2.34.1 > + > + > +From f22315d5c19e8310e4dc880fd509678fd291fca8 Mon Sep 17 00:00:00 2001 > +From: Richard Sandiford <richard.sandiford@arm.com> > +Date: Tue, 12 Sep 2023 16:08:54 +0100 > +Subject: [PATCH 14/19] aarch64: Tweak stack clash boundary condition > + > +The AArch64 ABI says that, when stack clash protection is used, > +there can be a maximum of 1KiB of unprobed space at sp on entry > +to a function. Therefore, we need to probe when allocating > +>= guard_size - 1KiB of data (>= rather than >). This is what > +GCC does. > + > +If an allocation is exactly guard_size bytes, it is enough to allocate > +those bytes and probe once at offset 1024. It isn't possible to use a > +single probe at any other offset: higher would complicate later code, > +by leaving more unprobed space than usual, while lower would risk > +leaving an entire page unprobed. For simplicity, the code probes all > +allocations at offset 1024. > + > +Some register saves also act as probes. 
If we need to allocate > +more space below the last such register save probe, we need to > +probe the allocation if it is > 1KiB. Again, this allocation is > +then sometimes (but not always) probed at offset 1024. This sort of > +allocation is currently only used for outgoing arguments, which are > +rarely this big. > + > +However, the code also probed if this final outgoing-arguments > +allocation was == 1KiB, rather than just > 1KiB. This isn't > +necessary, since the register save then probes at offset 1024 > +as required. Continuing to probe allocations of exactly 1KiB > +would complicate later patches. > + > +gcc/ > + * config/aarch64/aarch64.cc > (aarch64_allocate_and_probe_stack_space): > + Don't probe final allocations that are exactly 1KiB in size (after > + unprobed space above the final allocation has been deducted). > + > +gcc/testsuite/ > + * gcc.target/aarch64/stack-check-prologue-17.c: New test. > +--- > + gcc/config/aarch64/aarch64.cc | 4 +- > + .../aarch64/stack-check-prologue-17.c | 55 +++++++++++++++++++ > + 2 files changed, 58 insertions(+), 1 deletion(-) > + create mode 100644 > gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c > + > +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > +index 4d9fcf3d162..34c1d8614cd 100644 > +--- a/gcc/config/aarch64/aarch64.cc > ++++ b/gcc/config/aarch64/aarch64.cc > +@@ -9333,9 +9333,11 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, > rtx temp2, > + HOST_WIDE_INT guard_size > + = 1 << param_stack_clash_protection_guard_size; > + HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD; > ++ HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT; > ++ gcc_assert (multiple_p (poly_size, byte_sp_alignment)); > + HOST_WIDE_INT min_probe_threshold > + = (final_adjustment_p > +- ? guard_used_by_caller > ++ ? 
guard_used_by_caller + byte_sp_alignment > + : guard_size - guard_used_by_caller); > + /* When doing the final adjustment for the outgoing arguments, take > into > + account any unprobed space there is above the current SP. There are > +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c > b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c > +new file mode 100644 > +index 00000000000..0d8a25d73a2 > +--- /dev/null > ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c > +@@ -0,0 +1,55 @@ > ++/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer > --param stack-clash-protection-guard-size=12" } */ > ++/* { dg-final { check-function-bodies "**" "" } } */ > ++ > ++void f(int, ...); > ++void g(); > ++ > ++/* > ++** test1: > ++** ... > ++** str x30, \[sp\] > ++** sub sp, sp, #1024 > ++** cbnz w0, .* > ++** bl g > ++** ... > ++*/ > ++int test1(int z) { > ++ __uint128_t x = 0; > ++ int y[0x400]; > ++ if (z) > ++ { > ++ f(0, 0, 0, 0, 0, 0, 0, &y, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); > ++ } > ++ g(); > ++ return 1; > ++} > ++ > ++/* > ++** test2: > ++** ... > ++** str x30, \[sp\] > ++** sub sp, sp, #1040 > ++** str xzr, \[sp\] > ++** cbnz w0, .* > ++** bl g > ++** ... 
> ++*/ > ++int test2(int z) { > ++ __uint128_t x = 0; > ++ int y[0x400]; > ++ if (z) > ++ { > ++ f(0, 0, 0, 0, 0, 0, 0, &y, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x); > ++ } > ++ g(); > ++ return 1; > ++} > +-- > +2.34.1 > + > + > +From 15e18831bf98fd25af098b970ebf0c9a6200a34b Mon Sep 17 00:00:00 2001 > +From: Richard Sandiford <richard.sandiford@arm.com> > +Date: Tue, 12 Sep 2023 16:08:55 +0100 > +Subject: [PATCH 15/19] aarch64: Put LR save probe in first 16 bytes > + > +-fstack-clash-protection uses the save of LR as a probe for the next > +allocation. The next allocation could be: > + > +* another part of the static frame, e.g. when allocating SVE save slots > + or outgoing arguments > + > +* an alloca in the same function > + > +* an allocation made by a callee function > + > +However, when -fomit-frame-pointer is used, the LR save slot is placed > +above the other GPR save slots. It could therefore be up to 80 bytes > +above the base of the GPR save area (which is also the hard fp address). > + > +aarch64_allocate_and_probe_stack_space took this into account when > +deciding how much subsequent space could be allocated without needing > +a probe. However, it interacted badly with: > + > + /* If doing a small final adjustment, we always probe at offset 0. > + This is done to avoid issues when LR is not at position 0 or when > + the final adjustment is smaller than the probing offset. */ > + else if (final_adjustment_p && rounded_size == 0) > + residual_probe_offset = 0; > + > +which forces any allocation that is smaller than the guard page size > +to be probed at offset 0 rather than the usual offset 1024. 
It was > +therefore possible to construct cases in which we had: > + > +* a probe using LR at SP + 80 bytes (or some other value >= 16) > +* an allocation of the guard page size - 16 bytes > +* a probe at SP + 0 > + > +which allocates guard page size + 64 consecutive unprobed bytes. > + > +This patch requires the LR probe to be in the first 16 bytes of the > +save area when stack clash protection is active. Doing it > +unconditionally would cause code-quality regressions. > + > +Putting LR before other registers prevents push/pop allocation > +when shadow call stacks are enabled, since LR is restored > +separately from the other callee-saved registers. > + > +The new comment doesn't say that the probe register is required > +to be LR, since a later patch removes that restriction. > + > +gcc/ > + * config/aarch64/aarch64.cc (aarch64_layout_frame): Ensure that > + the LR save slot is in the first 16 bytes of the register save > area. > + Only form STP/LDP push/pop candidates if both registers are valid. > + (aarch64_allocate_and_probe_stack_space): Remove workaround for > + when LR was not in the first 16 bytes. > + > +gcc/testsuite/ > + * gcc.target/aarch64/stack-check-prologue-18.c: New test. > + * gcc.target/aarch64/stack-check-prologue-19.c: Likewise. > + * gcc.target/aarch64/stack-check-prologue-20.c: Likewise. 
> +--- > + gcc/config/aarch64/aarch64.cc | 72 ++++++------- > + .../aarch64/stack-check-prologue-18.c | 100 ++++++++++++++++++ > + .../aarch64/stack-check-prologue-19.c | 100 ++++++++++++++++++ > + .../aarch64/stack-check-prologue-20.c | 3 + > + 4 files changed, 233 insertions(+), 42 deletions(-) > + create mode 100644 > gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c > + create mode 100644 > gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c > + create mode 100644 > gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c > + > +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > +index 34c1d8614cd..16433fb70f4 100644 > +--- a/gcc/config/aarch64/aarch64.cc > ++++ b/gcc/config/aarch64/aarch64.cc > +@@ -8273,26 +8273,34 @@ aarch64_layout_frame (void) > + bool saves_below_hard_fp_p > + = maybe_ne (frame.below_hard_fp_saved_regs_size, 0); > + frame.bytes_below_hard_fp = offset; > ++ > ++ auto allocate_gpr_slot = [&](unsigned int regno) > ++ { > ++ frame.reg_offset[regno] = offset; > ++ if (frame.wb_push_candidate1 == INVALID_REGNUM) > ++ frame.wb_push_candidate1 = regno; > ++ else if (frame.wb_push_candidate2 == INVALID_REGNUM) > ++ frame.wb_push_candidate2 = regno; > ++ offset += UNITS_PER_WORD; > ++ }; > ++ > + if (frame.emit_frame_chain) > + { > + /* FP and LR are placed in the linkage record. */ > +- frame.reg_offset[R29_REGNUM] = offset; > +- frame.wb_push_candidate1 = R29_REGNUM; > +- frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD; > +- frame.wb_push_candidate2 = R30_REGNUM; > +- offset += 2 * UNITS_PER_WORD; > ++ allocate_gpr_slot (R29_REGNUM); > ++ allocate_gpr_slot (R30_REGNUM); > + } > ++ else if (flag_stack_clash_protection > ++ && known_eq (frame.reg_offset[R30_REGNUM], SLOT_REQUIRED)) > ++ /* Put the LR save slot first, since it makes a good choice of probe > ++ for stack clash purposes. 
The idea is that the link register > usually > ++ has to be saved before a call anyway, and so we lose little by > ++ stopping it from being individually shrink-wrapped. */ > ++ allocate_gpr_slot (R30_REGNUM); > + > + for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++) > + if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) > +- { > +- frame.reg_offset[regno] = offset; > +- if (frame.wb_push_candidate1 == INVALID_REGNUM) > +- frame.wb_push_candidate1 = regno; > +- else if (frame.wb_push_candidate2 == INVALID_REGNUM) > +- frame.wb_push_candidate2 = regno; > +- offset += UNITS_PER_WORD; > +- } > ++ allocate_gpr_slot (regno); > + > + poly_int64 max_int_offset = offset; > + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); > +@@ -8370,10 +8378,13 @@ aarch64_layout_frame (void) > + max_push_offset to 0, because no registers are popped at this time, > + so callee_adjust cannot be adjusted. */ > + HOST_WIDE_INT max_push_offset = 0; > +- if (frame.wb_pop_candidate2 != INVALID_REGNUM) > +- max_push_offset = 512; > +- else if (frame.wb_pop_candidate1 != INVALID_REGNUM) > +- max_push_offset = 256; > ++ if (frame.wb_pop_candidate1 != INVALID_REGNUM) > ++ { > ++ if (frame.wb_pop_candidate2 != INVALID_REGNUM) > ++ max_push_offset = 512; > ++ else > ++ max_push_offset = 256; > ++ } > + > + HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp; > + HOST_WIDE_INT const_saved_regs_size; > +@@ -9339,29 +9350,6 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, > rtx temp2, > + = (final_adjustment_p > + ? guard_used_by_caller + byte_sp_alignment > + : guard_size - guard_used_by_caller); > +- /* When doing the final adjustment for the outgoing arguments, take > into > +- account any unprobed space there is above the current SP. There are > +- two cases: > +- > +- - When saving SVE registers below the hard frame pointer, we force > +- the lowest save to take place in the prologue before doing the > final > +- adjustment (i.e. 
we don't allow the save to be shrink-wrapped). > +- This acts as a probe at SP, so there is no unprobed space. > +- > +- - When there are no SVE register saves, we use the store of the link > +- register as a probe. We can't assume that LR was saved at > position 0 > +- though, so treat any space below it as unprobed. */ > +- if (final_adjustment_p > +- && known_eq (frame.below_hard_fp_saved_regs_size, 0)) > +- { > +- poly_int64 lr_offset = (frame.reg_offset[LR_REGNUM] > +- - frame.bytes_below_saved_regs); > +- if (known_ge (lr_offset, 0)) > +- min_probe_threshold -= lr_offset.to_constant (); > +- else > +- gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, > 0)); > +- } > +- > + poly_int64 frame_size = frame.frame_size; > + > + /* We should always have a positive probe threshold. */ > +@@ -9541,8 +9529,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, > rtx temp2, > + if (final_adjustment_p && rounded_size != 0) > + min_probe_threshold = 0; > + /* If doing a small final adjustment, we always probe at offset 0. > +- This is done to avoid issues when LR is not at position 0 or when > +- the final adjustment is smaller than the probing offset. */ > ++ This is done to avoid issues when the final adjustment is smaller > ++ than the probing offset. */ > + else if (final_adjustment_p && rounded_size == 0) > + residual_probe_offset = 0; > + > +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c > b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c > +new file mode 100644 > +index 00000000000..82447d20fff > +--- /dev/null > ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c > +@@ -0,0 +1,100 @@ > ++/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer > --param stack-clash-protection-guard-size=12" } */ > ++/* { dg-final { check-function-bodies "**" "" } } */ > ++ > ++void f(int, ...); > ++void g(); > ++ > ++/* > ++** test1: > ++** ... 
> ++** str x30, \[sp\] > ++** sub sp, sp, #4064 > ++** str xzr, \[sp\] > ++** cbnz w0, .* > ++** bl g > ++** ... > ++** str x26, \[sp, #?4128\] > ++** ... > ++*/ > ++int test1(int z) { > ++ __uint128_t x = 0; > ++ int y[0x400]; > ++ if (z) > ++ { > ++ asm volatile ("" ::: > ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25", > "x26"); > ++ f(0, 0, 0, 0, 0, 0, 0, &y, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x); > ++ } > ++ g(); > ++ return 1; > ++} > ++ > ++/* > ++** test2: > ++** ... > ++** str x30, \[sp\] > ++** sub sp, sp, #1040 > ++** str xzr, \[sp\] > ++** cbnz w0, .* > ++** bl g > ++** ... > ++*/ > ++int test2(int z) { > ++ __uint128_t x = 0; > ++ int y[0x400]; > ++ if (z) > ++ { > ++ asm volatile ("" ::: > ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25", > "x26"); > ++ f(0, 0, 0, 0, 0, 0, 0, &y, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x); > ++ } > ++ g(); > ++ return 1; > ++} > ++ > ++/* > ++** test3: > ++** ... > ++** str x30, \[sp\] > ++** sub sp, sp, #1024 > ++** cbnz w0, .* > ++** bl g > ++** ... 
> ++*/ > ++int test3(int z) { > ++ __uint128_t x = 0; > ++ int y[0x400]; > ++ if (z) > ++ { > ++ asm volatile ("" ::: > ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25", > "x26"); > ++ f(0, 0, 0, 0, 0, 0, 0, &y, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); > ++ } > ++ g(); > ++ return 1; > ++} > +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c > b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c > +new file mode 100644 > +index 00000000000..73ac3e4e4eb > +--- /dev/null > ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c > +@@ -0,0 +1,100 @@ > ++/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer > --param stack-clash-protection-guard-size=12 -fsanitize=shadow-call-stack > -ffixed-x18" } */ > ++/* { dg-final { check-function-bodies "**" "" } } */ > ++ > ++void f(int, ...); > ++void g(); > ++ > ++/* > ++** test1: > ++** ... > ++** str x30, \[sp\] > ++** sub sp, sp, #4064 > ++** str xzr, \[sp\] > ++** cbnz w0, .* > ++** bl g > ++** ... > ++** str x26, \[sp, #?4128\] > ++** ... 
> ++*/ > ++int test1(int z) { > ++ __uint128_t x = 0; > ++ int y[0x400]; > ++ if (z) > ++ { > ++ asm volatile ("" ::: > ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25", > "x26"); > ++ f(0, 0, 0, 0, 0, 0, 0, &y, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x); > ++ } > ++ g(); > ++ return 1; > ++} > ++ > ++/* > ++** test2: > ++** ... > ++** str x30, \[sp\] > ++** sub sp, sp, #1040 > ++** str xzr, \[sp\] > ++** cbnz w0, .* > ++** bl g > ++** ... > ++*/ > ++int test2(int z) { > ++ __uint128_t x = 0; > ++ int y[0x400]; > ++ if (z) > ++ { > ++ asm volatile ("" ::: > ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25", > "x26"); > ++ f(0, 0, 0, 0, 0, 0, 0, &y, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x); > ++ } > ++ g(); > ++ return 1; > ++} > ++ > ++/* > ++** test3: > ++** ... > ++** str x30, \[sp\] > ++** sub sp, sp, #1024 > ++** cbnz w0, .* > ++** bl g > ++** ... 
> ++*/ > ++int test3(int z) { > ++ __uint128_t x = 0; > ++ int y[0x400]; > ++ if (z) > ++ { > ++ asm volatile ("" ::: > ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25", > "x26"); > ++ f(0, 0, 0, 0, 0, 0, 0, &y, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, > ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); > ++ } > ++ g(); > ++ return 1; > ++} > +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c > b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c > +new file mode 100644 > +index 00000000000..690aae8dfd5 > +--- /dev/null > ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c > +@@ -0,0 +1,3 @@ > ++/* { dg-options "-O2 -fstack-protector-all -fstack-clash-protection > -fomit-frame-pointer --param stack-clash-protection-guard-size=12 > -fsanitize=shadow-call-stack -ffixed-x18" } */ > ++ > ++#include "stack-check-prologue-19.c" > +-- > +2.34.1 > + > + > +From c4f0e121faa36342f1d21919e54a05ad841c4f86 Mon Sep 17 00:00:00 2001 > +From: Richard Sandiford <richard.sandiford@arm.com> > +Date: Tue, 12 Sep 2023 16:08:55 +0100 > +Subject: [PATCH 16/19] aarch64: Simplify probe of final frame allocation > + > +Previous patches ensured that the final frame allocation only needs > +a probe when the size is strictly greater than 1KiB. It's therefore > +safe to use the normal 1024 probe offset in all cases. > + > +The main motivation for doing this is to simplify the code and > +reduce the number of special cases. > + > +gcc/ > + * config/aarch64/aarch64.cc > (aarch64_allocate_and_probe_stack_space): > + Always probe the residual allocation at offset 1024, asserting > + that that is in range. > + > +gcc/testsuite/ > + * gcc.target/aarch64/stack-check-prologue-17.c: Expect the probe > + to be at offset 1024 rather than offset 0. > + * gcc.target/aarch64/stack-check-prologue-18.c: Likewise. 
> + * gcc.target/aarch64/stack-check-prologue-19.c: Likewise. > +--- > + gcc/config/aarch64/aarch64.cc | 12 ++++-------- > + .../gcc.target/aarch64/stack-check-prologue-17.c | 2 +- > + .../gcc.target/aarch64/stack-check-prologue-18.c | 4 ++-- > + .../gcc.target/aarch64/stack-check-prologue-19.c | 4 ++-- > + 4 files changed, 9 insertions(+), 13 deletions(-) > + > +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > +index 16433fb70f4..8abf3d7a1e2 100644 > +--- a/gcc/config/aarch64/aarch64.cc > ++++ b/gcc/config/aarch64/aarch64.cc > +@@ -9523,16 +9523,12 @@ aarch64_allocate_and_probe_stack_space (rtx > temp1, rtx temp2, > + are still safe. */ > + if (residual) > + { > +- HOST_WIDE_INT residual_probe_offset = guard_used_by_caller; > ++ gcc_assert (guard_used_by_caller + byte_sp_alignment <= size); > ++ > + /* If we're doing final adjustments, and we've done any full page > + allocations then any residual needs to be probed. */ > + if (final_adjustment_p && rounded_size != 0) > + min_probe_threshold = 0; > +- /* If doing a small final adjustment, we always probe at offset 0. > +- This is done to avoid issues when the final adjustment is smaller > +- than the probing offset. */ > +- else if (final_adjustment_p && rounded_size == 0) > +- residual_probe_offset = 0; > + > + aarch64_sub_sp (temp1, temp2, residual, frame_related_p); > + if (residual >= min_probe_threshold) > +@@ -9543,8 +9539,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, > rtx temp2, > + HOST_WIDE_INT_PRINT_DEC " bytes, probing will be > required." 
> + "\n", residual); > + > +- emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, > +- residual_probe_offset)); > ++ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, > ++ guard_used_by_caller)); > + emit_insn (gen_blockage ()); > + } > + } > +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c > b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c > +index 0d8a25d73a2..f0ec1389771 100644 > +--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c > ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c > +@@ -33,7 +33,7 @@ int test1(int z) { > + ** ... > + ** str x30, \[sp\] > + ** sub sp, sp, #1040 > +-** str xzr, \[sp\] > ++** str xzr, \[sp, #?1024\] > + ** cbnz w0, .* > + ** bl g > + ** ... > +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c > b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c > +index 82447d20fff..6383bec5ebc 100644 > +--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c > ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c > +@@ -9,7 +9,7 @@ void g(); > + ** ... > + ** str x30, \[sp\] > + ** sub sp, sp, #4064 > +-** str xzr, \[sp\] > ++** str xzr, \[sp, #?1024\] > + ** cbnz w0, .* > + ** bl g > + ** ... > +@@ -50,7 +50,7 @@ int test1(int z) { > + ** ... > + ** str x30, \[sp\] > + ** sub sp, sp, #1040 > +-** str xzr, \[sp\] > ++** str xzr, \[sp, #?1024\] > + ** cbnz w0, .* > + ** bl g > + ** ... > +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c > b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c > +index 73ac3e4e4eb..562039b5e9b 100644 > +--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c > ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c > +@@ -9,7 +9,7 @@ void g(); > + ** ... > + ** str x30, \[sp\] > + ** sub sp, sp, #4064 > +-** str xzr, \[sp\] > ++** str xzr, \[sp, #?1024\] > + ** cbnz w0, .* > + ** bl g > + ** ... 
> +@@ -50,7 +50,7 @@ int test1(int z) { > + ** ... > + ** str x30, \[sp\] > + ** sub sp, sp, #1040 > +-** str xzr, \[sp\] > ++** str xzr, \[sp, #?1024\] > + ** cbnz w0, .* > + ** bl g > + ** ... > +-- > +2.34.1 > + > + > +From 6f0ab0a9f46a17b68349ff6035aa776bf65f0575 Mon Sep 17 00:00:00 2001 > +From: Richard Sandiford <richard.sandiford@arm.com> > +Date: Tue, 12 Sep 2023 16:08:56 +0100 > +Subject: [PATCH 17/19] aarch64: Explicitly record probe registers in frame > + info > + > +The stack frame is currently divided into three areas: > + > +A: the area above the hard frame pointer > +B: the SVE saves below the hard frame pointer > +C: the outgoing arguments > + > +If the stack frame is allocated in one chunk, the allocation needs a > +probe if the frame size is >= guard_size - 1KiB. In addition, if the > +function is not a leaf function, it must probe an address no more than > +1KiB above the outgoing SP. We ensured the second condition by > + > +(1) using single-chunk allocations for non-leaf functions only if > + the link register save slot is within 512 bytes of the bottom > + of the frame; and > + > +(2) using the link register save as a probe (meaning, for instance, > + that it can't be individually shrink wrapped) > + > +If instead the stack is allocated in multiple chunks, then: > + > +* an allocation involving only the outgoing arguments (C above) requires > + a probe if the allocation size is > 1KiB > + > +* any other allocation requires a probe if the allocation size > + is >= guard_size - 1KiB > + > +* second and subsequent allocations require the previous allocation > + to probe at the bottom of the allocated area, regardless of the size > + of that previous allocation > + > +The final point means that, unlike for single allocations, > +it can be necessary to have both a non-SVE register probe and > +an SVE register probe. 
For example: > + > +* allocate A, probe using a non-SVE register save > +* allocate B, probe using an SVE register save > +* allocate C > + > +The non-SVE register used in this case was again the link register. > +It was previously used even if the link register save slot was some > +bytes above the bottom of the non-SVE register saves, but an earlier > +patch avoided that by putting the link register save slot first. > + > +As a belt-and-braces fix, this patch explicitly records which > +probe registers we're using and allows the non-SVE probe to be > +whichever register comes first (as for SVE). > + > +The patch also avoids unnecessary probes in sve/pcs/stack_clash_3.c. > + > +gcc/ > + * config/aarch64/aarch64.h (aarch64_frame::sve_save_and_probe) > + (aarch64_frame::hard_fp_save_and_probe): New fields. > + * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize > them. > + Rather than asserting that a leaf function saves LR, instead assert > + that a leaf function saves something. > + (aarch64_get_separate_components): Prevent the chosen probe > + registers from being individually shrink-wrapped. > + (aarch64_allocate_and_probe_stack_space): Remove workaround for > + probe registers that aren't at the bottom of the previous > allocation. > + > +gcc/testsuite/ > + * gcc.target/aarch64/sve/pcs/stack_clash_3.c: Avoid redundant > probes. > +--- > + gcc/config/aarch64/aarch64.cc | 68 +++++++++++++++---- > + gcc/config/aarch64/aarch64.h | 8 +++ > + .../aarch64/sve/pcs/stack_clash_3.c | 6 +- > + 3 files changed, 64 insertions(+), 18 deletions(-) > + > +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > +index 8abf3d7a1e2..a8d907df884 100644 > +--- a/gcc/config/aarch64/aarch64.cc > ++++ b/gcc/config/aarch64/aarch64.cc > +@@ -8210,15 +8210,11 @@ aarch64_layout_frame (void) > + && !crtl->abi->clobbers_full_reg_p (regno)) > + frame.reg_offset[regno] = SLOT_REQUIRED; > + > +- /* With stack-clash, LR must be saved in non-leaf functions. 
The > saving of > +- LR counts as an implicit probe which allows us to maintain the > invariant > +- described in the comment at expand_prologue. */ > +- gcc_assert (crtl->is_leaf > +- || maybe_ne (frame.reg_offset[R30_REGNUM], > SLOT_NOT_REQUIRED)); > + > + poly_int64 offset = crtl->outgoing_args_size; > + gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT)); > + frame.bytes_below_saved_regs = offset; > ++ frame.sve_save_and_probe = INVALID_REGNUM; > + > + /* Now assign stack slots for the registers. Start with the predicate > + registers, since predicate LDR and STR have a relatively small > +@@ -8226,6 +8222,8 @@ aarch64_layout_frame (void) > + for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++) > + if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) > + { > ++ if (frame.sve_save_and_probe == INVALID_REGNUM) > ++ frame.sve_save_and_probe = regno; > + frame.reg_offset[regno] = offset; > + offset += BYTES_PER_SVE_PRED; > + } > +@@ -8263,6 +8261,8 @@ aarch64_layout_frame (void) > + for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) > + if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) > + { > ++ if (frame.sve_save_and_probe == INVALID_REGNUM) > ++ frame.sve_save_and_probe = regno; > + frame.reg_offset[regno] = offset; > + offset += vector_save_size; > + } > +@@ -8272,10 +8272,18 @@ aarch64_layout_frame (void) > + frame.below_hard_fp_saved_regs_size = offset - > frame.bytes_below_saved_regs; > + bool saves_below_hard_fp_p > + = maybe_ne (frame.below_hard_fp_saved_regs_size, 0); > ++ gcc_assert (!saves_below_hard_fp_p > ++ || (frame.sve_save_and_probe != INVALID_REGNUM > ++ && known_eq (frame.reg_offset[frame.sve_save_and_probe], > ++ frame.bytes_below_saved_regs))); > ++ > + frame.bytes_below_hard_fp = offset; > ++ frame.hard_fp_save_and_probe = INVALID_REGNUM; > + > + auto allocate_gpr_slot = [&](unsigned int regno) > + { > ++ if (frame.hard_fp_save_and_probe == INVALID_REGNUM) > ++ frame.hard_fp_save_and_probe = regno; > + 
frame.reg_offset[regno] = offset; > + if (frame.wb_push_candidate1 == INVALID_REGNUM) > + frame.wb_push_candidate1 = regno; > +@@ -8309,6 +8317,8 @@ aarch64_layout_frame (void) > + for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) > + if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) > + { > ++ if (frame.hard_fp_save_and_probe == INVALID_REGNUM) > ++ frame.hard_fp_save_and_probe = regno; > + /* If there is an alignment gap between integer and fp > callee-saves, > + allocate the last fp register to it if possible. */ > + if (regno == last_fp_reg > +@@ -8332,6 +8342,17 @@ aarch64_layout_frame (void) > + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); > + > + frame.saved_regs_size = offset - frame.bytes_below_saved_regs; > ++ gcc_assert (known_eq (frame.saved_regs_size, > ++ frame.below_hard_fp_saved_regs_size) > ++ || (frame.hard_fp_save_and_probe != INVALID_REGNUM > ++ && known_eq > (frame.reg_offset[frame.hard_fp_save_and_probe], > ++ frame.bytes_below_hard_fp))); > ++ > ++ /* With stack-clash, a register must be saved in non-leaf functions. > ++ The saving of the bottommost register counts as an implicit probe, > ++ which allows us to maintain the invariant described in the comment > ++ at expand_prologue. */ > ++ gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0)); > + > + offset += get_frame_size (); > + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); > +@@ -8462,6 +8483,25 @@ aarch64_layout_frame (void) > + frame.final_adjust = frame.bytes_below_saved_regs; > + } > + > ++ /* The frame is allocated in pieces, with each non-final piece > ++ including a register save at offset 0 that acts as a probe for > ++ the following piece. In addition, the save of the bottommost > register > ++ acts as a probe for callees and allocas. Roll back any probes that > ++ aren't needed. 
> ++ > ++ A probe isn't needed if it is associated with the final allocation > ++ (including callees and allocas) that happens before the epilogue is > ++ executed. */ > ++ if (crtl->is_leaf > ++ && !cfun->calls_alloca > ++ && known_eq (frame.final_adjust, 0)) > ++ { > ++ if (maybe_ne (frame.sve_callee_adjust, 0)) > ++ frame.sve_save_and_probe = INVALID_REGNUM; > ++ else > ++ frame.hard_fp_save_and_probe = INVALID_REGNUM; > ++ } > ++ > + /* Make sure the individual adjustments add up to the full frame > size. */ > + gcc_assert (known_eq (frame.initial_adjust > + + frame.callee_adjust > +@@ -9039,13 +9079,6 @@ aarch64_get_separate_components (void) > + > + poly_int64 offset = frame.reg_offset[regno]; > + > +- /* If the register is saved in the first SVE save slot, we use > +- it as a stack probe for -fstack-clash-protection. */ > +- if (flag_stack_clash_protection > +- && maybe_ne (frame.below_hard_fp_saved_regs_size, 0) > +- && known_eq (offset, frame.bytes_below_saved_regs)) > +- continue; > +- > + /* Get the offset relative to the register we'll use. */ > + if (frame_pointer_needed) > + offset -= frame.bytes_below_hard_fp; > +@@ -9080,6 +9113,13 @@ aarch64_get_separate_components (void) > + > + bitmap_clear_bit (components, LR_REGNUM); > + bitmap_clear_bit (components, SP_REGNUM); > ++ if (flag_stack_clash_protection) > ++ { > ++ if (frame.sve_save_and_probe != INVALID_REGNUM) > ++ bitmap_clear_bit (components, frame.sve_save_and_probe); > ++ if (frame.hard_fp_save_and_probe != INVALID_REGNUM) > ++ bitmap_clear_bit (components, frame.hard_fp_save_and_probe); > ++ } > + > + return components; > + } > +@@ -9616,8 +9656,8 @@ aarch64_epilogue_uses (int regno) > + When probing is needed, we emit a probe at the start of the prologue > + and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter. > + > +- We have to track how much space has been allocated and the only stores > +- to the stack we track as implicit probes are the FP/LR stores. 
> ++ We can also use register saves as probes. These are stored in > ++ sve_save_and_probe and hard_fp_save_and_probe. > + > + For outgoing arguments we probe if the size is larger than 1KB, such > that > + the ABI specified buffer is maintained for the next callee. > +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > +index b6135837073..46d4693e206 100644 > +--- a/gcc/config/aarch64/aarch64.h > ++++ b/gcc/config/aarch64/aarch64.h > +@@ -957,6 +957,14 @@ struct GTY (()) aarch64_frame > + This is the register they should use. */ > + unsigned spare_pred_reg; > + > ++ /* An SVE register that is saved below the hard frame pointer and that > acts > ++ as a probe for later allocations, or INVALID_REGNUM if none. */ > ++ unsigned sve_save_and_probe; > ++ > ++ /* A register that is saved at the hard frame pointer and that acts > ++ as a probe for later allocations, or INVALID_REGNUM if none. */ > ++ unsigned hard_fp_save_and_probe; > ++ > + bool laid_out; > + > + /* True if shadow call stack should be enabled for the current > function. */ > +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c > b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c > +index 3e01ec36c3a..3530a0d504b 100644 > +--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c > ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c > +@@ -11,11 +11,10 @@ > + ** mov x11, sp > + ** ... > + ** sub sp, sp, x13 > +-** str p4, \[sp\] > + ** cbz w0, [^\n]* > ++** str p4, \[sp\] > + ** ... > + ** ptrue p0\.b, all > +-** ldr p4, \[sp\] > + ** addvl sp, sp, #1 > + ** ldr x24, \[sp\], 32 > + ** ret > +@@ -39,13 +38,12 @@ test_1 (int n) > + ** mov x11, sp > + ** ... > + ** sub sp, sp, x13 > +-** str p4, \[sp\] > + ** cbz w0, [^\n]* > ++** str p4, \[sp\] > + ** str p5, \[sp, #1, mul vl\] > + ** str p6, \[sp, #2, mul vl\] > + ** ... 
> + ** ptrue p0\.b, all > +-** ldr p4, \[sp\] > + ** addvl sp, sp, #1 > + ** ldr x24, \[sp\], 32 > + ** ret > +-- > +2.34.1 > + > + > +From 8254e1b9cd500e0c278465a3657543477e9d1250 Mon Sep 17 00:00:00 2001 > +From: Richard Sandiford <richard.sandiford@arm.com> > +Date: Tue, 12 Sep 2023 16:08:56 +0100 > +Subject: [PATCH 18/19] aarch64: Remove below_hard_fp_saved_regs_size > + > +After previous patches, it's no longer necessary to store > +saved_regs_size and below_hard_fp_saved_regs_size in the frame info. > +All measurements instead use the top or bottom of the frame as > +reference points. > + > +gcc/ > + * config/aarch64/aarch64.h (aarch64_frame::saved_regs_size) > + (aarch64_frame::below_hard_fp_saved_regs_size): Delete. > + * config/aarch64/aarch64.cc (aarch64_layout_frame): Update > accordingly. > +--- > + gcc/config/aarch64/aarch64.cc | 45 ++++++++++++++++------------------- > + gcc/config/aarch64/aarch64.h | 7 ------ > + 2 files changed, 21 insertions(+), 31 deletions(-) > + > +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > +index a8d907df884..ac3d3b336a3 100644 > +--- a/gcc/config/aarch64/aarch64.cc > ++++ b/gcc/config/aarch64/aarch64.cc > +@@ -8269,9 +8269,8 @@ aarch64_layout_frame (void) > + > + /* OFFSET is now the offset of the hard frame pointer from the bottom > + of the callee save area. 
*/ > +- frame.below_hard_fp_saved_regs_size = offset - > frame.bytes_below_saved_regs; > +- bool saves_below_hard_fp_p > +- = maybe_ne (frame.below_hard_fp_saved_regs_size, 0); > ++ auto below_hard_fp_saved_regs_size = offset - > frame.bytes_below_saved_regs; > ++ bool saves_below_hard_fp_p = maybe_ne (below_hard_fp_saved_regs_size, > 0); > + gcc_assert (!saves_below_hard_fp_p > + || (frame.sve_save_and_probe != INVALID_REGNUM > + && known_eq (frame.reg_offset[frame.sve_save_and_probe], > +@@ -8341,9 +8340,8 @@ aarch64_layout_frame (void) > + > + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); > + > +- frame.saved_regs_size = offset - frame.bytes_below_saved_regs; > +- gcc_assert (known_eq (frame.saved_regs_size, > +- frame.below_hard_fp_saved_regs_size) > ++ auto saved_regs_size = offset - frame.bytes_below_saved_regs; > ++ gcc_assert (known_eq (saved_regs_size, below_hard_fp_saved_regs_size) > + || (frame.hard_fp_save_and_probe != INVALID_REGNUM > + && known_eq > (frame.reg_offset[frame.hard_fp_save_and_probe], > + frame.bytes_below_hard_fp))); > +@@ -8352,7 +8350,7 @@ aarch64_layout_frame (void) > + The saving of the bottommost register counts as an implicit probe, > + which allows us to maintain the invariant described in the comment > + at expand_prologue. 
*/ > +- gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0)); > ++ gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0)); > + > + offset += get_frame_size (); > + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); > +@@ -8409,7 +8407,7 @@ aarch64_layout_frame (void) > + > + HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp; > + HOST_WIDE_INT const_saved_regs_size; > +- if (known_eq (frame.saved_regs_size, 0)) > ++ if (known_eq (saved_regs_size, 0)) > + frame.initial_adjust = frame.frame_size; > + else if (frame.frame_size.is_constant (&const_size) > + && const_size < max_push_offset > +@@ -8422,7 +8420,7 @@ aarch64_layout_frame (void) > + frame.callee_adjust = const_size; > + } > + else if (frame.bytes_below_saved_regs.is_constant > (&const_below_saved_regs) > +- && frame.saved_regs_size.is_constant (&const_saved_regs_size) > ++ && saved_regs_size.is_constant (&const_saved_regs_size) > + && const_below_saved_regs + const_saved_regs_size < 512 > + /* We could handle this case even with data below the saved > + registers, provided that that data left us with valid offsets > +@@ -8441,8 +8439,7 @@ aarch64_layout_frame (void) > + frame.initial_adjust = frame.frame_size; > + } > + else if (saves_below_hard_fp_p > +- && known_eq (frame.saved_regs_size, > +- frame.below_hard_fp_saved_regs_size)) > ++ && known_eq (saved_regs_size, below_hard_fp_saved_regs_size)) > + { > + /* Frame in which all saves are SVE saves: > + > +@@ -8464,7 +8461,7 @@ aarch64_layout_frame (void) > + [save SVE registers relative to SP] > + sub sp, sp, bytes_below_saved_regs */ > + frame.callee_adjust = const_above_fp; > +- frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; > ++ frame.sve_callee_adjust = below_hard_fp_saved_regs_size; > + frame.final_adjust = frame.bytes_below_saved_regs; > + } > + else > +@@ -8479,7 +8476,7 @@ aarch64_layout_frame (void) > + [save SVE registers relative to SP] > + sub sp, sp, bytes_below_saved_regs */ 
> + frame.initial_adjust = frame.bytes_above_hard_fp; > +- frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; > ++ frame.sve_callee_adjust = below_hard_fp_saved_regs_size; > + frame.final_adjust = frame.bytes_below_saved_regs; > + } > + > +@@ -9621,17 +9618,17 @@ aarch64_epilogue_uses (int regno) > + | local variables | <-- frame_pointer_rtx > + | | > + +-------------------------------+ > +- | padding | \ > +- +-------------------------------+ | > +- | callee-saved registers | | frame.saved_regs_size > +- +-------------------------------+ | > +- | LR' | | > +- +-------------------------------+ | > +- | FP' | | > +- +-------------------------------+ |<- hard_frame_pointer_rtx > (aligned) > +- | SVE vector registers | | \ > +- +-------------------------------+ | | > below_hard_fp_saved_regs_size > +- | SVE predicate registers | / / > ++ | padding | > ++ +-------------------------------+ > ++ | callee-saved registers | > ++ +-------------------------------+ > ++ | LR' | > ++ +-------------------------------+ > ++ | FP' | > ++ +-------------------------------+ <-- hard_frame_pointer_rtx > (aligned) > ++ | SVE vector registers | > ++ +-------------------------------+ > ++ | SVE predicate registers | > + +-------------------------------+ > + | dynamic allocation | > + +-------------------------------+ > +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > +index 46d4693e206..01f7751bc78 100644 > +--- a/gcc/config/aarch64/aarch64.h > ++++ b/gcc/config/aarch64/aarch64.h > +@@ -871,18 +871,11 @@ struct GTY (()) aarch64_frame > + STACK_BOUNDARY. */ > + HOST_WIDE_INT saved_varargs_size; > + > +- /* The size of the callee-save registers with a slot in REG_OFFSET. */ > +- poly_int64 saved_regs_size; > +- > + /* The number of bytes between the bottom of the static frame (the > bottom > + of the outgoing arguments) and the bottom of the register save area. > + This value is always a multiple of STACK_BOUNDARY. 
*/ > + poly_int64 bytes_below_saved_regs; > + > +- /* The size of the callee-save registers with a slot in REG_OFFSET that > +- are saved below the hard frame pointer. */ > +- poly_int64 below_hard_fp_saved_regs_size; > +- > + /* The number of bytes between the bottom of the static frame (the > bottom > + of the outgoing arguments) and the hard frame pointer. This value > is > + always a multiple of STACK_BOUNDARY. */ > +-- > +2.34.1 > + > + > +From 75c37e031408262263442f5b4cdb83d3777b6422 Mon Sep 17 00:00:00 2001 > +From: Richard Sandiford <richard.sandiford@arm.com> > +Date: Tue, 12 Sep 2023 16:08:57 +0100 > +Subject: [PATCH 19/19] aarch64: Make stack smash canary protect saved > + registers > + > +AArch64 normally puts the saved registers near the bottom of the frame, > +immediately above any dynamic allocations. But this means that a > +stack-smash attack on those dynamic allocations could overwrite the > +saved registers without needing to reach as far as the stack smash > +canary. > + > +The same thing could also happen for variable-sized arguments that are > +passed by value, since those are allocated before a call and popped on > +return. > + > +This patch avoids that by putting the locals (and thus the canary) below > +the saved registers when stack smash protection is active. > + > +The patch fixes CVE-2023-4039. > + > +gcc/ > + * config/aarch64/aarch64.cc (aarch64_save_regs_above_locals_p): > + New function. > + (aarch64_layout_frame): Use it to decide whether locals should > + go above or below the saved registers. > + (aarch64_expand_prologue): Update stack layout comment. > + Emit a stack tie after the final adjustment. > + > +gcc/testsuite/ > + * gcc.target/aarch64/stack-protector-8.c: New test. > + * gcc.target/aarch64/stack-protector-9.c: Likewise. 
> +--- > + gcc/config/aarch64/aarch64.cc | 46 +++++++-- > + .../gcc.target/aarch64/stack-protector-8.c | 95 +++++++++++++++++++ > + .../gcc.target/aarch64/stack-protector-9.c | 33 +++++++ > + 3 files changed, 168 insertions(+), 6 deletions(-) > + create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-8.c > + create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-9.c > + > +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > +index ac3d3b336a3..96c3f48fdc4 100644 > +--- a/gcc/config/aarch64/aarch64.cc > ++++ b/gcc/config/aarch64/aarch64.cc > +@@ -8133,6 +8133,20 @@ aarch64_needs_frame_chain (void) > + return aarch64_use_frame_pointer; > + } > + > ++/* Return true if the current function should save registers above > ++ the locals area, rather than below it. */ > ++ > ++static bool > ++aarch64_save_regs_above_locals_p () > ++{ > ++ /* When using stack smash protection, make sure that the canary slot > ++ comes between the locals and the saved registers. Otherwise, > ++ it would be possible for a carefully sized smash attack to change > ++ the saved registers (particularly LR and FP) without reaching the > ++ canary. */ > ++ return crtl->stack_protect_guard; > ++} > ++ > + /* Mark the registers that need to be saved by the callee and calculate > + the size of the callee-saved registers area and frame record (both FP > + and LR may be omitted). 
*/ > +@@ -8144,6 +8158,7 @@ aarch64_layout_frame (void) > + poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode); > + bool frame_related_fp_reg_p = false; > + aarch64_frame &frame = cfun->machine->frame; > ++ poly_int64 top_of_locals = -1; > + > + frame.emit_frame_chain = aarch64_needs_frame_chain (); > + > +@@ -8210,9 +8225,16 @@ aarch64_layout_frame (void) > + && !crtl->abi->clobbers_full_reg_p (regno)) > + frame.reg_offset[regno] = SLOT_REQUIRED; > + > ++ bool regs_at_top_p = aarch64_save_regs_above_locals_p (); > + > + poly_int64 offset = crtl->outgoing_args_size; > + gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT)); > ++ if (regs_at_top_p) > ++ { > ++ offset += get_frame_size (); > ++ offset = aligned_upper_bound (offset, STACK_BOUNDARY / > BITS_PER_UNIT); > ++ top_of_locals = offset; > ++ } > + frame.bytes_below_saved_regs = offset; > + frame.sve_save_and_probe = INVALID_REGNUM; > + > +@@ -8352,15 +8374,18 @@ aarch64_layout_frame (void) > + at expand_prologue. 
*/ > + gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0)); > + > +- offset += get_frame_size (); > +- offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); > +- auto top_of_locals = offset; > +- > ++ if (!regs_at_top_p) > ++ { > ++ offset += get_frame_size (); > ++ offset = aligned_upper_bound (offset, STACK_BOUNDARY / > BITS_PER_UNIT); > ++ top_of_locals = offset; > ++ } > + offset += frame.saved_varargs_size; > + gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT)); > + frame.frame_size = offset; > + > + frame.bytes_above_hard_fp = frame.frame_size - > frame.bytes_below_hard_fp; > ++ gcc_assert (known_ge (top_of_locals, 0)); > + frame.bytes_above_locals = frame.frame_size - top_of_locals; > + > + frame.initial_adjust = 0; > +@@ -9615,10 +9640,10 @@ aarch64_epilogue_uses (int regno) > + | for register varargs | > + | | > + +-------------------------------+ > +- | local variables | <-- frame_pointer_rtx > ++ | local variables (1) | <-- frame_pointer_rtx > + | | > + +-------------------------------+ > +- | padding | > ++ | padding (1) | > + +-------------------------------+ > + | callee-saved registers | > + +-------------------------------+ > +@@ -9630,6 +9655,10 @@ aarch64_epilogue_uses (int regno) > + +-------------------------------+ > + | SVE predicate registers | > + +-------------------------------+ > ++ | local variables (2) | > ++ +-------------------------------+ > ++ | padding (2) | > ++ +-------------------------------+ > + | dynamic allocation | > + +-------------------------------+ > + | padding | > +@@ -9639,6 +9668,9 @@ aarch64_epilogue_uses (int regno) > + +-------------------------------+ > + | | <-- stack_pointer_rtx (aligned) > + > ++ The regions marked (1) and (2) are mutually exclusive. (2) is used > ++ when aarch64_save_regs_above_locals_p is true. > ++ > + Dynamic stack allocations via alloca() decrease stack_pointer_rtx > + but leave frame_pointer_rtx and hard_frame_pointer_rtx > + unchanged. 
> +@@ -9834,6 +9866,8 @@ aarch64_expand_prologue (void) > + gcc_assert (known_eq (bytes_below_sp, final_adjust)); > + aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, > final_adjust, > + !frame_pointer_needed, true); > ++ if (emit_frame_chain && maybe_ne (final_adjust, 0)) > ++ emit_insn (gen_stack_tie (stack_pointer_rtx, > hard_frame_pointer_rtx)); > + } > + > + /* Return TRUE if we can use a simple_return insn. > +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c > b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c > +new file mode 100644 > +index 00000000000..e71d820e365 > +--- /dev/null > ++++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c > +@@ -0,0 +1,95 @@ > ++/* { dg-options " -O -fstack-protector-strong > -mstack-protector-guard=sysreg -mstack-protector-guard-reg=tpidr2_el0 > -mstack-protector-guard-offset=16" } */ > ++/* { dg-final { check-function-bodies "**" "" } } */ > ++ > ++void g(void *); > ++__SVBool_t *h(void *); > ++ > ++/* > ++** test1: > ++** sub sp, sp, #288 > ++** stp x29, x30, \[sp, #?272\] > ++** add x29, sp, #?272 > ++** mrs (x[0-9]+), tpidr2_el0 > ++** ldr (x[0-9]+), \[\1, #?16\] > ++** str \2, \[sp, #?264\] > ++** mov \2, #?0 > ++** add x0, sp, #?8 > ++** bl g > ++** ... > ++** mrs .* > ++** ... > ++** bne .* > ++** ... > ++** ldp x29, x30, \[sp, #?272\] > ++** add sp, sp, #?288 > ++** ret > ++** bl __stack_chk_fail > ++*/ > ++int test1() { > ++ int y[0x40]; > ++ g(y); > ++ return 1; > ++} > ++ > ++/* > ++** test2: > ++** stp x29, x30, \[sp, #?-16\]! > ++** mov x29, sp > ++** sub sp, sp, #1040 > ++** mrs (x[0-9]+), tpidr2_el0 > ++** ldr (x[0-9]+), \[\1, #?16\] > ++** str \2, \[sp, #?1032\] > ++** mov \2, #?0 > ++** add x0, sp, #?8 > ++** bl g > ++** ... > ++** mrs .* > ++** ... > ++** bne .* > ++** ... 
> ++** add sp, sp, #?1040 > ++** ldp x29, x30, \[sp\], #?16 > ++** ret > ++** bl __stack_chk_fail > ++*/ > ++int test2() { > ++ int y[0x100]; > ++ g(y); > ++ return 1; > ++} > ++ > ++#pragma GCC target "+sve" > ++ > ++/* > ++** test3: > ++** stp x29, x30, \[sp, #?-16\]! > ++** mov x29, sp > ++** addvl sp, sp, #-18 > ++** ... > ++** str p4, \[sp\] > ++** ... > ++** sub sp, sp, #272 > ++** mrs (x[0-9]+), tpidr2_el0 > ++** ldr (x[0-9]+), \[\1, #?16\] > ++** str \2, \[sp, #?264\] > ++** mov \2, #?0 > ++** add x0, sp, #?8 > ++** bl h > ++** ... > ++** mrs .* > ++** ... > ++** bne .* > ++** ... > ++** add sp, sp, #?272 > ++** ... > ++** ldr p4, \[sp\] > ++** ... > ++** addvl sp, sp, #18 > ++** ldp x29, x30, \[sp\], #?16 > ++** ret > ++** bl __stack_chk_fail > ++*/ > ++__SVBool_t test3() { > ++ int y[0x40]; > ++ return *h(y); > ++} > +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c > b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c > +new file mode 100644 > +index 00000000000..58f322aa480 > +--- /dev/null > ++++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c > +@@ -0,0 +1,33 @@ > ++/* { dg-options "-O2 -mcpu=neoverse-v1 -fstack-protector-all" } */ > ++/* { dg-final { check-function-bodies "**" "" } } */ > ++ > ++/* > ++** main: > ++** ... > ++** stp x29, x30, \[sp, #?-[0-9]+\]! > ++** ... > ++** sub sp, sp, #[0-9]+ > ++** ... > ++** str x[0-9]+, \[x29, #?-8\] > ++** ... > ++*/ > ++int f(const char *); > ++void g(void *); > ++int main(int argc, char* argv[]) > ++{ > ++ int a; > ++ int b; > ++ char c[2+f(argv[1])]; > ++ int d[0x100]; > ++ char y; > ++ > ++ y=42; a=4; b=10; > ++ c[0] = 'h'; c[1] = '\0'; > ++ > ++ c[f(argv[2])] = '\0'; > ++ > ++ __builtin_printf("%d %d\n%s\n", a, b, c); > ++ g(d); > ++ > ++ return 0; > ++} > +-- > +2.34.1 > + > -- > 2.34.1 > > > -=-=-=-=-=-=-=-=-=-=-=- > Links: You receive all messages sent to this group. 
> View/Reply Online (#187543): > https://lists.openembedded.org/g/openembedded-core/message/187543 > Mute This Topic: https://lists.openembedded.org/mt/101319990/3617156 > Group Owner: openembedded-core+owner@lists.openembedded.org > Unsubscribe: https://lists.openembedded.org/g/openembedded-core/unsub [ > martin.jansa@gmail.com] > -=-=-=-=-=-=-=-=-=-=-=- > >
On Thu, Sep 14, 2023 at 11:07 AM Martin Jansa <martin.jansa@gmail.com> wrote: > FYI: one of LGE proprietary components triggers ICE with this applied, > I'll try to find minimal reproducer later, this is just for other people > who might hit the same: > > error: unrecognizable insn: > 2923 | } > | ^ > (insn 416 286 290 17 (parallel [ > (set (mem/c:SI (plus:DI (reg/f:DI 29 x29) > (const_int -260 [0xfffffffffffffefc])) [1 > redacted.pixel_format+0 S4 A32]) > (const_int 0 [0])) > (set (mem/c:SI (plus:DI (reg/f:DI 29 x29) > (const_int -256 [0xffffffffffffff00])) [1 > redacted.pixel_value+0 S4 A128]) > (reg/v:SI 22 x22 [orig:141 color ] [141])) > ]) > "TOPDIR/BUILD/work/redacted/redacted/redacted/git/redacted.c":2903:45 -1 > (expr_list:REG_DEAD (reg/v:SI 22 x22 [orig:141 color ] [141]) > (nil))) > during RTL pass: cprop_hardreg > TOPDIR/BUILD/work/redacted/redacted/redacted/git/redacted.c:2923:1: > internal compiler error: in extract_insn, at recog.cc:2791 > 0x191624a internal_error(char const*, ...) > ???:0 > 0x6bee26 fancy_abort(char const*, int, char const*) > ???:0 > 0x697469 _fatal_insn(char const*, rtx_def const*, char const*, int, char > const*) > ???:0 > 0x697485 _fatal_insn_not_found(rtx_def const*, char const*, int, char > const*) > ???:0 > 0xbef198 extract_constrain_insn(rtx_insn*) > ???:0 > And the same code fails like this only with gcc-12.3 in mickledore and gcc-13.2 in nanbield. kirkstone with gcc-11.4 and your patch (as it is in kirkstone-nut) builds the same code fine.
On 14 Sep 2023, at 10:07, Martin Jansa via lists.openembedded.org <Martin.Jansa=gmail.com@lists.openembedded.org> wrote: > > FYI: one of LGE proprietary components triggers ICE with this applied, I'll try to find minimal reproducer later, this is just for other people who might hit the same: That’s… upsetting. I’ve forwarded this to our toolchain team. If you can whittle down a reproducer that would be _much_ appreciated, but I’ll see if they have any ideas about where the issue might be. Ross
diff --git a/meta/recipes-devtools/gcc/gcc-12.3.inc b/meta/recipes-devtools/gcc/gcc-12.3.inc index 4ec03f925c8..5896f26e1af 100644 --- a/meta/recipes-devtools/gcc/gcc-12.3.inc +++ b/meta/recipes-devtools/gcc/gcc-12.3.inc @@ -63,6 +63,7 @@ SRC_URI = "${BASEURI} \ file://0026-rust-recursion-limit.patch \ file://prefix-map-realpath.patch \ file://hardcoded-paths.patch \ + file://CVE-2023-4039.patch \ " SRC_URI[sha256sum] = "949a5d4f99e786421a93b532b22ffab5578de7321369975b91aec97adfda8c3b" diff --git a/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch b/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch new file mode 100644 index 00000000000..8cb52849cd3 --- /dev/null +++ b/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch @@ -0,0 +1,3093 @@ +From: Richard Sandiford <richard.sandiford@arm.com> +Subject: [PATCH 00/19] aarch64: Fix -fstack-protector issue +Date: Tue, 12 Sep 2023 16:25:10 +0100 + +This series of patches fixes deficiencies in GCC's -fstack-protector +implementation for AArch64 when using dynamically allocated stack space. +This is CVE-2023-4039. See: + +https://developer.arm.com/Arm%20Security%20Center/GCC%20Stack%20Protector%20Vulnerability%20AArch64 +https://github.com/metaredteam/external-disclosures/security/advisories/GHSA-x7ch-h5rf-w2mf + +for more details. + +The fix is to put the saved registers above the locals area when +-fstack-protector is used. + +The series also fixes a stack-clash problem that I found while working +on the CVE. In unpatched sources, the stack-clash problem would only +trigger for unrealistic numbers of arguments (8K 64-bit arguments, or an +equivalent). But it would be a more significant issue with the new +-fstack-protector frame layout. It's therefore important that both +problems are fixed together. + +Some reorganisation of the code seemed necessary to fix the problems in a +cleanish way. The series is therefore quite long, but only a handful of +patches should have any effect on code generation. 
+ +See the individual patches for a detailed description. + +Tested on aarch64-linux-gnu. Pushed to trunk and to all active branches. +I've also pushed backports to GCC 7+ to vendors/ARM/heads/CVE-2023-4039. + +CVE: CVE-2023-4039 +Upstream-Status: Backport +Signed-off-by: Ross Burton <ross.burton@arm.com> + + +From 62fbb215cc817e9f2c1ca80282a64f4ee30806bc Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:48 +0100 +Subject: [PATCH 01/19] aarch64: Use local frame vars in shrink-wrapping code + +aarch64_layout_frame uses a shorthand for referring to +cfun->machine->frame: + + aarch64_frame &frame = cfun->machine->frame; + +This patch does the same for some other heavy users of the structure. +No functional change intended. + +gcc/ + * config/aarch64/aarch64.cc (aarch64_save_callee_saves): Use + a local shorthand for cfun->machine->frame. + (aarch64_restore_callee_saves, aarch64_get_separate_components): + (aarch64_process_components): Likewise. + (aarch64_allocate_and_probe_stack_space): Likewise. + (aarch64_expand_prologue, aarch64_expand_epilogue): Likewise. + (aarch64_layout_frame): Use existing shorthand for one more case. 
+--- + gcc/config/aarch64/aarch64.cc | 123 ++++++++++++++++++---------------- + 1 file changed, 64 insertions(+), 59 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 226dc9dffd4..ae42ffdedbe 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8351,7 +8351,7 @@ aarch64_layout_frame (void) + frame.is_scs_enabled + = (!crtl->calls_eh_return + && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK) +- && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)); ++ && known_ge (frame.reg_offset[LR_REGNUM], 0)); + + /* When shadow call stack is enabled, the scs_pop in the epilogue will + restore x30, and we don't need to pop x30 again in the traditional +@@ -8763,6 +8763,7 @@ aarch64_save_callee_saves (poly_int64 start_offset, + unsigned start, unsigned limit, bool skip_wb, + bool hard_fp_valid_p) + { ++ aarch64_frame &frame = cfun->machine->frame; + rtx_insn *insn; + unsigned regno; + unsigned regno2; +@@ -8777,8 +8778,8 @@ aarch64_save_callee_saves (poly_int64 start_offset, + bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno); + + if (skip_wb +- && (regno == cfun->machine->frame.wb_push_candidate1 +- || regno == cfun->machine->frame.wb_push_candidate2)) ++ && (regno == frame.wb_push_candidate1 ++ || regno == frame.wb_push_candidate2)) + continue; + + if (cfun->machine->reg_is_wrapped_separately[regno]) +@@ -8786,7 +8787,7 @@ aarch64_save_callee_saves (poly_int64 start_offset, + + machine_mode mode = aarch64_reg_save_mode (regno); + reg = gen_rtx_REG (mode, regno); +- offset = start_offset + cfun->machine->frame.reg_offset[regno]; ++ offset = start_offset + frame.reg_offset[regno]; + rtx base_rtx = stack_pointer_rtx; + poly_int64 sp_offset = offset; + +@@ -8799,7 +8800,7 @@ aarch64_save_callee_saves (poly_int64 start_offset, + { + gcc_assert (known_eq (start_offset, 0)); + poly_int64 fp_offset +- = cfun->machine->frame.below_hard_fp_saved_regs_size; ++ = 
frame.below_hard_fp_saved_regs_size; + if (hard_fp_valid_p) + base_rtx = hard_frame_pointer_rtx; + else +@@ -8821,8 +8822,7 @@ aarch64_save_callee_saves (poly_int64 start_offset, + && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit + && !cfun->machine->reg_is_wrapped_separately[regno2] + && known_eq (GET_MODE_SIZE (mode), +- cfun->machine->frame.reg_offset[regno2] +- - cfun->machine->frame.reg_offset[regno])) ++ frame.reg_offset[regno2] - frame.reg_offset[regno])) + { + rtx reg2 = gen_rtx_REG (mode, regno2); + rtx mem2; +@@ -8872,6 +8872,7 @@ static void + aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, + unsigned limit, bool skip_wb, rtx *cfi_ops) + { ++ aarch64_frame &frame = cfun->machine->frame; + unsigned regno; + unsigned regno2; + poly_int64 offset; +@@ -8888,13 +8889,13 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, + rtx reg, mem; + + if (skip_wb +- && (regno == cfun->machine->frame.wb_pop_candidate1 +- || regno == cfun->machine->frame.wb_pop_candidate2)) ++ && (regno == frame.wb_pop_candidate1 ++ || regno == frame.wb_pop_candidate2)) + continue; + + machine_mode mode = aarch64_reg_save_mode (regno); + reg = gen_rtx_REG (mode, regno); +- offset = start_offset + cfun->machine->frame.reg_offset[regno]; ++ offset = start_offset + frame.reg_offset[regno]; + rtx base_rtx = stack_pointer_rtx; + if (mode == VNx2DImode && BYTES_BIG_ENDIAN) + aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg, +@@ -8905,8 +8906,7 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, + && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit + && !cfun->machine->reg_is_wrapped_separately[regno2] + && known_eq (GET_MODE_SIZE (mode), +- cfun->machine->frame.reg_offset[regno2] +- - cfun->machine->frame.reg_offset[regno])) ++ frame.reg_offset[regno2] - frame.reg_offset[regno])) + { + rtx reg2 = gen_rtx_REG (mode, regno2); + rtx mem2; +@@ -9011,6 +9011,7 @@ 
offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset) + static sbitmap + aarch64_get_separate_components (void) + { ++ aarch64_frame &frame = cfun->machine->frame; + sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1); + bitmap_clear (components); + +@@ -9027,18 +9028,18 @@ aarch64_get_separate_components (void) + if (mode == VNx2DImode && BYTES_BIG_ENDIAN) + continue; + +- poly_int64 offset = cfun->machine->frame.reg_offset[regno]; ++ poly_int64 offset = frame.reg_offset[regno]; + + /* If the register is saved in the first SVE save slot, we use + it as a stack probe for -fstack-clash-protection. */ + if (flag_stack_clash_protection +- && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0) ++ && maybe_ne (frame.below_hard_fp_saved_regs_size, 0) + && known_eq (offset, 0)) + continue; + + /* Get the offset relative to the register we'll use. */ + if (frame_pointer_needed) +- offset -= cfun->machine->frame.below_hard_fp_saved_regs_size; ++ offset -= frame.below_hard_fp_saved_regs_size; + else + offset += crtl->outgoing_args_size; + +@@ -9057,11 +9058,11 @@ aarch64_get_separate_components (void) + /* If the spare predicate register used by big-endian SVE code + is call-preserved, it must be saved in the main prologue + before any saves that use it. */ +- if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM) +- bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg); ++ if (frame.spare_pred_reg != INVALID_REGNUM) ++ bitmap_clear_bit (components, frame.spare_pred_reg); + +- unsigned reg1 = cfun->machine->frame.wb_push_candidate1; +- unsigned reg2 = cfun->machine->frame.wb_push_candidate2; ++ unsigned reg1 = frame.wb_push_candidate1; ++ unsigned reg2 = frame.wb_push_candidate2; + /* If registers have been chosen to be stored/restored with + writeback don't interfere with them to avoid having to output explicit + stack adjustment instructions. 
*/ +@@ -9170,6 +9171,7 @@ aarch64_get_next_set_bit (sbitmap bmp, unsigned int start) + static void + aarch64_process_components (sbitmap components, bool prologue_p) + { ++ aarch64_frame &frame = cfun->machine->frame; + rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed + ? HARD_FRAME_POINTER_REGNUM + : STACK_POINTER_REGNUM); +@@ -9184,9 +9186,9 @@ aarch64_process_components (sbitmap components, bool prologue_p) + machine_mode mode = aarch64_reg_save_mode (regno); + + rtx reg = gen_rtx_REG (mode, regno); +- poly_int64 offset = cfun->machine->frame.reg_offset[regno]; ++ poly_int64 offset = frame.reg_offset[regno]; + if (frame_pointer_needed) +- offset -= cfun->machine->frame.below_hard_fp_saved_regs_size; ++ offset -= frame.below_hard_fp_saved_regs_size; + else + offset += crtl->outgoing_args_size; + +@@ -9211,14 +9213,14 @@ aarch64_process_components (sbitmap components, bool prologue_p) + break; + } + +- poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2]; ++ poly_int64 offset2 = frame.reg_offset[regno2]; + /* The next register is not of the same class or its offset is not + mergeable with the current one into a pair. */ + if (aarch64_sve_mode_p (mode) + || !satisfies_constraint_Ump (mem) + || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2) + || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno)) +- || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]), ++ || maybe_ne ((offset2 - frame.reg_offset[regno]), + GET_MODE_SIZE (mode))) + { + insn = emit_insn (set); +@@ -9240,7 +9242,7 @@ aarch64_process_components (sbitmap components, bool prologue_p) + /* REGNO2 can be saved/restored in a pair with REGNO. 
*/ + rtx reg2 = gen_rtx_REG (mode, regno2); + if (frame_pointer_needed) +- offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size; ++ offset2 -= frame.below_hard_fp_saved_regs_size; + else + offset2 += crtl->outgoing_args_size; + rtx addr2 = plus_constant (Pmode, ptr_reg, offset2); +@@ -9335,6 +9337,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, + bool frame_related_p, + bool final_adjustment_p) + { ++ aarch64_frame &frame = cfun->machine->frame; + HOST_WIDE_INT guard_size + = 1 << param_stack_clash_protection_guard_size; + HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD; +@@ -9355,25 +9358,25 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, + register as a probe. We can't assume that LR was saved at position 0 + though, so treat any space below it as unprobed. */ + if (final_adjustment_p +- && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)) ++ && known_eq (frame.below_hard_fp_saved_regs_size, 0)) + { +- poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM]; ++ poly_int64 lr_offset = frame.reg_offset[LR_REGNUM]; + if (known_ge (lr_offset, 0)) + min_probe_threshold -= lr_offset.to_constant (); + else + gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0)); + } + +- poly_int64 frame_size = cfun->machine->frame.frame_size; ++ poly_int64 frame_size = frame.frame_size; + + /* We should always have a positive probe threshold. 
*/ + gcc_assert (min_probe_threshold > 0); + + if (flag_stack_clash_protection && !final_adjustment_p) + { +- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust; +- poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; +- poly_int64 final_adjust = cfun->machine->frame.final_adjust; ++ poly_int64 initial_adjust = frame.initial_adjust; ++ poly_int64 sve_callee_adjust = frame.sve_callee_adjust; ++ poly_int64 final_adjust = frame.final_adjust; + + if (known_eq (frame_size, 0)) + { +@@ -9662,17 +9665,18 @@ aarch64_epilogue_uses (int regno) + void + aarch64_expand_prologue (void) + { +- poly_int64 frame_size = cfun->machine->frame.frame_size; +- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust; +- HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust; +- poly_int64 final_adjust = cfun->machine->frame.final_adjust; +- poly_int64 callee_offset = cfun->machine->frame.callee_offset; +- poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; ++ aarch64_frame &frame = cfun->machine->frame; ++ poly_int64 frame_size = frame.frame_size; ++ poly_int64 initial_adjust = frame.initial_adjust; ++ HOST_WIDE_INT callee_adjust = frame.callee_adjust; ++ poly_int64 final_adjust = frame.final_adjust; ++ poly_int64 callee_offset = frame.callee_offset; ++ poly_int64 sve_callee_adjust = frame.sve_callee_adjust; + poly_int64 below_hard_fp_saved_regs_size +- = cfun->machine->frame.below_hard_fp_saved_regs_size; +- unsigned reg1 = cfun->machine->frame.wb_push_candidate1; +- unsigned reg2 = cfun->machine->frame.wb_push_candidate2; +- bool emit_frame_chain = cfun->machine->frame.emit_frame_chain; ++ = frame.below_hard_fp_saved_regs_size; ++ unsigned reg1 = frame.wb_push_candidate1; ++ unsigned reg2 = frame.wb_push_candidate2; ++ bool emit_frame_chain = frame.emit_frame_chain; + rtx_insn *insn; + + if (flag_stack_clash_protection && known_eq (callee_adjust, 0)) +@@ -9703,7 +9707,7 @@ aarch64_expand_prologue (void) + } + + /* Push 
return address to shadow call stack. */ +- if (cfun->machine->frame.is_scs_enabled) ++ if (frame.is_scs_enabled) + emit_insn (gen_scs_push ()); + + if (flag_stack_usage_info) +@@ -9742,7 +9746,7 @@ aarch64_expand_prologue (void) + + /* The offset of the frame chain record (if any) from the current SP. */ + poly_int64 chain_offset = (initial_adjust + callee_adjust +- - cfun->machine->frame.hard_fp_offset); ++ - frame.hard_fp_offset); + gcc_assert (known_ge (chain_offset, 0)); + + /* The offset of the bottom of the save area from the current SP. */ +@@ -9845,16 +9849,17 @@ aarch64_use_return_insn_p (void) + void + aarch64_expand_epilogue (bool for_sibcall) + { +- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust; +- HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust; +- poly_int64 final_adjust = cfun->machine->frame.final_adjust; +- poly_int64 callee_offset = cfun->machine->frame.callee_offset; +- poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; ++ aarch64_frame &frame = cfun->machine->frame; ++ poly_int64 initial_adjust = frame.initial_adjust; ++ HOST_WIDE_INT callee_adjust = frame.callee_adjust; ++ poly_int64 final_adjust = frame.final_adjust; ++ poly_int64 callee_offset = frame.callee_offset; ++ poly_int64 sve_callee_adjust = frame.sve_callee_adjust; + poly_int64 below_hard_fp_saved_regs_size +- = cfun->machine->frame.below_hard_fp_saved_regs_size; +- unsigned reg1 = cfun->machine->frame.wb_pop_candidate1; +- unsigned reg2 = cfun->machine->frame.wb_pop_candidate2; +- unsigned int last_gpr = (cfun->machine->frame.is_scs_enabled ++ = frame.below_hard_fp_saved_regs_size; ++ unsigned reg1 = frame.wb_pop_candidate1; ++ unsigned reg2 = frame.wb_pop_candidate2; ++ unsigned int last_gpr = (frame.is_scs_enabled + ? R29_REGNUM : R30_REGNUM); + rtx cfi_ops = NULL; + rtx_insn *insn; +@@ -9888,7 +9893,7 @@ aarch64_expand_epilogue (bool for_sibcall) + /* We need to add memory barrier to prevent read from deallocated stack. 
*/ + bool need_barrier_p + = maybe_ne (get_frame_size () +- + cfun->machine->frame.saved_varargs_size, 0); ++ + frame.saved_varargs_size, 0); + + /* Emit a barrier to prevent loads from a deallocated stack. */ + if (maybe_gt (final_adjust, crtl->outgoing_args_size) +@@ -9969,7 +9974,7 @@ aarch64_expand_epilogue (bool for_sibcall) + } + + /* Pop return address from shadow call stack. */ +- if (cfun->machine->frame.is_scs_enabled) ++ if (frame.is_scs_enabled) + { + machine_mode mode = aarch64_reg_save_mode (R30_REGNUM); + rtx reg = gen_rtx_REG (mode, R30_REGNUM); +@@ -12564,24 +12569,24 @@ aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to) + poly_int64 + aarch64_initial_elimination_offset (unsigned from, unsigned to) + { ++ aarch64_frame &frame = cfun->machine->frame; ++ + if (to == HARD_FRAME_POINTER_REGNUM) + { + if (from == ARG_POINTER_REGNUM) +- return cfun->machine->frame.hard_fp_offset; ++ return frame.hard_fp_offset; + + if (from == FRAME_POINTER_REGNUM) +- return cfun->machine->frame.hard_fp_offset +- - cfun->machine->frame.locals_offset; ++ return frame.hard_fp_offset - frame.locals_offset; + } + + if (to == STACK_POINTER_REGNUM) + { + if (from == FRAME_POINTER_REGNUM) +- return cfun->machine->frame.frame_size +- - cfun->machine->frame.locals_offset; ++ return frame.frame_size - frame.locals_offset; + } + +- return cfun->machine->frame.frame_size; ++ return frame.frame_size; + } + + +-- +2.34.1 + + +From 12a8889de169f892d2e927584c00d20b8b7e456f Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:49 +0100 +Subject: [PATCH 02/19] aarch64: Avoid a use of callee_offset + +When we emit the frame chain, i.e. when we reach Here in this statement +of aarch64_expand_prologue: + + if (emit_frame_chain) + { + // Here + ... + } + +the stack is in one of two states: + +- We've allocated up to the frame chain, but no more. 
+ +- We've allocated the whole frame, and the frame chain is within easy + reach of the new SP. + +The offset of the frame chain from the current SP is available +in aarch64_frame as callee_offset. It is also available as the +chain_offset local variable, where the latter is calculated from other +data. (However, chain_offset is not always equal to callee_offset when +!emit_frame_chain, so chain_offset isn't redundant.) + +In c600df9a4060da3c6121ff4d0b93f179eafd69d1 I switched to using +chain_offset for the initialisation of the hard frame pointer: + + aarch64_add_offset (Pmode, hard_frame_pointer_rtx, +- stack_pointer_rtx, callee_offset, ++ stack_pointer_rtx, chain_offset, + tmp1_rtx, tmp0_rtx, frame_pointer_needed); + +But the later REG_CFA_ADJUST_CFA handling still used callee_offset. + +I think the difference is harmless, but it's more logical for the +CFA note to be in sync, and it's more convenient for later patches +if it uses chain_offset. + +gcc/ + * config/aarch64/aarch64.cc (aarch64_expand_prologue): Use + chain_offset rather than callee_offset. +--- + gcc/config/aarch64/aarch64.cc | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index ae42ffdedbe..79253322fd7 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -9670,7 +9670,6 @@ aarch64_expand_prologue (void) + poly_int64 initial_adjust = frame.initial_adjust; + HOST_WIDE_INT callee_adjust = frame.callee_adjust; + poly_int64 final_adjust = frame.final_adjust; +- poly_int64 callee_offset = frame.callee_offset; + poly_int64 sve_callee_adjust = frame.sve_callee_adjust; + poly_int64 below_hard_fp_saved_regs_size + = frame.below_hard_fp_saved_regs_size; +@@ -9783,8 +9782,7 @@ aarch64_expand_prologue (void) + implicit. 
*/ + if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX)) + { +- rtx src = plus_constant (Pmode, stack_pointer_rtx, +- callee_offset); ++ rtx src = plus_constant (Pmode, stack_pointer_rtx, chain_offset); + add_reg_note (insn, REG_CFA_ADJUST_CFA, + gen_rtx_SET (hard_frame_pointer_rtx, src)); + } +-- +2.34.1 + + +From 03d5e89e7f3be53fd7142556e8e0a2774c653dca Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:49 +0100 +Subject: [PATCH 03/19] aarch64: Explicitly handle frames with no saved + registers + +If a frame has no saved registers, it can be allocated in one go. +There is no need to treat the areas below and above the saved +registers as separate. + +And if we allocate the frame in one go, it should be allocated +as the initial_adjust rather than the final_adjust. This allows the +frame size to grow to guard_size - guard_used_by_caller before a stack +probe is needed. (A frame with no register saves is necessarily a +leaf frame.) + +This is a no-op as things stand, since a leaf function will have +no outgoing arguments, and so all the frame will be above where +the saved registers normally go. + +gcc/ + * config/aarch64/aarch64.cc (aarch64_layout_frame): Explicitly + allocate the frame in one go if there are no saved registers.
+--- + gcc/config/aarch64/aarch64.cc | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 79253322fd7..e1f21230c15 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8378,9 +8378,11 @@ aarch64_layout_frame (void) + + HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset; + HOST_WIDE_INT const_saved_regs_size; +- if (frame.frame_size.is_constant (&const_size) +- && const_size < max_push_offset +- && known_eq (frame.hard_fp_offset, const_size)) ++ if (known_eq (frame.saved_regs_size, 0)) ++ frame.initial_adjust = frame.frame_size; ++ else if (frame.frame_size.is_constant (&const_size) ++ && const_size < max_push_offset ++ && known_eq (frame.hard_fp_offset, const_size)) + { + /* Simple, small frame with no outgoing arguments: + +-- +2.34.1 + + +From 49c2eb7616756c323b7f6b18d8616ec945eb1263 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:49 +0100 +Subject: [PATCH 04/19] aarch64: Add bytes_below_saved_regs to frame info + +The frame layout code currently hard-codes the assumption that +the number of bytes below the saved registers is equal to the +size of the outgoing arguments. This patch abstracts that +value into a new field of aarch64_frame. + +gcc/ + * config/aarch64/aarch64.h (aarch64_frame::bytes_below_saved_regs): New + field. + * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize it, + and use it instead of crtl->outgoing_args_size. + (aarch64_get_separate_components): Use bytes_below_saved_regs instead + of outgoing_args_size. + (aarch64_process_components): Likewise. 
+--- + gcc/config/aarch64/aarch64.cc | 71 ++++++++++++++++++----------------- + gcc/config/aarch64/aarch64.h | 5 +++ + 2 files changed, 41 insertions(+), 35 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index e1f21230c15..94e1b686584 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8217,6 +8217,8 @@ aarch64_layout_frame (void) + gcc_assert (crtl->is_leaf + || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED)); + ++ frame.bytes_below_saved_regs = crtl->outgoing_args_size; ++ + /* Now assign stack slots for the registers. Start with the predicate + registers, since predicate LDR and STR have a relatively small + offset range. These saves happen below the hard frame pointer. */ +@@ -8321,18 +8323,18 @@ aarch64_layout_frame (void) + + poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size; + +- poly_int64 above_outgoing_args ++ poly_int64 saved_regs_and_above + = aligned_upper_bound (varargs_and_saved_regs_size + + get_frame_size (), + STACK_BOUNDARY / BITS_PER_UNIT); + + frame.hard_fp_offset +- = above_outgoing_args - frame.below_hard_fp_saved_regs_size; ++ = saved_regs_and_above - frame.below_hard_fp_saved_regs_size; + + /* Both these values are already aligned. 
*/ +- gcc_assert (multiple_p (crtl->outgoing_args_size, ++ gcc_assert (multiple_p (frame.bytes_below_saved_regs, + STACK_BOUNDARY / BITS_PER_UNIT)); +- frame.frame_size = above_outgoing_args + crtl->outgoing_args_size; ++ frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs; + + frame.locals_offset = frame.saved_varargs_size; + +@@ -8376,7 +8378,7 @@ aarch64_layout_frame (void) + else if (frame.wb_pop_candidate1 != INVALID_REGNUM) + max_push_offset = 256; + +- HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset; ++ HOST_WIDE_INT const_size, const_below_saved_regs, const_fp_offset; + HOST_WIDE_INT const_saved_regs_size; + if (known_eq (frame.saved_regs_size, 0)) + frame.initial_adjust = frame.frame_size; +@@ -8384,31 +8386,31 @@ aarch64_layout_frame (void) + && const_size < max_push_offset + && known_eq (frame.hard_fp_offset, const_size)) + { +- /* Simple, small frame with no outgoing arguments: ++ /* Simple, small frame with no data below the saved registers. + + stp reg1, reg2, [sp, -frame_size]! + stp reg3, reg4, [sp, 16] */ + frame.callee_adjust = const_size; + } +- else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size) ++ else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs) + && frame.saved_regs_size.is_constant (&const_saved_regs_size) +- && const_outgoing_args_size + const_saved_regs_size < 512 +- /* We could handle this case even with outgoing args, provided +- that the number of args left us with valid offsets for all +- predicate and vector save slots. It's such a rare case that +- it hardly seems worth the effort though. */ +- && (!saves_below_hard_fp_p || const_outgoing_args_size == 0) ++ && const_below_saved_regs + const_saved_regs_size < 512 ++ /* We could handle this case even with data below the saved ++ registers, provided that that data left us with valid offsets ++ for all predicate and vector save slots. 
It's such a rare ++ case that it hardly seems worth the effort though. */ ++ && (!saves_below_hard_fp_p || const_below_saved_regs == 0) + && !(cfun->calls_alloca + && frame.hard_fp_offset.is_constant (&const_fp_offset) + && const_fp_offset < max_push_offset)) + { +- /* Frame with small outgoing arguments: ++ /* Frame with small area below the saved registers: + + sub sp, sp, frame_size +- stp reg1, reg2, [sp, outgoing_args_size] +- stp reg3, reg4, [sp, outgoing_args_size + 16] */ ++ stp reg1, reg2, [sp, bytes_below_saved_regs] ++ stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */ + frame.initial_adjust = frame.frame_size; +- frame.callee_offset = const_outgoing_args_size; ++ frame.callee_offset = const_below_saved_regs; + } + else if (saves_below_hard_fp_p + && known_eq (frame.saved_regs_size, +@@ -8418,30 +8420,29 @@ aarch64_layout_frame (void) + + sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size + save SVE registers relative to SP +- sub sp, sp, outgoing_args_size */ ++ sub sp, sp, bytes_below_saved_regs */ + frame.initial_adjust = (frame.hard_fp_offset + + frame.below_hard_fp_saved_regs_size); +- frame.final_adjust = crtl->outgoing_args_size; ++ frame.final_adjust = frame.bytes_below_saved_regs; + } + else if (frame.hard_fp_offset.is_constant (&const_fp_offset) + && const_fp_offset < max_push_offset) + { +- /* Frame with large outgoing arguments or SVE saves, but with +- a small local area: ++ /* Frame with large area below the saved registers, or with SVE saves, ++ but with a small area above: + + stp reg1, reg2, [sp, -hard_fp_offset]! 
+ stp reg3, reg4, [sp, 16] + [sub sp, sp, below_hard_fp_saved_regs_size] + [save SVE registers relative to SP] +- sub sp, sp, outgoing_args_size */ ++ sub sp, sp, bytes_below_saved_regs */ + frame.callee_adjust = const_fp_offset; + frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; +- frame.final_adjust = crtl->outgoing_args_size; ++ frame.final_adjust = frame.bytes_below_saved_regs; + } + else + { +- /* Frame with large local area and outgoing arguments or SVE saves, +- using frame pointer: ++ /* General case: + + sub sp, sp, hard_fp_offset + stp x29, x30, [sp, 0] +@@ -8449,10 +8450,10 @@ aarch64_layout_frame (void) + stp reg3, reg4, [sp, 16] + [sub sp, sp, below_hard_fp_saved_regs_size] + [save SVE registers relative to SP] +- sub sp, sp, outgoing_args_size */ ++ sub sp, sp, bytes_below_saved_regs */ + frame.initial_adjust = frame.hard_fp_offset; + frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; +- frame.final_adjust = crtl->outgoing_args_size; ++ frame.final_adjust = frame.bytes_below_saved_regs; + } + + /* Make sure the individual adjustments add up to the full frame size. */ +@@ -9043,7 +9044,7 @@ aarch64_get_separate_components (void) + if (frame_pointer_needed) + offset -= frame.below_hard_fp_saved_regs_size; + else +- offset += crtl->outgoing_args_size; ++ offset += frame.bytes_below_saved_regs; + + /* Check that we can access the stack slot of the register with one + direct load with no adjustments needed. 
*/ +@@ -9192,7 +9193,7 @@ aarch64_process_components (sbitmap components, bool prologue_p) + if (frame_pointer_needed) + offset -= frame.below_hard_fp_saved_regs_size; + else +- offset += crtl->outgoing_args_size; ++ offset += frame.bytes_below_saved_regs; + + rtx addr = plus_constant (Pmode, ptr_reg, offset); + rtx mem = gen_frame_mem (mode, addr); +@@ -9246,7 +9247,7 @@ aarch64_process_components (sbitmap components, bool prologue_p) + if (frame_pointer_needed) + offset2 -= frame.below_hard_fp_saved_regs_size; + else +- offset2 += crtl->outgoing_args_size; ++ offset2 += frame.bytes_below_saved_regs; + rtx addr2 = plus_constant (Pmode, ptr_reg, offset2); + rtx mem2 = gen_frame_mem (mode, addr2); + rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2) +@@ -9320,10 +9321,10 @@ aarch64_stack_clash_protection_alloca_probe_range (void) + registers. If POLY_SIZE is not large enough to require a probe this function + will only adjust the stack. When allocating the stack space + FRAME_RELATED_P is then used to indicate if the allocation is frame related. +- FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing +- arguments. If we are then we ensure that any allocation larger than the ABI +- defined buffer needs a probe so that the invariant of having a 1KB buffer is +- maintained. ++ FINAL_ADJUSTMENT_P indicates whether we are allocating the area below ++ the saved registers. If we are then we ensure that any allocation ++ larger than the ABI defined buffer needs a probe so that the ++ invariant of having a 1KB buffer is maintained. + + We emit barriers after each stack adjustment to prevent optimizations from + breaking the invariant that we never drop the stack more than a page. This +@@ -9532,7 +9533,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, + /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to + be probed. This maintains the requirement that each page is probed at + least once. 
For initial probing we probe only if the allocation is +- more than GUARD_SIZE - buffer, and for the outgoing arguments we probe ++ more than GUARD_SIZE - buffer, and below the saved registers we probe + if the amount is larger than buffer. GUARD_SIZE - buffer + buffer == + GUARD_SIZE. This works that for any allocation that is large enough to + trigger a probe here, we'll have at least one, and if they're not large +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h +index 6834c3e9922..1e105e12db8 100644 +--- a/gcc/config/aarch64/aarch64.h ++++ b/gcc/config/aarch64/aarch64.h +@@ -871,6 +871,11 @@ struct GTY (()) aarch64_frame + /* The size of the callee-save registers with a slot in REG_OFFSET. */ + poly_int64 saved_regs_size; + ++ /* The number of bytes between the bottom of the static frame (the bottom ++ of the outgoing arguments) and the bottom of the register save area. ++ This value is always a multiple of STACK_BOUNDARY. */ ++ poly_int64 bytes_below_saved_regs; ++ + /* The size of the callee-save registers with a slot in REG_OFFSET that + are saved below the hard frame pointer. */ + poly_int64 below_hard_fp_saved_regs_size; +-- +2.34.1 + + +From 34081079ea4de0c98331843f574b5f6f94d7b234 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:50 +0100 +Subject: [PATCH 05/19] aarch64: Add bytes_below_hard_fp to frame info + +Following on from the previous bytes_below_saved_regs patch, this one +records the number of bytes that are below the hard frame pointer. +This eventually replaces below_hard_fp_saved_regs_size. + +If a frame pointer is not needed, the epilogue adds final_adjust +to the stack pointer before restoring registers: + + aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true); + +Therefore, if the epilogue needs to restore the stack pointer from +the hard frame pointer, the directly corresponding offset is: + + -bytes_below_hard_fp + final_adjust + +i.e. 
go from the hard frame pointer to the bottom of the frame, +then add the same amount as if we were using the stack pointer +from the outset. + +gcc/ + * config/aarch64/aarch64.h (aarch64_frame::bytes_below_hard_fp): New + field. + * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize it. + (aarch64_expand_epilogue): Use it instead of + below_hard_fp_saved_regs_size. +--- + gcc/config/aarch64/aarch64.cc | 6 +++--- + gcc/config/aarch64/aarch64.h | 5 +++++ + 2 files changed, 8 insertions(+), 3 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 94e1b686584..c7d84245fbf 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8269,6 +8269,7 @@ aarch64_layout_frame (void) + of the callee save area. */ + bool saves_below_hard_fp_p = maybe_ne (offset, 0); + frame.below_hard_fp_saved_regs_size = offset; ++ frame.bytes_below_hard_fp = offset + frame.bytes_below_saved_regs; + if (frame.emit_frame_chain) + { + /* FP and LR are placed in the linkage record. */ +@@ -9856,8 +9857,7 @@ aarch64_expand_epilogue (bool for_sibcall) + poly_int64 final_adjust = frame.final_adjust; + poly_int64 callee_offset = frame.callee_offset; + poly_int64 sve_callee_adjust = frame.sve_callee_adjust; +- poly_int64 below_hard_fp_saved_regs_size +- = frame.below_hard_fp_saved_regs_size; ++ poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp; + unsigned reg1 = frame.wb_pop_candidate1; + unsigned reg2 = frame.wb_pop_candidate2; + unsigned int last_gpr = (frame.is_scs_enabled +@@ -9915,7 +9915,7 @@ aarch64_expand_epilogue (bool for_sibcall) + is restored on the instruction doing the writeback. 
*/ + aarch64_add_offset (Pmode, stack_pointer_rtx, + hard_frame_pointer_rtx, +- -callee_offset - below_hard_fp_saved_regs_size, ++ -bytes_below_hard_fp + final_adjust, + tmp1_rtx, tmp0_rtx, callee_adjust == 0); + else + /* The case where we need to re-use the register here is very rare, so +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h +index 1e105e12db8..de68ff7202f 100644 +--- a/gcc/config/aarch64/aarch64.h ++++ b/gcc/config/aarch64/aarch64.h +@@ -880,6 +880,11 @@ struct GTY (()) aarch64_frame + are saved below the hard frame pointer. */ + poly_int64 below_hard_fp_saved_regs_size; + ++ /* The number of bytes between the bottom of the static frame (the bottom ++ of the outgoing arguments) and the hard frame pointer. This value is ++ always a multiple of STACK_BOUNDARY. */ ++ poly_int64 bytes_below_hard_fp; ++ + /* Offset from the base of the frame (incomming SP) to the + top of the locals area. This value is always a multiple of + STACK_BOUNDARY. */ +-- +2.34.1 + + +From 187861af7c51db9eddc6f954b589c121b210fc74 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:50 +0100 +Subject: [PATCH 06/19] aarch64: Tweak aarch64_save/restore_callee_saves + +aarch64_save_callee_saves and aarch64_restore_callee_saves took +a parameter called start_offset that gives the offset of the +bottom of the saved register area from the current stack pointer. +However, it's more convenient for later patches if we use the +bottom of the entire frame as the reference point, rather than +the bottom of the saved registers. + +Doing that removes the need for the callee_offset field. +Other than that, this is not a win on its own. It only really +makes sense in combination with the follow-on patches. + +gcc/ + * config/aarch64/aarch64.h (aarch64_frame::callee_offset): Delete. + * config/aarch64/aarch64.cc (aarch64_layout_frame): Remove + callee_offset handling. 
+ (aarch64_save_callee_saves): Replace the start_offset parameter + with a bytes_below_sp parameter. + (aarch64_restore_callee_saves): Likewise. + (aarch64_expand_prologue): Update accordingly. + (aarch64_expand_epilogue): Likewise. +--- + gcc/config/aarch64/aarch64.cc | 56 +++++++++++++++++------------------ + gcc/config/aarch64/aarch64.h | 4 --- + 2 files changed, 28 insertions(+), 32 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index c7d84245fbf..e79551af41d 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8343,7 +8343,6 @@ aarch64_layout_frame (void) + frame.final_adjust = 0; + frame.callee_adjust = 0; + frame.sve_callee_adjust = 0; +- frame.callee_offset = 0; + + frame.wb_pop_candidate1 = frame.wb_push_candidate1; + frame.wb_pop_candidate2 = frame.wb_push_candidate2; +@@ -8411,7 +8410,6 @@ aarch64_layout_frame (void) + stp reg1, reg2, [sp, bytes_below_saved_regs] + stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */ + frame.initial_adjust = frame.frame_size; +- frame.callee_offset = const_below_saved_regs; + } + else if (saves_below_hard_fp_p + && known_eq (frame.saved_regs_size, +@@ -8758,12 +8756,13 @@ aarch64_add_cfa_expression (rtx_insn *insn, rtx reg, + } + + /* Emit code to save the callee-saved registers from register number START +- to LIMIT to the stack at the location starting at offset START_OFFSET, +- skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P +- is true if the hard frame pointer has been set up. */ ++ to LIMIT to the stack. The stack pointer is currently BYTES_BELOW_SP ++ bytes above the bottom of the static frame. Skip any write-back ++ candidates if SKIP_WB is true. HARD_FP_VALID_P is true if the hard ++ frame pointer has been set up. 
*/ + + static void +-aarch64_save_callee_saves (poly_int64 start_offset, ++aarch64_save_callee_saves (poly_int64 bytes_below_sp, + unsigned start, unsigned limit, bool skip_wb, + bool hard_fp_valid_p) + { +@@ -8791,7 +8790,9 @@ aarch64_save_callee_saves (poly_int64 start_offset, + + machine_mode mode = aarch64_reg_save_mode (regno); + reg = gen_rtx_REG (mode, regno); +- offset = start_offset + frame.reg_offset[regno]; ++ offset = (frame.reg_offset[regno] ++ + frame.bytes_below_saved_regs ++ - bytes_below_sp); + rtx base_rtx = stack_pointer_rtx; + poly_int64 sp_offset = offset; + +@@ -8802,9 +8803,7 @@ aarch64_save_callee_saves (poly_int64 start_offset, + else if (GP_REGNUM_P (regno) + && (!offset.is_constant (&const_offset) || const_offset >= 512)) + { +- gcc_assert (known_eq (start_offset, 0)); +- poly_int64 fp_offset +- = frame.below_hard_fp_saved_regs_size; ++ poly_int64 fp_offset = frame.bytes_below_hard_fp - bytes_below_sp; + if (hard_fp_valid_p) + base_rtx = hard_frame_pointer_rtx; + else +@@ -8868,12 +8867,13 @@ aarch64_save_callee_saves (poly_int64 start_offset, + } + + /* Emit code to restore the callee registers from register number START +- up to and including LIMIT. Restore from the stack offset START_OFFSET, +- skipping any write-back candidates if SKIP_WB is true. Write the +- appropriate REG_CFA_RESTORE notes into CFI_OPS. */ ++ up to and including LIMIT. The stack pointer is currently BYTES_BELOW_SP ++ bytes above the bottom of the static frame. Skip any write-back ++ candidates if SKIP_WB is true. Write the appropriate REG_CFA_RESTORE ++ notes into CFI_OPS. 
*/ + + static void +-aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, ++aarch64_restore_callee_saves (poly_int64 bytes_below_sp, unsigned start, + unsigned limit, bool skip_wb, rtx *cfi_ops) + { + aarch64_frame &frame = cfun->machine->frame; +@@ -8899,7 +8899,9 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, + + machine_mode mode = aarch64_reg_save_mode (regno); + reg = gen_rtx_REG (mode, regno); +- offset = start_offset + frame.reg_offset[regno]; ++ offset = (frame.reg_offset[regno] ++ + frame.bytes_below_saved_regs ++ - bytes_below_sp); + rtx base_rtx = stack_pointer_rtx; + if (mode == VNx2DImode && BYTES_BIG_ENDIAN) + aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg, +@@ -9675,8 +9677,6 @@ aarch64_expand_prologue (void) + HOST_WIDE_INT callee_adjust = frame.callee_adjust; + poly_int64 final_adjust = frame.final_adjust; + poly_int64 sve_callee_adjust = frame.sve_callee_adjust; +- poly_int64 below_hard_fp_saved_regs_size +- = frame.below_hard_fp_saved_regs_size; + unsigned reg1 = frame.wb_push_candidate1; + unsigned reg2 = frame.wb_push_candidate2; + bool emit_frame_chain = frame.emit_frame_chain; +@@ -9752,8 +9752,8 @@ aarch64_expand_prologue (void) + - frame.hard_fp_offset); + gcc_assert (known_ge (chain_offset, 0)); + +- /* The offset of the bottom of the save area from the current SP. */ +- poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size; ++ /* The offset of the current SP from the bottom of the static frame. 
*/ ++ poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust; + + if (emit_frame_chain) + { +@@ -9761,7 +9761,7 @@ aarch64_expand_prologue (void) + { + reg1 = R29_REGNUM; + reg2 = R30_REGNUM; +- aarch64_save_callee_saves (saved_regs_offset, reg1, reg2, ++ aarch64_save_callee_saves (bytes_below_sp, reg1, reg2, + false, false); + } + else +@@ -9801,7 +9801,7 @@ aarch64_expand_prologue (void) + emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx)); + } + +- aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM, ++ aarch64_save_callee_saves (bytes_below_sp, R0_REGNUM, R30_REGNUM, + callee_adjust != 0 || emit_frame_chain, + emit_frame_chain); + if (maybe_ne (sve_callee_adjust, 0)) +@@ -9811,16 +9811,17 @@ aarch64_expand_prologue (void) + aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, + sve_callee_adjust, + !frame_pointer_needed, false); +- saved_regs_offset += sve_callee_adjust; ++ bytes_below_sp -= sve_callee_adjust; + } +- aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM, ++ aarch64_save_callee_saves (bytes_below_sp, P0_REGNUM, P15_REGNUM, + false, emit_frame_chain); +- aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM, ++ aarch64_save_callee_saves (bytes_below_sp, V0_REGNUM, V31_REGNUM, + callee_adjust != 0 || emit_frame_chain, + emit_frame_chain); + + /* We may need to probe the final adjustment if it is larger than the guard + that is assumed by the called. 
*/ ++ gcc_assert (known_eq (bytes_below_sp, final_adjust)); + aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust, + !frame_pointer_needed, true); + } +@@ -9855,7 +9856,6 @@ aarch64_expand_epilogue (bool for_sibcall) + poly_int64 initial_adjust = frame.initial_adjust; + HOST_WIDE_INT callee_adjust = frame.callee_adjust; + poly_int64 final_adjust = frame.final_adjust; +- poly_int64 callee_offset = frame.callee_offset; + poly_int64 sve_callee_adjust = frame.sve_callee_adjust; + poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp; + unsigned reg1 = frame.wb_pop_candidate1; +@@ -9925,9 +9925,9 @@ aarch64_expand_epilogue (bool for_sibcall) + + /* Restore the vector registers before the predicate registers, + so that we can use P4 as a temporary for big-endian SVE frames. */ +- aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM, ++ aarch64_restore_callee_saves (final_adjust, V0_REGNUM, V31_REGNUM, + callee_adjust != 0, &cfi_ops); +- aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM, ++ aarch64_restore_callee_saves (final_adjust, P0_REGNUM, P15_REGNUM, + false, &cfi_ops); + if (maybe_ne (sve_callee_adjust, 0)) + aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true); +@@ -9935,7 +9935,7 @@ aarch64_expand_epilogue (bool for_sibcall) + /* When shadow call stack is enabled, the scs_pop in the epilogue will + restore x30, we don't need to restore x30 again in the traditional + way. */ +- aarch64_restore_callee_saves (callee_offset - sve_callee_adjust, ++ aarch64_restore_callee_saves (final_adjust + sve_callee_adjust, + R0_REGNUM, last_gpr, + callee_adjust != 0, &cfi_ops); + +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h +index de68ff7202f..94fca4b9471 100644 +--- a/gcc/config/aarch64/aarch64.h ++++ b/gcc/config/aarch64/aarch64.h +@@ -907,10 +907,6 @@ struct GTY (()) aarch64_frame + It is zero when no push is used. 
*/ + HOST_WIDE_INT callee_adjust; + +- /* The offset from SP to the callee-save registers after initial_adjust. +- It may be non-zero if no push is used (ie. callee_adjust == 0). */ +- poly_int64 callee_offset; +- + /* The size of the stack adjustment before saving or after restoring + SVE registers. */ + poly_int64 sve_callee_adjust; +-- +2.34.1 + + +From 2b983f9064d808daf909bde1d4a13980934a7e6e Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:51 +0100 +Subject: [PATCH 07/19] aarch64: Only calculate chain_offset if there is a + chain + +After previous patches, it is no longer necessary to calculate +a chain_offset in cases where there is no chain record. + +gcc/ + * config/aarch64/aarch64.cc (aarch64_expand_prologue): Move the + calculation of chain_offset into the emit_frame_chain block. +--- + gcc/config/aarch64/aarch64.cc | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index e79551af41d..d71a042d611 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -9747,16 +9747,16 @@ aarch64_expand_prologue (void) + if (callee_adjust != 0) + aarch64_push_regs (reg1, reg2, callee_adjust); + +- /* The offset of the frame chain record (if any) from the current SP. */ +- poly_int64 chain_offset = (initial_adjust + callee_adjust +- - frame.hard_fp_offset); +- gcc_assert (known_ge (chain_offset, 0)); +- + /* The offset of the current SP from the bottom of the static frame. */ + poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust; + + if (emit_frame_chain) + { ++ /* The offset of the frame chain record (if any) from the current SP. 
*/ ++ poly_int64 chain_offset = (initial_adjust + callee_adjust ++ - frame.hard_fp_offset); ++ gcc_assert (known_ge (chain_offset, 0)); ++ + if (callee_adjust == 0) + { + reg1 = R29_REGNUM; +-- +2.34.1 + + +From 0a0a824808d1dec51004fb5805c1a0ae2a35433f Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:51 +0100 +Subject: [PATCH 08/19] aarch64: Rename locals_offset to bytes_above_locals +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +locals_offset was described as: + + /* Offset from the base of the frame (incomming SP) to the + top of the locals area. This value is always a multiple of + STACK_BOUNDARY. */ + +This is implicitly an “upside down” view of the frame: the incoming +SP is at offset 0, and anything N bytes below the incoming SP is at +offset N (rather than -N). + +However, reg_offset instead uses a “right way up” view; that is, +it views offsets in address terms. Something above X is at a +positive offset from X and something below X is at a negative +offset from X. + +Also, even on FRAME_GROWS_DOWNWARD targets like AArch64, +target-independent code views offsets in address terms too: +locals are allocated at negative offsets to virtual_stack_vars. + +It seems confusing to have *_offset fields of the same structure +using different polarities like this. This patch tries to avoid +that by renaming locals_offset to bytes_above_locals. + +gcc/ + * config/aarch64/aarch64.h (aarch64_frame::locals_offset): Rename to... + (aarch64_frame::bytes_above_locals): ...this. + * config/aarch64/aarch64.cc (aarch64_layout_frame) + (aarch64_initial_elimination_offset): Update accordingly. 
+--- + gcc/config/aarch64/aarch64.cc | 6 +++--- + gcc/config/aarch64/aarch64.h | 6 +++--- + 2 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index d71a042d611..d4ec352ba98 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8337,7 +8337,7 @@ aarch64_layout_frame (void) + STACK_BOUNDARY / BITS_PER_UNIT)); + frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs; + +- frame.locals_offset = frame.saved_varargs_size; ++ frame.bytes_above_locals = frame.saved_varargs_size; + + frame.initial_adjust = 0; + frame.final_adjust = 0; +@@ -12578,13 +12578,13 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to) + return frame.hard_fp_offset; + + if (from == FRAME_POINTER_REGNUM) +- return frame.hard_fp_offset - frame.locals_offset; ++ return frame.hard_fp_offset - frame.bytes_above_locals; + } + + if (to == STACK_POINTER_REGNUM) + { + if (from == FRAME_POINTER_REGNUM) +- return frame.frame_size - frame.locals_offset; ++ return frame.frame_size - frame.bytes_above_locals; + } + + return frame.frame_size; +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h +index 94fca4b9471..bf46e6124aa 100644 +--- a/gcc/config/aarch64/aarch64.h ++++ b/gcc/config/aarch64/aarch64.h +@@ -885,10 +885,10 @@ struct GTY (()) aarch64_frame + always a multiple of STACK_BOUNDARY. */ + poly_int64 bytes_below_hard_fp; + +- /* Offset from the base of the frame (incomming SP) to the +- top of the locals area. This value is always a multiple of ++ /* The number of bytes between the top of the locals area and the top ++ of the frame (the incomming SP). This value is always a multiple of + STACK_BOUNDARY. */ +- poly_int64 locals_offset; ++ poly_int64 bytes_above_locals; + + /* Offset from the base of the frame (incomming SP) to the + hard_frame_pointer. 
This value is always a multiple of +-- +2.34.1 + + +From 3fbf0789202b30a67b12e1fb785c7130f098d665 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:52 +0100 +Subject: [PATCH 09/19] aarch64: Rename hard_fp_offset to bytes_above_hard_fp +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Similarly to the previous locals_offset patch, hard_fp_offset +was described as: + + /* Offset from the base of the frame (incomming SP) to the + hard_frame_pointer. This value is always a multiple of + STACK_BOUNDARY. */ + poly_int64 hard_fp_offset; + +which again took an “upside-down” view: higher offsets meant lower +addresses. This patch renames the field to bytes_above_hard_fp instead. + +gcc/ + * config/aarch64/aarch64.h (aarch64_frame::hard_fp_offset): Rename + to... + (aarch64_frame::bytes_above_hard_fp): ...this. + * config/aarch64/aarch64.cc (aarch64_layout_frame) + (aarch64_expand_prologue): Update accordingly. + (aarch64_initial_elimination_offset): Likewise. +--- + gcc/config/aarch64/aarch64.cc | 26 +++++++++++++------------- + gcc/config/aarch64/aarch64.h | 6 +++--- + 2 files changed, 16 insertions(+), 16 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index d4ec352ba98..3c4052740e7 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8329,7 +8329,7 @@ aarch64_layout_frame (void) + + get_frame_size (), + STACK_BOUNDARY / BITS_PER_UNIT); + +- frame.hard_fp_offset ++ frame.bytes_above_hard_fp + = saved_regs_and_above - frame.below_hard_fp_saved_regs_size; + + /* Both these values are already aligned. 
*/ +@@ -8378,13 +8378,13 @@ aarch64_layout_frame (void) + else if (frame.wb_pop_candidate1 != INVALID_REGNUM) + max_push_offset = 256; + +- HOST_WIDE_INT const_size, const_below_saved_regs, const_fp_offset; ++ HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp; + HOST_WIDE_INT const_saved_regs_size; + if (known_eq (frame.saved_regs_size, 0)) + frame.initial_adjust = frame.frame_size; + else if (frame.frame_size.is_constant (&const_size) + && const_size < max_push_offset +- && known_eq (frame.hard_fp_offset, const_size)) ++ && known_eq (frame.bytes_above_hard_fp, const_size)) + { + /* Simple, small frame with no data below the saved registers. + +@@ -8401,8 +8401,8 @@ aarch64_layout_frame (void) + case that it hardly seems worth the effort though. */ + && (!saves_below_hard_fp_p || const_below_saved_regs == 0) + && !(cfun->calls_alloca +- && frame.hard_fp_offset.is_constant (&const_fp_offset) +- && const_fp_offset < max_push_offset)) ++ && frame.bytes_above_hard_fp.is_constant (&const_above_fp) ++ && const_above_fp < max_push_offset)) + { + /* Frame with small area below the saved registers: + +@@ -8420,12 +8420,12 @@ aarch64_layout_frame (void) + sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size + save SVE registers relative to SP + sub sp, sp, bytes_below_saved_regs */ +- frame.initial_adjust = (frame.hard_fp_offset ++ frame.initial_adjust = (frame.bytes_above_hard_fp + + frame.below_hard_fp_saved_regs_size); + frame.final_adjust = frame.bytes_below_saved_regs; + } +- else if (frame.hard_fp_offset.is_constant (&const_fp_offset) +- && const_fp_offset < max_push_offset) ++ else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp) ++ && const_above_fp < max_push_offset) + { + /* Frame with large area below the saved registers, or with SVE saves, + but with a small area above: +@@ -8435,7 +8435,7 @@ aarch64_layout_frame (void) + [sub sp, sp, below_hard_fp_saved_regs_size] + [save SVE registers relative to SP] + sub sp, sp, 
bytes_below_saved_regs */ +- frame.callee_adjust = const_fp_offset; ++ frame.callee_adjust = const_above_fp; + frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; + frame.final_adjust = frame.bytes_below_saved_regs; + } +@@ -8450,7 +8450,7 @@ aarch64_layout_frame (void) + [sub sp, sp, below_hard_fp_saved_regs_size] + [save SVE registers relative to SP] + sub sp, sp, bytes_below_saved_regs */ +- frame.initial_adjust = frame.hard_fp_offset; ++ frame.initial_adjust = frame.bytes_above_hard_fp; + frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; + frame.final_adjust = frame.bytes_below_saved_regs; + } +@@ -9754,7 +9754,7 @@ aarch64_expand_prologue (void) + { + /* The offset of the frame chain record (if any) from the current SP. */ + poly_int64 chain_offset = (initial_adjust + callee_adjust +- - frame.hard_fp_offset); ++ - frame.bytes_above_hard_fp); + gcc_assert (known_ge (chain_offset, 0)); + + if (callee_adjust == 0) +@@ -12575,10 +12575,10 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to) + if (to == HARD_FRAME_POINTER_REGNUM) + { + if (from == ARG_POINTER_REGNUM) +- return frame.hard_fp_offset; ++ return frame.bytes_above_hard_fp; + + if (from == FRAME_POINTER_REGNUM) +- return frame.hard_fp_offset - frame.bytes_above_locals; ++ return frame.bytes_above_hard_fp - frame.bytes_above_locals; + } + + if (to == STACK_POINTER_REGNUM) +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h +index bf46e6124aa..dd1f403f939 100644 +--- a/gcc/config/aarch64/aarch64.h ++++ b/gcc/config/aarch64/aarch64.h +@@ -890,10 +890,10 @@ struct GTY (()) aarch64_frame + STACK_BOUNDARY. */ + poly_int64 bytes_above_locals; + +- /* Offset from the base of the frame (incomming SP) to the +- hard_frame_pointer. This value is always a multiple of ++ /* The number of bytes between the hard_frame_pointer and the top of ++ the frame (the incomming SP). This value is always a multiple of + STACK_BOUNDARY. 
*/ +- poly_int64 hard_fp_offset; ++ poly_int64 bytes_above_hard_fp; + + /* The size of the frame. This value is the offset from base of the + frame (incomming SP) to the stack_pointer. This value is always +-- +2.34.1 + + +From aac8b31379ac3bbd14fc6427dce23f56e54e8485 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:52 +0100 +Subject: [PATCH 10/19] aarch64: Tweak frame_size comment +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This patch fixes another case in which a value was described with +an “upside-down” view. + +gcc/ + * config/aarch64/aarch64.h (aarch64_frame::frame_size): Tweak comment. +--- + gcc/config/aarch64/aarch64.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h +index dd1f403f939..700524ae22b 100644 +--- a/gcc/config/aarch64/aarch64.h ++++ b/gcc/config/aarch64/aarch64.h +@@ -895,8 +895,8 @@ struct GTY (()) aarch64_frame + STACK_BOUNDARY. */ + poly_int64 bytes_above_hard_fp; + +- /* The size of the frame. This value is the offset from base of the +- frame (incomming SP) to the stack_pointer. This value is always ++ /* The size of the frame, i.e. the number of bytes between the bottom ++ of the outgoing arguments and the incoming SP. This value is always + a multiple of STACK_BOUNDARY. */ + poly_int64 frame_size; + +-- +2.34.1 + + +From 8d5506a8aeb8dd7e8b209a3663b07688478f76b9 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:53 +0100 +Subject: [PATCH 11/19] aarch64: Measure reg_offset from the bottom of the + frame + +reg_offset was measured from the bottom of the saved register area. +This made perfect sense with the original layout, since the bottom +of the saved register area was also the hard frame pointer address. 
+It became slightly less obvious with SVE, since we save SVE +registers below the hard frame pointer, but it still made sense. + +However, if we want to allow different frame layouts, it's more +convenient and obvious to measure reg_offset from the bottom of +the frame. After previous patches, it's also a slight simplification +in its own right. + +gcc/ + * config/aarch64/aarch64.h (aarch64_frame): Add comment above + reg_offset. + * config/aarch64/aarch64.cc (aarch64_layout_frame): Walk offsets + from the bottom of the frame, rather than the bottom of the saved + register area. Measure reg_offset from the bottom of the frame + rather than the bottom of the saved register area. + (aarch64_save_callee_saves): Update accordingly. + (aarch64_restore_callee_saves): Likewise. + (aarch64_get_separate_components): Likewise. + (aarch64_process_components): Likewise. +--- + gcc/config/aarch64/aarch64.cc | 53 ++++++++++++++++------------------- + gcc/config/aarch64/aarch64.h | 3 ++ + 2 files changed, 27 insertions(+), 29 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 3c4052740e7..97dd077844b 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8139,7 +8139,6 @@ aarch64_needs_frame_chain (void) + static void + aarch64_layout_frame (void) + { +- poly_int64 offset = 0; + int regno, last_fp_reg = INVALID_REGNUM; + machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM); + poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode); +@@ -8217,7 +8216,9 @@ aarch64_layout_frame (void) + gcc_assert (crtl->is_leaf + || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED)); + +- frame.bytes_below_saved_regs = crtl->outgoing_args_size; ++ poly_int64 offset = crtl->outgoing_args_size; ++ gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT)); ++ frame.bytes_below_saved_regs = offset; + + /* Now assign stack slots for the registers. 
Start with the predicate + registers, since predicate LDR and STR have a relatively small +@@ -8229,7 +8230,8 @@ aarch64_layout_frame (void) + offset += BYTES_PER_SVE_PRED; + } + +- if (maybe_ne (offset, 0)) ++ poly_int64 saved_prs_size = offset - frame.bytes_below_saved_regs; ++ if (maybe_ne (saved_prs_size, 0)) + { + /* If we have any vector registers to save above the predicate registers, + the offset of the vector register save slots need to be a multiple +@@ -8247,10 +8249,10 @@ aarch64_layout_frame (void) + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); + else + { +- if (known_le (offset, vector_save_size)) +- offset = vector_save_size; +- else if (known_le (offset, vector_save_size * 2)) +- offset = vector_save_size * 2; ++ if (known_le (saved_prs_size, vector_save_size)) ++ offset = frame.bytes_below_saved_regs + vector_save_size; ++ else if (known_le (saved_prs_size, vector_save_size * 2)) ++ offset = frame.bytes_below_saved_regs + vector_save_size * 2; + else + gcc_unreachable (); + } +@@ -8267,9 +8269,10 @@ aarch64_layout_frame (void) + + /* OFFSET is now the offset of the hard frame pointer from the bottom + of the callee save area. */ +- bool saves_below_hard_fp_p = maybe_ne (offset, 0); +- frame.below_hard_fp_saved_regs_size = offset; +- frame.bytes_below_hard_fp = offset + frame.bytes_below_saved_regs; ++ frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs; ++ bool saves_below_hard_fp_p ++ = maybe_ne (frame.below_hard_fp_saved_regs_size, 0); ++ frame.bytes_below_hard_fp = offset; + if (frame.emit_frame_chain) + { + /* FP and LR are placed in the linkage record. 
*/ +@@ -8320,9 +8323,10 @@ aarch64_layout_frame (void) + + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); + +- frame.saved_regs_size = offset; ++ frame.saved_regs_size = offset - frame.bytes_below_saved_regs; + +- poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size; ++ poly_int64 varargs_and_saved_regs_size ++ = frame.saved_regs_size + frame.saved_varargs_size; + + poly_int64 saved_regs_and_above + = aligned_upper_bound (varargs_and_saved_regs_size +@@ -8790,9 +8794,7 @@ aarch64_save_callee_saves (poly_int64 bytes_below_sp, + + machine_mode mode = aarch64_reg_save_mode (regno); + reg = gen_rtx_REG (mode, regno); +- offset = (frame.reg_offset[regno] +- + frame.bytes_below_saved_regs +- - bytes_below_sp); ++ offset = frame.reg_offset[regno] - bytes_below_sp; + rtx base_rtx = stack_pointer_rtx; + poly_int64 sp_offset = offset; + +@@ -8899,9 +8901,7 @@ aarch64_restore_callee_saves (poly_int64 bytes_below_sp, unsigned start, + + machine_mode mode = aarch64_reg_save_mode (regno); + reg = gen_rtx_REG (mode, regno); +- offset = (frame.reg_offset[regno] +- + frame.bytes_below_saved_regs +- - bytes_below_sp); ++ offset = frame.reg_offset[regno] - bytes_below_sp; + rtx base_rtx = stack_pointer_rtx; + if (mode == VNx2DImode && BYTES_BIG_ENDIAN) + aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg, +@@ -9040,14 +9040,12 @@ aarch64_get_separate_components (void) + it as a stack probe for -fstack-clash-protection. */ + if (flag_stack_clash_protection + && maybe_ne (frame.below_hard_fp_saved_regs_size, 0) +- && known_eq (offset, 0)) ++ && known_eq (offset, frame.bytes_below_saved_regs)) + continue; + + /* Get the offset relative to the register we'll use. 
*/ + if (frame_pointer_needed) +- offset -= frame.below_hard_fp_saved_regs_size; +- else +- offset += frame.bytes_below_saved_regs; ++ offset -= frame.bytes_below_hard_fp; + + /* Check that we can access the stack slot of the register with one + direct load with no adjustments needed. */ +@@ -9194,9 +9192,7 @@ aarch64_process_components (sbitmap components, bool prologue_p) + rtx reg = gen_rtx_REG (mode, regno); + poly_int64 offset = frame.reg_offset[regno]; + if (frame_pointer_needed) +- offset -= frame.below_hard_fp_saved_regs_size; +- else +- offset += frame.bytes_below_saved_regs; ++ offset -= frame.bytes_below_hard_fp; + + rtx addr = plus_constant (Pmode, ptr_reg, offset); + rtx mem = gen_frame_mem (mode, addr); +@@ -9248,9 +9244,7 @@ aarch64_process_components (sbitmap components, bool prologue_p) + /* REGNO2 can be saved/restored in a pair with REGNO. */ + rtx reg2 = gen_rtx_REG (mode, regno2); + if (frame_pointer_needed) +- offset2 -= frame.below_hard_fp_saved_regs_size; +- else +- offset2 += frame.bytes_below_saved_regs; ++ offset2 -= frame.bytes_below_hard_fp; + rtx addr2 = plus_constant (Pmode, ptr_reg, offset2); + rtx mem2 = gen_frame_mem (mode, addr2); + rtx set2 = prologue_p ? 
gen_rtx_SET (mem2, reg2) +@@ -9366,7 +9360,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, + if (final_adjustment_p + && known_eq (frame.below_hard_fp_saved_regs_size, 0)) + { +- poly_int64 lr_offset = frame.reg_offset[LR_REGNUM]; ++ poly_int64 lr_offset = (frame.reg_offset[LR_REGNUM] ++ - frame.bytes_below_saved_regs); + if (known_ge (lr_offset, 0)) + min_probe_threshold -= lr_offset.to_constant (); + else +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h +index 700524ae22b..b6135837073 100644 +--- a/gcc/config/aarch64/aarch64.h ++++ b/gcc/config/aarch64/aarch64.h +@@ -860,6 +860,9 @@ extern enum aarch64_processor aarch64_tune; + #ifdef HAVE_POLY_INT_H + struct GTY (()) aarch64_frame + { ++ /* The offset from the bottom of the static frame (the bottom of the ++ outgoing arguments) of each register save slot, or -2 if no save is ++ needed. */ + poly_int64 reg_offset[LAST_SAVED_REGNUM + 1]; + + /* The number of extra stack bytes taken up by register varargs. +-- +2.34.1 + + +From b47766614df3b9df878262efb2ad73aaac108363 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:53 +0100 +Subject: [PATCH 12/19] aarch64: Simplify top of frame allocation + +After previous patches, it no longer really makes sense to allocate +the top of the frame in terms of varargs_and_saved_regs_size and +saved_regs_and_above. + +gcc/ + * config/aarch64/aarch64.cc (aarch64_layout_frame): Simplify + the allocation of the top of the frame. 
+--- + gcc/config/aarch64/aarch64.cc | 23 ++++++++--------------- + 1 file changed, 8 insertions(+), 15 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 97dd077844b..81935852d5b 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8325,23 +8325,16 @@ aarch64_layout_frame (void) + + frame.saved_regs_size = offset - frame.bytes_below_saved_regs; + +- poly_int64 varargs_and_saved_regs_size +- = frame.saved_regs_size + frame.saved_varargs_size; +- +- poly_int64 saved_regs_and_above +- = aligned_upper_bound (varargs_and_saved_regs_size +- + get_frame_size (), +- STACK_BOUNDARY / BITS_PER_UNIT); +- +- frame.bytes_above_hard_fp +- = saved_regs_and_above - frame.below_hard_fp_saved_regs_size; ++ offset += get_frame_size (); ++ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); ++ auto top_of_locals = offset; + +- /* Both these values are already aligned. */ +- gcc_assert (multiple_p (frame.bytes_below_saved_regs, +- STACK_BOUNDARY / BITS_PER_UNIT)); +- frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs; ++ offset += frame.saved_varargs_size; ++ gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT)); ++ frame.frame_size = offset; + +- frame.bytes_above_locals = frame.saved_varargs_size; ++ frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp; ++ frame.bytes_above_locals = frame.frame_size - top_of_locals; + + frame.initial_adjust = 0; + frame.final_adjust = 0; +-- +2.34.1 + + +From 08f71b4bb28fb74d20e8d2927a557e8119ce9f4d Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:54 +0100 +Subject: [PATCH 13/19] aarch64: Minor initial adjustment tweak + +This patch just changes a calculation of initial_adjust +to one that makes it slightly more obvious that the total +adjustment is frame.frame_size. 
+ +gcc/ + * config/aarch64/aarch64.cc (aarch64_layout_frame): Tweak + calculation of initial_adjust for frames in which all saves + are SVE saves. +--- + gcc/config/aarch64/aarch64.cc | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 81935852d5b..4d9fcf3d162 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8414,11 +8414,10 @@ aarch64_layout_frame (void) + { + /* Frame in which all saves are SVE saves: + +- sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size ++ sub sp, sp, frame_size - bytes_below_saved_regs + save SVE registers relative to SP + sub sp, sp, bytes_below_saved_regs */ +- frame.initial_adjust = (frame.bytes_above_hard_fp +- + frame.below_hard_fp_saved_regs_size); ++ frame.initial_adjust = frame.frame_size - frame.bytes_below_saved_regs; + frame.final_adjust = frame.bytes_below_saved_regs; + } + else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp) +-- +2.34.1 + + +From f22315d5c19e8310e4dc880fd509678fd291fca8 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:54 +0100 +Subject: [PATCH 14/19] aarch64: Tweak stack clash boundary condition + +The AArch64 ABI says that, when stack clash protection is used, +there can be a maximum of 1KiB of unprobed space at sp on entry +to a function. Therefore, we need to probe when allocating +>= guard_size - 1KiB of data (>= rather than >). This is what +GCC does. + +If an allocation is exactly guard_size bytes, it is enough to allocate +those bytes and probe once at offset 1024. It isn't possible to use a +single probe at any other offset: higher would complicate later code, +by leaving more unprobed space than usual, while lower would risk +leaving an entire page unprobed. For simplicity, the code probes all +allocations at offset 1024. + +Some register saves also act as probes.
If we need to allocate +more space below the last such register save probe, we need to +probe the allocation if it is > 1KiB. Again, this allocation is +then sometimes (but not always) probed at offset 1024. This sort of +allocation is currently only used for outgoing arguments, which are +rarely this big. + +However, the code also probed if this final outgoing-arguments +allocation was == 1KiB, rather than just > 1KiB. This isn't +necessary, since the register save then probes at offset 1024 +as required. Continuing to probe allocations of exactly 1KiB +would complicate later patches. + +gcc/ + * config/aarch64/aarch64.cc (aarch64_allocate_and_probe_stack_space): + Don't probe final allocations that are exactly 1KiB in size (after + unprobed space above the final allocation has been deducted). + +gcc/testsuite/ + * gcc.target/aarch64/stack-check-prologue-17.c: New test. +--- + gcc/config/aarch64/aarch64.cc | 4 +- + .../aarch64/stack-check-prologue-17.c | 55 +++++++++++++++++++ + 2 files changed, 58 insertions(+), 1 deletion(-) + create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 4d9fcf3d162..34c1d8614cd 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -9333,9 +9333,11 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, + HOST_WIDE_INT guard_size + = 1 << param_stack_clash_protection_guard_size; + HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD; ++ HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT; ++ gcc_assert (multiple_p (poly_size, byte_sp_alignment)); + HOST_WIDE_INT min_probe_threshold + = (final_adjustment_p +- ? guard_used_by_caller ++ ? guard_used_by_caller + byte_sp_alignment + : guard_size - guard_used_by_caller); + /* When doing the final adjustment for the outgoing arguments, take into + account any unprobed space there is above the current SP. 
There are +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c +new file mode 100644 +index 00000000000..0d8a25d73a2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c +@@ -0,0 +1,55 @@ ++/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++void f(int, ...); ++void g(); ++ ++/* ++** test1: ++** ... ++** str x30, \[sp\] ++** sub sp, sp, #1024 ++** cbnz w0, .* ++** bl g ++** ... ++*/ ++int test1(int z) { ++ __uint128_t x = 0; ++ int y[0x400]; ++ if (z) ++ { ++ f(0, 0, 0, 0, 0, 0, 0, &y, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); ++ } ++ g(); ++ return 1; ++} ++ ++/* ++** test2: ++** ... ++** str x30, \[sp\] ++** sub sp, sp, #1040 ++** str xzr, \[sp\] ++** cbnz w0, .* ++** bl g ++** ... ++*/ ++int test2(int z) { ++ __uint128_t x = 0; ++ int y[0x400]; ++ if (z) ++ { ++ f(0, 0, 0, 0, 0, 0, 0, &y, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x); ++ } ++ g(); ++ return 1; ++} +-- +2.34.1 + + +From 15e18831bf98fd25af098b970ebf0c9a6200a34b Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:55 +0100 +Subject: [PATCH 15/19] aarch64: Put LR save probe in first 16 bytes + +-fstack-clash-protection uses the save of LR as a probe for the next +allocation. The next allocation could be: + +* another part of the static frame, e.g. 
when allocating SVE save slots + or outgoing arguments + +* an alloca in the same function + +* an allocation made by a callee function + +However, when -fomit-frame-pointer is used, the LR save slot is placed +above the other GPR save slots. It could therefore be up to 80 bytes +above the base of the GPR save area (which is also the hard fp address). + +aarch64_allocate_and_probe_stack_space took this into account when +deciding how much subsequent space could be allocated without needing +a probe. However, it interacted badly with: + + /* If doing a small final adjustment, we always probe at offset 0. + This is done to avoid issues when LR is not at position 0 or when + the final adjustment is smaller than the probing offset. */ + else if (final_adjustment_p && rounded_size == 0) + residual_probe_offset = 0; + +which forces any allocation that is smaller than the guard page size +to be probed at offset 0 rather than the usual offset 1024. It was +therefore possible to construct cases in which we had: + +* a probe using LR at SP + 80 bytes (or some other value >= 16) +* an allocation of the guard page size - 16 bytes +* a probe at SP + 0 + +which allocates guard page size + 64 consecutive unprobed bytes. + +This patch requires the LR probe to be in the first 16 bytes of the +save area when stack clash protection is active. Doing it +unconditionally would cause code-quality regressions. + +Putting LR before other registers prevents push/pop allocation +when shadow call stacks are enabled, since LR is restored +separately from the other callee-saved registers. + +The new comment doesn't say that the probe register is required +to be LR, since a later patch removes that restriction. + +gcc/ + * config/aarch64/aarch64.cc (aarch64_layout_frame): Ensure that + the LR save slot is in the first 16 bytes of the register save area. + Only form STP/LDP push/pop candidates if both registers are valid. 
+ (aarch64_allocate_and_probe_stack_space): Remove workaround for + when LR was not in the first 16 bytes. + +gcc/testsuite/ + * gcc.target/aarch64/stack-check-prologue-18.c: New test. + * gcc.target/aarch64/stack-check-prologue-19.c: Likewise. + * gcc.target/aarch64/stack-check-prologue-20.c: Likewise. +--- + gcc/config/aarch64/aarch64.cc | 72 ++++++------- + .../aarch64/stack-check-prologue-18.c | 100 ++++++++++++++++++ + .../aarch64/stack-check-prologue-19.c | 100 ++++++++++++++++++ + .../aarch64/stack-check-prologue-20.c | 3 + + 4 files changed, 233 insertions(+), 42 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c + create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c + create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 34c1d8614cd..16433fb70f4 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8273,26 +8273,34 @@ aarch64_layout_frame (void) + bool saves_below_hard_fp_p + = maybe_ne (frame.below_hard_fp_saved_regs_size, 0); + frame.bytes_below_hard_fp = offset; ++ ++ auto allocate_gpr_slot = [&](unsigned int regno) ++ { ++ frame.reg_offset[regno] = offset; ++ if (frame.wb_push_candidate1 == INVALID_REGNUM) ++ frame.wb_push_candidate1 = regno; ++ else if (frame.wb_push_candidate2 == INVALID_REGNUM) ++ frame.wb_push_candidate2 = regno; ++ offset += UNITS_PER_WORD; ++ }; ++ + if (frame.emit_frame_chain) + { + /* FP and LR are placed in the linkage record. 
*/ +- frame.reg_offset[R29_REGNUM] = offset; +- frame.wb_push_candidate1 = R29_REGNUM; +- frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD; +- frame.wb_push_candidate2 = R30_REGNUM; +- offset += 2 * UNITS_PER_WORD; ++ allocate_gpr_slot (R29_REGNUM); ++ allocate_gpr_slot (R30_REGNUM); + } ++ else if (flag_stack_clash_protection ++ && known_eq (frame.reg_offset[R30_REGNUM], SLOT_REQUIRED)) ++ /* Put the LR save slot first, since it makes a good choice of probe ++ for stack clash purposes. The idea is that the link register usually ++ has to be saved before a call anyway, and so we lose little by ++ stopping it from being individually shrink-wrapped. */ ++ allocate_gpr_slot (R30_REGNUM); + + for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++) + if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) +- { +- frame.reg_offset[regno] = offset; +- if (frame.wb_push_candidate1 == INVALID_REGNUM) +- frame.wb_push_candidate1 = regno; +- else if (frame.wb_push_candidate2 == INVALID_REGNUM) +- frame.wb_push_candidate2 = regno; +- offset += UNITS_PER_WORD; +- } ++ allocate_gpr_slot (regno); + + poly_int64 max_int_offset = offset; + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); +@@ -8370,10 +8378,13 @@ aarch64_layout_frame (void) + max_push_offset to 0, because no registers are popped at this time, + so callee_adjust cannot be adjusted. */ + HOST_WIDE_INT max_push_offset = 0; +- if (frame.wb_pop_candidate2 != INVALID_REGNUM) +- max_push_offset = 512; +- else if (frame.wb_pop_candidate1 != INVALID_REGNUM) +- max_push_offset = 256; ++ if (frame.wb_pop_candidate1 != INVALID_REGNUM) ++ { ++ if (frame.wb_pop_candidate2 != INVALID_REGNUM) ++ max_push_offset = 512; ++ else ++ max_push_offset = 256; ++ } + + HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp; + HOST_WIDE_INT const_saved_regs_size; +@@ -9339,29 +9350,6 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, + = (final_adjustment_p + ? 
guard_used_by_caller + byte_sp_alignment + : guard_size - guard_used_by_caller); +- /* When doing the final adjustment for the outgoing arguments, take into +- account any unprobed space there is above the current SP. There are +- two cases: +- +- - When saving SVE registers below the hard frame pointer, we force +- the lowest save to take place in the prologue before doing the final +- adjustment (i.e. we don't allow the save to be shrink-wrapped). +- This acts as a probe at SP, so there is no unprobed space. +- +- - When there are no SVE register saves, we use the store of the link +- register as a probe. We can't assume that LR was saved at position 0 +- though, so treat any space below it as unprobed. */ +- if (final_adjustment_p +- && known_eq (frame.below_hard_fp_saved_regs_size, 0)) +- { +- poly_int64 lr_offset = (frame.reg_offset[LR_REGNUM] +- - frame.bytes_below_saved_regs); +- if (known_ge (lr_offset, 0)) +- min_probe_threshold -= lr_offset.to_constant (); +- else +- gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0)); +- } +- + poly_int64 frame_size = frame.frame_size; + + /* We should always have a positive probe threshold. */ +@@ -9541,8 +9529,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, + if (final_adjustment_p && rounded_size != 0) + min_probe_threshold = 0; + /* If doing a small final adjustment, we always probe at offset 0. +- This is done to avoid issues when LR is not at position 0 or when +- the final adjustment is smaller than the probing offset. */ ++ This is done to avoid issues when the final adjustment is smaller ++ than the probing offset. 
*/ + else if (final_adjustment_p && rounded_size == 0) + residual_probe_offset = 0; + +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c +new file mode 100644 +index 00000000000..82447d20fff +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c +@@ -0,0 +1,100 @@ ++/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++void f(int, ...); ++void g(); ++ ++/* ++** test1: ++** ... ++** str x30, \[sp\] ++** sub sp, sp, #4064 ++** str xzr, \[sp\] ++** cbnz w0, .* ++** bl g ++** ... ++** str x26, \[sp, #?4128\] ++** ... ++*/ ++int test1(int z) { ++ __uint128_t x = 0; ++ int y[0x400]; ++ if (z) ++ { ++ asm volatile ("" ::: ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"); ++ f(0, 0, 0, 0, 0, 0, 0, &y, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x); ++ } ++ g(); ++ return 1; ++} ++ ++/* ++** test2: ++** ... ++** str x30, \[sp\] ++** sub sp, sp, #1040 ++** str xzr, \[sp\] ++** cbnz w0, .* ++** bl g ++** ... 
++*/ ++int test2(int z) { ++ __uint128_t x = 0; ++ int y[0x400]; ++ if (z) ++ { ++ asm volatile ("" ::: ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"); ++ f(0, 0, 0, 0, 0, 0, 0, &y, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x); ++ } ++ g(); ++ return 1; ++} ++ ++/* ++** test3: ++** ... ++** str x30, \[sp\] ++** sub sp, sp, #1024 ++** cbnz w0, .* ++** bl g ++** ... ++*/ ++int test3(int z) { ++ __uint128_t x = 0; ++ int y[0x400]; ++ if (z) ++ { ++ asm volatile ("" ::: ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"); ++ f(0, 0, 0, 0, 0, 0, 0, &y, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); ++ } ++ g(); ++ return 1; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c +new file mode 100644 +index 00000000000..73ac3e4e4eb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c +@@ -0,0 +1,100 @@ ++/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12 -fsanitize=shadow-call-stack -ffixed-x18" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++void f(int, ...); ++void g(); ++ ++/* ++** test1: ++** ... ++** str x30, \[sp\] ++** sub sp, sp, #4064 ++** str xzr, \[sp\] ++** cbnz w0, .* ++** bl g ++** ... ++** str x26, \[sp, #?4128\] ++** ... 
++*/ ++int test1(int z) { ++ __uint128_t x = 0; ++ int y[0x400]; ++ if (z) ++ { ++ asm volatile ("" ::: ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"); ++ f(0, 0, 0, 0, 0, 0, 0, &y, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x); ++ } ++ g(); ++ return 1; ++} ++ ++/* ++** test2: ++** ... ++** str x30, \[sp\] ++** sub sp, sp, #1040 ++** str xzr, \[sp\] ++** cbnz w0, .* ++** bl g ++** ... ++*/ ++int test2(int z) { ++ __uint128_t x = 0; ++ int y[0x400]; ++ if (z) ++ { ++ asm volatile ("" ::: ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"); ++ f(0, 0, 0, 0, 0, 0, 0, &y, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x); ++ } ++ g(); ++ return 1; ++} ++ ++/* ++** test3: ++** ... ++** str x30, \[sp\] ++** sub sp, sp, #1024 ++** cbnz w0, .* ++** bl g ++** ... 
++*/ ++int test3(int z) { ++ __uint128_t x = 0; ++ int y[0x400]; ++ if (z) ++ { ++ asm volatile ("" ::: ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"); ++ f(0, 0, 0, 0, 0, 0, 0, &y, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); ++ } ++ g(); ++ return 1; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c +new file mode 100644 +index 00000000000..690aae8dfd5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c +@@ -0,0 +1,3 @@ ++/* { dg-options "-O2 -fstack-protector-all -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12 -fsanitize=shadow-call-stack -ffixed-x18" } */ ++ ++#include "stack-check-prologue-19.c" +-- +2.34.1 + + +From c4f0e121faa36342f1d21919e54a05ad841c4f86 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:55 +0100 +Subject: [PATCH 16/19] aarch64: Simplify probe of final frame allocation + +Previous patches ensured that the final frame allocation only needs +a probe when the size is strictly greater than 1KiB. It's therefore +safe to use the normal 1024 probe offset in all cases. + +The main motivation for doing this is to simplify the code and +remove the number of special cases. + +gcc/ + * config/aarch64/aarch64.cc (aarch64_allocate_and_probe_stack_space): + Always probe the residual allocation at offset 1024, asserting + that that is in range. + +gcc/testsuite/ + * gcc.target/aarch64/stack-check-prologue-17.c: Expect the probe + to be at offset 1024 rather than offset 0. + * gcc.target/aarch64/stack-check-prologue-18.c: Likewise. + * gcc.target/aarch64/stack-check-prologue-19.c: Likewise. 
+--- + gcc/config/aarch64/aarch64.cc | 12 ++++-------- + .../gcc.target/aarch64/stack-check-prologue-17.c | 2 +- + .../gcc.target/aarch64/stack-check-prologue-18.c | 4 ++-- + .../gcc.target/aarch64/stack-check-prologue-19.c | 4 ++-- + 4 files changed, 9 insertions(+), 13 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 16433fb70f4..8abf3d7a1e2 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -9523,16 +9523,12 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, + are still safe. */ + if (residual) + { +- HOST_WIDE_INT residual_probe_offset = guard_used_by_caller; ++ gcc_assert (guard_used_by_caller + byte_sp_alignment <= size); ++ + /* If we're doing final adjustments, and we've done any full page + allocations then any residual needs to be probed. */ + if (final_adjustment_p && rounded_size != 0) + min_probe_threshold = 0; +- /* If doing a small final adjustment, we always probe at offset 0. +- This is done to avoid issues when the final adjustment is smaller +- than the probing offset. */ +- else if (final_adjustment_p && rounded_size == 0) +- residual_probe_offset = 0; + + aarch64_sub_sp (temp1, temp2, residual, frame_related_p); + if (residual >= min_probe_threshold) +@@ -9543,8 +9539,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, + HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required." 
+ "\n", residual); + +- emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, +- residual_probe_offset)); ++ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, ++ guard_used_by_caller)); + emit_insn (gen_blockage ()); + } + } +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c +index 0d8a25d73a2..f0ec1389771 100644 +--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c +@@ -33,7 +33,7 @@ int test1(int z) { + ** ... + ** str x30, \[sp\] + ** sub sp, sp, #1040 +-** str xzr, \[sp\] ++** str xzr, \[sp, #?1024\] + ** cbnz w0, .* + ** bl g + ** ... +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c +index 82447d20fff..6383bec5ebc 100644 +--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c +@@ -9,7 +9,7 @@ void g(); + ** ... + ** str x30, \[sp\] + ** sub sp, sp, #4064 +-** str xzr, \[sp\] ++** str xzr, \[sp, #?1024\] + ** cbnz w0, .* + ** bl g + ** ... +@@ -50,7 +50,7 @@ int test1(int z) { + ** ... + ** str x30, \[sp\] + ** sub sp, sp, #1040 +-** str xzr, \[sp\] ++** str xzr, \[sp, #?1024\] + ** cbnz w0, .* + ** bl g + ** ... +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c +index 73ac3e4e4eb..562039b5e9b 100644 +--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c +@@ -9,7 +9,7 @@ void g(); + ** ... + ** str x30, \[sp\] + ** sub sp, sp, #4064 +-** str xzr, \[sp\] ++** str xzr, \[sp, #?1024\] + ** cbnz w0, .* + ** bl g + ** ... +@@ -50,7 +50,7 @@ int test1(int z) { + ** ... 
+ ** str x30, \[sp\] + ** sub sp, sp, #1040 +-** str xzr, \[sp\] ++** str xzr, \[sp, #?1024\] + ** cbnz w0, .* + ** bl g + ** ... +-- +2.34.1 + + +From 6f0ab0a9f46a17b68349ff6035aa776bf65f0575 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:56 +0100 +Subject: [PATCH 17/19] aarch64: Explicitly record probe registers in frame + info + +The stack frame is currently divided into three areas: + +A: the area above the hard frame pointer +B: the SVE saves below the hard frame pointer +C: the outgoing arguments + +If the stack frame is allocated in one chunk, the allocation needs a +probe if the frame size is >= guard_size - 1KiB. In addition, if the +function is not a leaf function, it must probe an address no more than +1KiB above the outgoing SP. We ensured the second condition by + +(1) using single-chunk allocations for non-leaf functions only if + the link register save slot is within 512 bytes of the bottom + of the frame; and + +(2) using the link register save as a probe (meaning, for instance, + that it can't be individually shrink wrapped) + +If instead the stack is allocated in multiple chunks, then: + +* an allocation involving only the outgoing arguments (C above) requires + a probe if the allocation size is > 1KiB + +* any other allocation requires a probe if the allocation size + is >= guard_size - 1KiB + +* second and subsequent allocations require the previous allocation + to probe at the bottom of the allocated area, regardless of the size + of that previous allocation + +The final point means that, unlike for single allocations, +it can be necessary to have both a non-SVE register probe and +an SVE register probe. For example: + +* allocate A, probe using a non-SVE register save +* allocate B, probe using an SVE register save +* allocate C + +The non-SVE register used in this case was again the link register. 
+It was previously used even if the link register save slot was some +bytes above the bottom of the non-SVE register saves, but an earlier +patch avoided that by putting the link register save slot first. + +As a belt-and-braces fix, this patch explicitly records which +probe registers we're using and allows the non-SVE probe to be +whichever register comes first (as for SVE). + +The patch also avoids unnecessary probes in sve/pcs/stack_clash_3.c. + +gcc/ + * config/aarch64/aarch64.h (aarch64_frame::sve_save_and_probe) + (aarch64_frame::hard_fp_save_and_probe): New fields. + * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize them. + Rather than asserting that a leaf function saves LR, instead assert + that a leaf function saves something. + (aarch64_get_separate_components): Prevent the chosen probe + registers from being individually shrink-wrapped. + (aarch64_allocate_and_probe_stack_space): Remove workaround for + probe registers that aren't at the bottom of the previous allocation. + +gcc/testsuite/ + * gcc.target/aarch64/sve/pcs/stack_clash_3.c: Avoid redundant probes. +--- + gcc/config/aarch64/aarch64.cc | 68 +++++++++++++++---- + gcc/config/aarch64/aarch64.h | 8 +++ + .../aarch64/sve/pcs/stack_clash_3.c | 6 +- + 3 files changed, 64 insertions(+), 18 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 8abf3d7a1e2..a8d907df884 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8210,15 +8210,11 @@ aarch64_layout_frame (void) + && !crtl->abi->clobbers_full_reg_p (regno)) + frame.reg_offset[regno] = SLOT_REQUIRED; + +- /* With stack-clash, LR must be saved in non-leaf functions. The saving of +- LR counts as an implicit probe which allows us to maintain the invariant +- described in the comment at expand_prologue. 
*/ +- gcc_assert (crtl->is_leaf +- || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED)); + + poly_int64 offset = crtl->outgoing_args_size; + gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT)); + frame.bytes_below_saved_regs = offset; ++ frame.sve_save_and_probe = INVALID_REGNUM; + + /* Now assign stack slots for the registers. Start with the predicate + registers, since predicate LDR and STR have a relatively small +@@ -8226,6 +8222,8 @@ aarch64_layout_frame (void) + for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++) + if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) + { ++ if (frame.sve_save_and_probe == INVALID_REGNUM) ++ frame.sve_save_and_probe = regno; + frame.reg_offset[regno] = offset; + offset += BYTES_PER_SVE_PRED; + } +@@ -8263,6 +8261,8 @@ aarch64_layout_frame (void) + for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) + if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) + { ++ if (frame.sve_save_and_probe == INVALID_REGNUM) ++ frame.sve_save_and_probe = regno; + frame.reg_offset[regno] = offset; + offset += vector_save_size; + } +@@ -8272,10 +8272,18 @@ aarch64_layout_frame (void) + frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs; + bool saves_below_hard_fp_p + = maybe_ne (frame.below_hard_fp_saved_regs_size, 0); ++ gcc_assert (!saves_below_hard_fp_p ++ || (frame.sve_save_and_probe != INVALID_REGNUM ++ && known_eq (frame.reg_offset[frame.sve_save_and_probe], ++ frame.bytes_below_saved_regs))); ++ + frame.bytes_below_hard_fp = offset; ++ frame.hard_fp_save_and_probe = INVALID_REGNUM; + + auto allocate_gpr_slot = [&](unsigned int regno) + { ++ if (frame.hard_fp_save_and_probe == INVALID_REGNUM) ++ frame.hard_fp_save_and_probe = regno; + frame.reg_offset[regno] = offset; + if (frame.wb_push_candidate1 == INVALID_REGNUM) + frame.wb_push_candidate1 = regno; +@@ -8309,6 +8317,8 @@ aarch64_layout_frame (void) + for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) + if (known_eq 
(frame.reg_offset[regno], SLOT_REQUIRED)) + { ++ if (frame.hard_fp_save_and_probe == INVALID_REGNUM) ++ frame.hard_fp_save_and_probe = regno; + /* If there is an alignment gap between integer and fp callee-saves, + allocate the last fp register to it if possible. */ + if (regno == last_fp_reg +@@ -8332,6 +8342,17 @@ aarch64_layout_frame (void) + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); + + frame.saved_regs_size = offset - frame.bytes_below_saved_regs; ++ gcc_assert (known_eq (frame.saved_regs_size, ++ frame.below_hard_fp_saved_regs_size) ++ || (frame.hard_fp_save_and_probe != INVALID_REGNUM ++ && known_eq (frame.reg_offset[frame.hard_fp_save_and_probe], ++ frame.bytes_below_hard_fp))); ++ ++ /* With stack-clash, a register must be saved in non-leaf functions. ++ The saving of the bottommost register counts as an implicit probe, ++ which allows us to maintain the invariant described in the comment ++ at expand_prologue. */ ++ gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0)); + + offset += get_frame_size (); + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); +@@ -8462,6 +8483,25 @@ aarch64_layout_frame (void) + frame.final_adjust = frame.bytes_below_saved_regs; + } + ++ /* The frame is allocated in pieces, with each non-final piece ++ including a register save at offset 0 that acts as a probe for ++ the following piece. In addition, the save of the bottommost register ++ acts as a probe for callees and allocas. Roll back any probes that ++ aren't needed. ++ ++ A probe isn't needed if it is associated with the final allocation ++ (including callees and allocas) that happens before the epilogue is ++ executed. 
*/ ++ if (crtl->is_leaf ++ && !cfun->calls_alloca ++ && known_eq (frame.final_adjust, 0)) ++ { ++ if (maybe_ne (frame.sve_callee_adjust, 0)) ++ frame.sve_save_and_probe = INVALID_REGNUM; ++ else ++ frame.hard_fp_save_and_probe = INVALID_REGNUM; ++ } ++ + /* Make sure the individual adjustments add up to the full frame size. */ + gcc_assert (known_eq (frame.initial_adjust + + frame.callee_adjust +@@ -9039,13 +9079,6 @@ aarch64_get_separate_components (void) + + poly_int64 offset = frame.reg_offset[regno]; + +- /* If the register is saved in the first SVE save slot, we use +- it as a stack probe for -fstack-clash-protection. */ +- if (flag_stack_clash_protection +- && maybe_ne (frame.below_hard_fp_saved_regs_size, 0) +- && known_eq (offset, frame.bytes_below_saved_regs)) +- continue; +- + /* Get the offset relative to the register we'll use. */ + if (frame_pointer_needed) + offset -= frame.bytes_below_hard_fp; +@@ -9080,6 +9113,13 @@ aarch64_get_separate_components (void) + + bitmap_clear_bit (components, LR_REGNUM); + bitmap_clear_bit (components, SP_REGNUM); ++ if (flag_stack_clash_protection) ++ { ++ if (frame.sve_save_and_probe != INVALID_REGNUM) ++ bitmap_clear_bit (components, frame.sve_save_and_probe); ++ if (frame.hard_fp_save_and_probe != INVALID_REGNUM) ++ bitmap_clear_bit (components, frame.hard_fp_save_and_probe); ++ } + + return components; + } +@@ -9616,8 +9656,8 @@ aarch64_epilogue_uses (int regno) + When probing is needed, we emit a probe at the start of the prologue + and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter. + +- We have to track how much space has been allocated and the only stores +- to the stack we track as implicit probes are the FP/LR stores. ++ We can also use register saves as probes. These are stored in ++ sve_save_and_probe and hard_fp_save_and_probe. + + For outgoing arguments we probe if the size is larger than 1KB, such that + the ABI specified buffer is maintained for the next callee. 
+diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h +index b6135837073..46d4693e206 100644 +--- a/gcc/config/aarch64/aarch64.h ++++ b/gcc/config/aarch64/aarch64.h +@@ -957,6 +957,14 @@ struct GTY (()) aarch64_frame + This is the register they should use. */ + unsigned spare_pred_reg; + ++ /* An SVE register that is saved below the hard frame pointer and that acts ++ as a probe for later allocations, or INVALID_REGNUM if none. */ ++ unsigned sve_save_and_probe; ++ ++ /* A register that is saved at the hard frame pointer and that acts ++ as a probe for later allocations, or INVALID_REGNUM if none. */ ++ unsigned hard_fp_save_and_probe; ++ + bool laid_out; + + /* True if shadow call stack should be enabled for the current function. */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c +index 3e01ec36c3a..3530a0d504b 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c +@@ -11,11 +11,10 @@ + ** mov x11, sp + ** ... + ** sub sp, sp, x13 +-** str p4, \[sp\] + ** cbz w0, [^\n]* ++** str p4, \[sp\] + ** ... + ** ptrue p0\.b, all +-** ldr p4, \[sp\] + ** addvl sp, sp, #1 + ** ldr x24, \[sp\], 32 + ** ret +@@ -39,13 +38,12 @@ test_1 (int n) + ** mov x11, sp + ** ... + ** sub sp, sp, x13 +-** str p4, \[sp\] + ** cbz w0, [^\n]* ++** str p4, \[sp\] + ** str p5, \[sp, #1, mul vl\] + ** str p6, \[sp, #2, mul vl\] + ** ... + ** ptrue p0\.b, all +-** ldr p4, \[sp\] + ** addvl sp, sp, #1 + ** ldr x24, \[sp\], 32 + ** ret +-- +2.34.1 + + +From 8254e1b9cd500e0c278465a3657543477e9d1250 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:56 +0100 +Subject: [PATCH 18/19] aarch64: Remove below_hard_fp_saved_regs_size + +After previous patches, it's no longer necessary to store +saved_regs_size and below_hard_fp_saved_regs_size in the frame info. 
+All measurements instead use the top or bottom of the frame as +reference points. + +gcc/ + * config/aarch64/aarch64.h (aarch64_frame::saved_regs_size) + (aarch64_frame::below_hard_fp_saved_regs_size): Delete. + * config/aarch64/aarch64.cc (aarch64_layout_frame): Update accordingly. +--- + gcc/config/aarch64/aarch64.cc | 45 ++++++++++++++++------------------- + gcc/config/aarch64/aarch64.h | 7 ------ + 2 files changed, 21 insertions(+), 31 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index a8d907df884..ac3d3b336a3 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8269,9 +8269,8 @@ aarch64_layout_frame (void) + + /* OFFSET is now the offset of the hard frame pointer from the bottom + of the callee save area. */ +- frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs; +- bool saves_below_hard_fp_p +- = maybe_ne (frame.below_hard_fp_saved_regs_size, 0); ++ auto below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs; ++ bool saves_below_hard_fp_p = maybe_ne (below_hard_fp_saved_regs_size, 0); + gcc_assert (!saves_below_hard_fp_p + || (frame.sve_save_and_probe != INVALID_REGNUM + && known_eq (frame.reg_offset[frame.sve_save_and_probe], +@@ -8341,9 +8340,8 @@ aarch64_layout_frame (void) + + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); + +- frame.saved_regs_size = offset - frame.bytes_below_saved_regs; +- gcc_assert (known_eq (frame.saved_regs_size, +- frame.below_hard_fp_saved_regs_size) ++ auto saved_regs_size = offset - frame.bytes_below_saved_regs; ++ gcc_assert (known_eq (saved_regs_size, below_hard_fp_saved_regs_size) + || (frame.hard_fp_save_and_probe != INVALID_REGNUM + && known_eq (frame.reg_offset[frame.hard_fp_save_and_probe], + frame.bytes_below_hard_fp))); +@@ -8352,7 +8350,7 @@ aarch64_layout_frame (void) + The saving of the bottommost register counts as an implicit probe, + which allows us to maintain the 
invariant described in the comment + at expand_prologue. */ +- gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0)); ++ gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0)); + + offset += get_frame_size (); + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); +@@ -8409,7 +8407,7 @@ aarch64_layout_frame (void) + + HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp; + HOST_WIDE_INT const_saved_regs_size; +- if (known_eq (frame.saved_regs_size, 0)) ++ if (known_eq (saved_regs_size, 0)) + frame.initial_adjust = frame.frame_size; + else if (frame.frame_size.is_constant (&const_size) + && const_size < max_push_offset +@@ -8422,7 +8420,7 @@ aarch64_layout_frame (void) + frame.callee_adjust = const_size; + } + else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs) +- && frame.saved_regs_size.is_constant (&const_saved_regs_size) ++ && saved_regs_size.is_constant (&const_saved_regs_size) + && const_below_saved_regs + const_saved_regs_size < 512 + /* We could handle this case even with data below the saved + registers, provided that that data left us with valid offsets +@@ -8441,8 +8439,7 @@ aarch64_layout_frame (void) + frame.initial_adjust = frame.frame_size; + } + else if (saves_below_hard_fp_p +- && known_eq (frame.saved_regs_size, +- frame.below_hard_fp_saved_regs_size)) ++ && known_eq (saved_regs_size, below_hard_fp_saved_regs_size)) + { + /* Frame in which all saves are SVE saves: + +@@ -8464,7 +8461,7 @@ aarch64_layout_frame (void) + [save SVE registers relative to SP] + sub sp, sp, bytes_below_saved_regs */ + frame.callee_adjust = const_above_fp; +- frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; ++ frame.sve_callee_adjust = below_hard_fp_saved_regs_size; + frame.final_adjust = frame.bytes_below_saved_regs; + } + else +@@ -8479,7 +8476,7 @@ aarch64_layout_frame (void) + [save SVE registers relative to SP] + sub sp, sp, bytes_below_saved_regs */ + frame.initial_adjust = 
frame.bytes_above_hard_fp; +- frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; ++ frame.sve_callee_adjust = below_hard_fp_saved_regs_size; + frame.final_adjust = frame.bytes_below_saved_regs; + } + +@@ -9621,17 +9618,17 @@ aarch64_epilogue_uses (int regno) + | local variables | <-- frame_pointer_rtx + | | + +-------------------------------+ +- | padding | \ +- +-------------------------------+ | +- | callee-saved registers | | frame.saved_regs_size +- +-------------------------------+ | +- | LR' | | +- +-------------------------------+ | +- | FP' | | +- +-------------------------------+ |<- hard_frame_pointer_rtx (aligned) +- | SVE vector registers | | \ +- +-------------------------------+ | | below_hard_fp_saved_regs_size +- | SVE predicate registers | / / ++ | padding | ++ +-------------------------------+ ++ | callee-saved registers | ++ +-------------------------------+ ++ | LR' | ++ +-------------------------------+ ++ | FP' | ++ +-------------------------------+ <-- hard_frame_pointer_rtx (aligned) ++ | SVE vector registers | ++ +-------------------------------+ ++ | SVE predicate registers | + +-------------------------------+ + | dynamic allocation | + +-------------------------------+ +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h +index 46d4693e206..01f7751bc78 100644 +--- a/gcc/config/aarch64/aarch64.h ++++ b/gcc/config/aarch64/aarch64.h +@@ -871,18 +871,11 @@ struct GTY (()) aarch64_frame + STACK_BOUNDARY. */ + HOST_WIDE_INT saved_varargs_size; + +- /* The size of the callee-save registers with a slot in REG_OFFSET. */ +- poly_int64 saved_regs_size; +- + /* The number of bytes between the bottom of the static frame (the bottom + of the outgoing arguments) and the bottom of the register save area. + This value is always a multiple of STACK_BOUNDARY. */ + poly_int64 bytes_below_saved_regs; + +- /* The size of the callee-save registers with a slot in REG_OFFSET that +- are saved below the hard frame pointer. 
*/ +- poly_int64 below_hard_fp_saved_regs_size; +- + /* The number of bytes between the bottom of the static frame (the bottom + of the outgoing arguments) and the hard frame pointer. This value is + always a multiple of STACK_BOUNDARY. */ +-- +2.34.1 + + +From 75c37e031408262263442f5b4cdb83d3777b6422 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:57 +0100 +Subject: [PATCH 19/19] aarch64: Make stack smash canary protect saved + registers + +AArch64 normally puts the saved registers near the bottom of the frame, +immediately above any dynamic allocations. But this means that a +stack-smash attack on those dynamic allocations could overwrite the +saved registers without needing to reach as far as the stack smash +canary. + +The same thing could also happen for variable-sized arguments that are +passed by value, since those are allocated before a call and popped on +return. + +This patch avoids that by putting the locals (and thus the canary) below +the saved registers when stack smash protection is active. + +The patch fixes CVE-2023-4039. + +gcc/ + * config/aarch64/aarch64.cc (aarch64_save_regs_above_locals_p): + New function. + (aarch64_layout_frame): Use it to decide whether locals should + go above or below the saved registers. + (aarch64_expand_prologue): Update stack layout comment. + Emit a stack tie after the final adjustment. + +gcc/testsuite/ + * gcc.target/aarch64/stack-protector-8.c: New test. + * gcc.target/aarch64/stack-protector-9.c: Likewise. 
+--- + gcc/config/aarch64/aarch64.cc | 46 +++++++-- + .../gcc.target/aarch64/stack-protector-8.c | 95 +++++++++++++++++++ + .../gcc.target/aarch64/stack-protector-9.c | 33 +++++++ + 3 files changed, 168 insertions(+), 6 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-8.c + create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-9.c + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index ac3d3b336a3..96c3f48fdc4 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8133,6 +8133,20 @@ aarch64_needs_frame_chain (void) + return aarch64_use_frame_pointer; + } + ++/* Return true if the current function should save registers above ++ the locals area, rather than below it. */ ++ ++static bool ++aarch64_save_regs_above_locals_p () ++{ ++ /* When using stack smash protection, make sure that the canary slot ++ comes between the locals and the saved registers. Otherwise, ++ it would be possible for a carefully sized smash attack to change ++ the saved registers (particularly LR and FP) without reaching the ++ canary. */ ++ return crtl->stack_protect_guard; ++} ++ + /* Mark the registers that need to be saved by the callee and calculate + the size of the callee-saved registers area and frame record (both FP + and LR may be omitted). 
*/ +@@ -8144,6 +8158,7 @@ aarch64_layout_frame (void) + poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode); + bool frame_related_fp_reg_p = false; + aarch64_frame &frame = cfun->machine->frame; ++ poly_int64 top_of_locals = -1; + + frame.emit_frame_chain = aarch64_needs_frame_chain (); + +@@ -8210,9 +8225,16 @@ aarch64_layout_frame (void) + && !crtl->abi->clobbers_full_reg_p (regno)) + frame.reg_offset[regno] = SLOT_REQUIRED; + ++ bool regs_at_top_p = aarch64_save_regs_above_locals_p (); + + poly_int64 offset = crtl->outgoing_args_size; + gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT)); ++ if (regs_at_top_p) ++ { ++ offset += get_frame_size (); ++ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); ++ top_of_locals = offset; ++ } + frame.bytes_below_saved_regs = offset; + frame.sve_save_and_probe = INVALID_REGNUM; + +@@ -8352,15 +8374,18 @@ aarch64_layout_frame (void) + at expand_prologue. */ + gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0)); + +- offset += get_frame_size (); +- offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); +- auto top_of_locals = offset; +- ++ if (!regs_at_top_p) ++ { ++ offset += get_frame_size (); ++ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); ++ top_of_locals = offset; ++ } + offset += frame.saved_varargs_size; + gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT)); + frame.frame_size = offset; + + frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp; ++ gcc_assert (known_ge (top_of_locals, 0)); + frame.bytes_above_locals = frame.frame_size - top_of_locals; + + frame.initial_adjust = 0; +@@ -9615,10 +9640,10 @@ aarch64_epilogue_uses (int regno) + | for register varargs | + | | + +-------------------------------+ +- | local variables | <-- frame_pointer_rtx ++ | local variables (1) | <-- frame_pointer_rtx + | | + +-------------------------------+ +- | padding | ++ | padding (1) | + 
+-------------------------------+ + | callee-saved registers | + +-------------------------------+ +@@ -9630,6 +9655,10 @@ aarch64_epilogue_uses (int regno) + +-------------------------------+ + | SVE predicate registers | + +-------------------------------+ ++ | local variables (2) | ++ +-------------------------------+ ++ | padding (2) | ++ +-------------------------------+ + | dynamic allocation | + +-------------------------------+ + | padding | +@@ -9639,6 +9668,9 @@ aarch64_epilogue_uses (int regno) + +-------------------------------+ + | | <-- stack_pointer_rtx (aligned) + ++ The regions marked (1) and (2) are mutually exclusive. (2) is used ++ when aarch64_save_regs_above_locals_p is true. ++ + Dynamic stack allocations via alloca() decrease stack_pointer_rtx + but leave frame_pointer_rtx and hard_frame_pointer_rtx + unchanged. +@@ -9834,6 +9866,8 @@ aarch64_expand_prologue (void) + gcc_assert (known_eq (bytes_below_sp, final_adjust)); + aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust, + !frame_pointer_needed, true); ++ if (emit_frame_chain && maybe_ne (final_adjust, 0)) ++ emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx)); + } + + /* Return TRUE if we can use a simple_return insn. 
+diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c +new file mode 100644 +index 00000000000..e71d820e365 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c +@@ -0,0 +1,95 @@ ++/* { dg-options " -O -fstack-protector-strong -mstack-protector-guard=sysreg -mstack-protector-guard-reg=tpidr2_el0 -mstack-protector-guard-offset=16" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++void g(void *); ++__SVBool_t *h(void *); ++ ++/* ++** test1: ++** sub sp, sp, #288 ++** stp x29, x30, \[sp, #?272\] ++** add x29, sp, #?272 ++** mrs (x[0-9]+), tpidr2_el0 ++** ldr (x[0-9]+), \[\1, #?16\] ++** str \2, \[sp, #?264\] ++** mov \2, #?0 ++** add x0, sp, #?8 ++** bl g ++** ... ++** mrs .* ++** ... ++** bne .* ++** ... ++** ldp x29, x30, \[sp, #?272\] ++** add sp, sp, #?288 ++** ret ++** bl __stack_chk_fail ++*/ ++int test1() { ++ int y[0x40]; ++ g(y); ++ return 1; ++} ++ ++/* ++** test2: ++** stp x29, x30, \[sp, #?-16\]! ++** mov x29, sp ++** sub sp, sp, #1040 ++** mrs (x[0-9]+), tpidr2_el0 ++** ldr (x[0-9]+), \[\1, #?16\] ++** str \2, \[sp, #?1032\] ++** mov \2, #?0 ++** add x0, sp, #?8 ++** bl g ++** ... ++** mrs .* ++** ... ++** bne .* ++** ... ++** add sp, sp, #?1040 ++** ldp x29, x30, \[sp\], #?16 ++** ret ++** bl __stack_chk_fail ++*/ ++int test2() { ++ int y[0x100]; ++ g(y); ++ return 1; ++} ++ ++#pragma GCC target "+sve" ++ ++/* ++** test3: ++** stp x29, x30, \[sp, #?-16\]! ++** mov x29, sp ++** addvl sp, sp, #-18 ++** ... ++** str p4, \[sp\] ++** ... ++** sub sp, sp, #272 ++** mrs (x[0-9]+), tpidr2_el0 ++** ldr (x[0-9]+), \[\1, #?16\] ++** str \2, \[sp, #?264\] ++** mov \2, #?0 ++** add x0, sp, #?8 ++** bl h ++** ... ++** mrs .* ++** ... ++** bne .* ++** ... ++** add sp, sp, #?272 ++** ... ++** ldr p4, \[sp\] ++** ... 
++** addvl sp, sp, #18 ++** ldp x29, x30, \[sp\], #?16 ++** ret ++** bl __stack_chk_fail ++*/ ++__SVBool_t test3() { ++ int y[0x40]; ++ return *h(y); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c +new file mode 100644 +index 00000000000..58f322aa480 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c +@@ -0,0 +1,33 @@ ++/* { dg-options "-O2 -mcpu=neoverse-v1 -fstack-protector-all" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++/* ++** main: ++** ... ++** stp x29, x30, \[sp, #?-[0-9]+\]! ++** ... ++** sub sp, sp, #[0-9]+ ++** ... ++** str x[0-9]+, \[x29, #?-8\] ++** ... ++*/ ++int f(const char *); ++void g(void *); ++int main(int argc, char* argv[]) ++{ ++ int a; ++ int b; ++ char c[2+f(argv[1])]; ++ int d[0x100]; ++ char y; ++ ++ y=42; a=4; b=10; ++ c[0] = 'h'; c[1] = '\0'; ++ ++ c[f(argv[2])] = '\0'; ++ ++ __builtin_printf("%d %d\n%s\n", a, b, c); ++ g(d); ++ ++ return 0; ++} +-- +2.34.1 +