From: Richard Sandiford <richard.sandiford@arm.com>
Subject: [PATCH 00/19] aarch64: Fix -fstack-protector issue
Date: Tue, 12 Sep 2023 16:25:10 +0100

This series of patches fixes deficiencies in GCC's -fstack-protector
implementation for AArch64 when using dynamically allocated stack space.
This is CVE-2023-4039. See:

https://developer.arm.com/Arm%20Security%20Center/GCC%20Stack%20Protector%20Vulnerability%20AArch64
https://github.com/metaredteam/external-disclosures/security/advisories/GHSA-x7ch-h5rf-w2mf

for more details.

The fix is to put the saved registers above the locals area when
-fstack-protector is used.

The series also fixes a stack-clash problem that I found while working
on the CVE. In unpatched sources, the stack-clash problem would only
trigger for unrealistic numbers of arguments (8K 64-bit arguments, or an
equivalent). But it would be a more significant issue with the new
-fstack-protector frame layout. It's therefore important that both
problems are fixed together.

Some reorganisation of the code seemed necessary to fix the problems in a
cleanish way. The series is therefore quite long, but only a handful of
patches should have any effect on code generation.

See the individual patches for a detailed description.

Tested on aarch64-linux-gnu. Pushed to trunk and to all active branches.
I've also pushed backports to GCC 7+ to vendors/ARM/heads/CVE-2023-4039.

CVE: CVE-2023-4039
Upstream-Status: Backport
Signed-off-by: Ross Burton <ross.burton@arm.com>


From 71a2aa2127283f450c623d3604dbcabe0e14a8d4 Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandiford@arm.com>
Date: Tue, 12 Sep 2023 16:07:12 +0100
Subject: [PATCH 01/19] aarch64: Use local frame vars in shrink-wrapping code

aarch64_layout_frame uses a shorthand for referring to
cfun->machine->frame:

  aarch64_frame &frame = cfun->machine->frame;

This patch does the same for some other heavy users of the structure.
No functional change intended.

gcc/
	* config/aarch64/aarch64.cc (aarch64_save_callee_saves): Use
	a local shorthand for cfun->machine->frame.
	(aarch64_restore_callee_saves, aarch64_get_separate_components):
	(aarch64_process_components): Likewise.
	(aarch64_allocate_and_probe_stack_space): Likewise.
	(aarch64_expand_prologue, aarch64_expand_epilogue): Likewise.
	(aarch64_layout_frame): Use existing shorthand for one more case.
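For readers unfamiliar with the idiom, the shorthand is just a C++ reference
bound once at the top of the function (an illustrative sketch, not part of
the change itself):

  /* Before: every access spells out the full path.  */
  poly_int64 off = cfun->machine->frame.reg_offset[regno];

  /* After: bind a reference once and reuse it.  */
  aarch64_frame &frame = cfun->machine->frame;
  poly_int64 off = frame.reg_offset[regno];

Since "frame" is a reference rather than a copy, both forms read and write
the same underlying structure, which is why no functional change is
expected.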
59--- 60 gcc/config/aarch64/aarch64.cc | 123 ++++++++++++++++++---------------- 61 1 file changed, 64 insertions(+), 59 deletions(-) 62 63diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc 64index 822a2b49a46..5d473d161d9 100644 65--- a/gcc/config/aarch64/aarch64.cc 66+++ b/gcc/config/aarch64/aarch64.cc 67@@ -8612,7 +8612,7 @@ aarch64_layout_frame (void) 68 frame.is_scs_enabled 69 = (!crtl->calls_eh_return 70 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK) 71- && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)); 72+ && known_ge (frame.reg_offset[LR_REGNUM], 0)); 73 74 /* When shadow call stack is enabled, the scs_pop in the epilogue will 75 restore x30, and we don't need to pop x30 again in the traditional 76@@ -9078,6 +9078,7 @@ aarch64_save_callee_saves (poly_int64 start_offset, 77 unsigned start, unsigned limit, bool skip_wb, 78 bool hard_fp_valid_p) 79 { 80+ aarch64_frame &frame = cfun->machine->frame; 81 rtx_insn *insn; 82 unsigned regno; 83 unsigned regno2; 84@@ -9092,8 +9093,8 @@ aarch64_save_callee_saves (poly_int64 start_offset, 85 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno); 86 87 if (skip_wb 88- && (regno == cfun->machine->frame.wb_push_candidate1 89- || regno == cfun->machine->frame.wb_push_candidate2)) 90+ && (regno == frame.wb_push_candidate1 91+ || regno == frame.wb_push_candidate2)) 92 continue; 93 94 if (cfun->machine->reg_is_wrapped_separately[regno]) 95@@ -9101,7 +9102,7 @@ aarch64_save_callee_saves (poly_int64 start_offset, 96 97 machine_mode mode = aarch64_reg_save_mode (regno); 98 reg = gen_rtx_REG (mode, regno); 99- offset = start_offset + cfun->machine->frame.reg_offset[regno]; 100+ offset = start_offset + frame.reg_offset[regno]; 101 rtx base_rtx = stack_pointer_rtx; 102 poly_int64 sp_offset = offset; 103 104@@ -9114,7 +9115,7 @@ aarch64_save_callee_saves (poly_int64 start_offset, 105 { 106 gcc_assert (known_eq (start_offset, 0)); 107 poly_int64 fp_offset 108- = cfun->machine->frame.below_hard_fp_saved_regs_size; 109+ = frame.below_hard_fp_saved_regs_size; 110 if (hard_fp_valid_p) 111 base_rtx = hard_frame_pointer_rtx; 112 else 113@@ -9136,8 +9137,7 @@ aarch64_save_callee_saves (poly_int64 start_offset, 114 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit 115 && !cfun->machine->reg_is_wrapped_separately[regno2] 116 && known_eq (GET_MODE_SIZE (mode), 117- cfun->machine->frame.reg_offset[regno2] 118- - cfun->machine->frame.reg_offset[regno])) 119+ frame.reg_offset[regno2] - frame.reg_offset[regno])) 120 { 121 rtx reg2 = gen_rtx_REG (mode, regno2); 122 rtx mem2; 123@@ -9187,6 +9187,7 @@ static void 124 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, 125 unsigned limit, bool skip_wb, rtx *cfi_ops) 126 { 127+ aarch64_frame &frame = cfun->machine->frame; 128 unsigned regno; 129 unsigned regno2; 130 poly_int64 offset; 131@@ -9203,13 +9204,13 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, 132 rtx reg, mem; 133 134 if (skip_wb 135- && (regno == cfun->machine->frame.wb_pop_candidate1 136- || regno == cfun->machine->frame.wb_pop_candidate2)) 137+ && (regno == frame.wb_pop_candidate1 138+ || regno == frame.wb_pop_candidate2)) 139 continue; 140 141 machine_mode mode = aarch64_reg_save_mode (regno); 142 reg = gen_rtx_REG (mode, regno); 143- offset = start_offset + cfun->machine->frame.reg_offset[regno]; 144+ offset = start_offset + frame.reg_offset[regno]; 145 rtx base_rtx = stack_pointer_rtx; 146 if (mode == VNx2DImode && BYTES_BIG_ENDIAN) 147 
aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg, 148@@ -9220,8 +9221,7 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, 149 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit 150 && !cfun->machine->reg_is_wrapped_separately[regno2] 151 && known_eq (GET_MODE_SIZE (mode), 152- cfun->machine->frame.reg_offset[regno2] 153- - cfun->machine->frame.reg_offset[regno])) 154+ frame.reg_offset[regno2] - frame.reg_offset[regno])) 155 { 156 rtx reg2 = gen_rtx_REG (mode, regno2); 157 rtx mem2; 158@@ -9326,6 +9326,7 @@ offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset) 159 static sbitmap 160 aarch64_get_separate_components (void) 161 { 162+ aarch64_frame &frame = cfun->machine->frame; 163 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1); 164 bitmap_clear (components); 165 166@@ -9342,18 +9343,18 @@ aarch64_get_separate_components (void) 167 if (mode == VNx2DImode && BYTES_BIG_ENDIAN) 168 continue; 169 170- poly_int64 offset = cfun->machine->frame.reg_offset[regno]; 171+ poly_int64 offset = frame.reg_offset[regno]; 172 173 /* If the register is saved in the first SVE save slot, we use 174 it as a stack probe for -fstack-clash-protection. */ 175 if (flag_stack_clash_protection 176- && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0) 177+ && maybe_ne (frame.below_hard_fp_saved_regs_size, 0) 178 && known_eq (offset, 0)) 179 continue; 180 181 /* Get the offset relative to the register we'll use. */ 182 if (frame_pointer_needed) 183- offset -= cfun->machine->frame.below_hard_fp_saved_regs_size; 184+ offset -= frame.below_hard_fp_saved_regs_size; 185 else 186 offset += crtl->outgoing_args_size; 187 188@@ -9372,11 +9373,11 @@ aarch64_get_separate_components (void) 189 /* If the spare predicate register used by big-endian SVE code 190 is call-preserved, it must be saved in the main prologue 191 before any saves that use it. */ 192- if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM) 193- bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg); 194+ if (frame.spare_pred_reg != INVALID_REGNUM) 195+ bitmap_clear_bit (components, frame.spare_pred_reg); 196 197- unsigned reg1 = cfun->machine->frame.wb_push_candidate1; 198- unsigned reg2 = cfun->machine->frame.wb_push_candidate2; 199+ unsigned reg1 = frame.wb_push_candidate1; 200+ unsigned reg2 = frame.wb_push_candidate2; 201 /* If registers have been chosen to be stored/restored with 202 writeback don't interfere with them to avoid having to output explicit 203 stack adjustment instructions. */ 204@@ -9485,6 +9486,7 @@ aarch64_get_next_set_bit (sbitmap bmp, unsigned int start) 205 static void 206 aarch64_process_components (sbitmap components, bool prologue_p) 207 { 208+ aarch64_frame &frame = cfun->machine->frame; 209 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed 210 ? 
HARD_FRAME_POINTER_REGNUM 211 : STACK_POINTER_REGNUM); 212@@ -9499,9 +9501,9 @@ aarch64_process_components (sbitmap components, bool prologue_p) 213 machine_mode mode = aarch64_reg_save_mode (regno); 214 215 rtx reg = gen_rtx_REG (mode, regno); 216- poly_int64 offset = cfun->machine->frame.reg_offset[regno]; 217+ poly_int64 offset = frame.reg_offset[regno]; 218 if (frame_pointer_needed) 219- offset -= cfun->machine->frame.below_hard_fp_saved_regs_size; 220+ offset -= frame.below_hard_fp_saved_regs_size; 221 else 222 offset += crtl->outgoing_args_size; 223 224@@ -9526,14 +9528,14 @@ aarch64_process_components (sbitmap components, bool prologue_p) 225 break; 226 } 227 228- poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2]; 229+ poly_int64 offset2 = frame.reg_offset[regno2]; 230 /* The next register is not of the same class or its offset is not 231 mergeable with the current one into a pair. */ 232 if (aarch64_sve_mode_p (mode) 233 || !satisfies_constraint_Ump (mem) 234 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2) 235 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno)) 236- || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]), 237+ || maybe_ne ((offset2 - frame.reg_offset[regno]), 238 GET_MODE_SIZE (mode))) 239 { 240 insn = emit_insn (set); 241@@ -9555,7 +9557,7 @@ aarch64_process_components (sbitmap components, bool prologue_p) 242 /* REGNO2 can be saved/restored in a pair with REGNO. */ 243 rtx reg2 = gen_rtx_REG (mode, regno2); 244 if (frame_pointer_needed) 245- offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size; 246+ offset2 -= frame.below_hard_fp_saved_regs_size; 247 else 248 offset2 += crtl->outgoing_args_size; 249 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2); 250@@ -9650,6 +9652,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, 251 bool frame_related_p, 252 bool final_adjustment_p) 253 { 254+ aarch64_frame &frame = cfun->machine->frame; 255 HOST_WIDE_INT guard_size 256 = 1 << param_stack_clash_protection_guard_size; 257 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD; 258@@ -9670,25 +9673,25 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, 259 register as a probe. We can't assume that LR was saved at position 0 260 though, so treat any space below it as unprobed. */ 261 if (final_adjustment_p 262- && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)) 263+ && known_eq (frame.below_hard_fp_saved_regs_size, 0)) 264 { 265- poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM]; 266+ poly_int64 lr_offset = frame.reg_offset[LR_REGNUM]; 267 if (known_ge (lr_offset, 0)) 268 min_probe_threshold -= lr_offset.to_constant (); 269 else 270 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0)); 271 } 272 273- poly_int64 frame_size = cfun->machine->frame.frame_size; 274+ poly_int64 frame_size = frame.frame_size; 275 276 /* We should always have a positive probe threshold. 
*/ 277 gcc_assert (min_probe_threshold > 0); 278 279 if (flag_stack_clash_protection && !final_adjustment_p) 280 { 281- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust; 282- poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; 283- poly_int64 final_adjust = cfun->machine->frame.final_adjust; 284+ poly_int64 initial_adjust = frame.initial_adjust; 285+ poly_int64 sve_callee_adjust = frame.sve_callee_adjust; 286+ poly_int64 final_adjust = frame.final_adjust; 287 288 if (known_eq (frame_size, 0)) 289 { 290@@ -9977,17 +9980,18 @@ aarch64_epilogue_uses (int regno) 291 void 292 aarch64_expand_prologue (void) 293 { 294- poly_int64 frame_size = cfun->machine->frame.frame_size; 295- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust; 296- HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust; 297- poly_int64 final_adjust = cfun->machine->frame.final_adjust; 298- poly_int64 callee_offset = cfun->machine->frame.callee_offset; 299- poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; 300+ aarch64_frame &frame = cfun->machine->frame; 301+ poly_int64 frame_size = frame.frame_size; 302+ poly_int64 initial_adjust = frame.initial_adjust; 303+ HOST_WIDE_INT callee_adjust = frame.callee_adjust; 304+ poly_int64 final_adjust = frame.final_adjust; 305+ poly_int64 callee_offset = frame.callee_offset; 306+ poly_int64 sve_callee_adjust = frame.sve_callee_adjust; 307 poly_int64 below_hard_fp_saved_regs_size 308- = cfun->machine->frame.below_hard_fp_saved_regs_size; 309- unsigned reg1 = cfun->machine->frame.wb_push_candidate1; 310- unsigned reg2 = cfun->machine->frame.wb_push_candidate2; 311- bool emit_frame_chain = cfun->machine->frame.emit_frame_chain; 312+ = frame.below_hard_fp_saved_regs_size; 313+ unsigned reg1 = frame.wb_push_candidate1; 314+ unsigned reg2 = frame.wb_push_candidate2; 315+ bool emit_frame_chain = frame.emit_frame_chain; 316 rtx_insn *insn; 317 318 if (flag_stack_clash_protection && known_eq (callee_adjust, 0)) 319@@ -10018,7 +10022,7 @@ aarch64_expand_prologue (void) 320 } 321 322 /* Push return address to shadow call stack. */ 323- if (cfun->machine->frame.is_scs_enabled) 324+ if (frame.is_scs_enabled) 325 emit_insn (gen_scs_push ()); 326 327 if (flag_stack_usage_info) 328@@ -10057,7 +10061,7 @@ aarch64_expand_prologue (void) 329 330 /* The offset of the frame chain record (if any) from the current SP. */ 331 poly_int64 chain_offset = (initial_adjust + callee_adjust 332- - cfun->machine->frame.hard_fp_offset); 333+ - frame.hard_fp_offset); 334 gcc_assert (known_ge (chain_offset, 0)); 335 336 /* The offset of the bottom of the save area from the current SP. 
*/ 337@@ -10160,16 +10164,17 @@ aarch64_use_return_insn_p (void) 338 void 339 aarch64_expand_epilogue (bool for_sibcall) 340 { 341- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust; 342- HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust; 343- poly_int64 final_adjust = cfun->machine->frame.final_adjust; 344- poly_int64 callee_offset = cfun->machine->frame.callee_offset; 345- poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; 346+ aarch64_frame &frame = cfun->machine->frame; 347+ poly_int64 initial_adjust = frame.initial_adjust; 348+ HOST_WIDE_INT callee_adjust = frame.callee_adjust; 349+ poly_int64 final_adjust = frame.final_adjust; 350+ poly_int64 callee_offset = frame.callee_offset; 351+ poly_int64 sve_callee_adjust = frame.sve_callee_adjust; 352 poly_int64 below_hard_fp_saved_regs_size 353- = cfun->machine->frame.below_hard_fp_saved_regs_size; 354- unsigned reg1 = cfun->machine->frame.wb_pop_candidate1; 355- unsigned reg2 = cfun->machine->frame.wb_pop_candidate2; 356- unsigned int last_gpr = (cfun->machine->frame.is_scs_enabled 357+ = frame.below_hard_fp_saved_regs_size; 358+ unsigned reg1 = frame.wb_pop_candidate1; 359+ unsigned reg2 = frame.wb_pop_candidate2; 360+ unsigned int last_gpr = (frame.is_scs_enabled 361 ? R29_REGNUM : R30_REGNUM); 362 rtx cfi_ops = NULL; 363 rtx_insn *insn; 364@@ -10203,7 +10208,7 @@ aarch64_expand_epilogue (bool for_sibcall) 365 /* We need to add memory barrier to prevent read from deallocated stack. */ 366 bool need_barrier_p 367 = maybe_ne (get_frame_size () 368- + cfun->machine->frame.saved_varargs_size, 0); 369+ + frame.saved_varargs_size, 0); 370 371 /* Emit a barrier to prevent loads from a deallocated stack. */ 372 if (maybe_gt (final_adjust, crtl->outgoing_args_size) 373@@ -10284,7 +10289,7 @@ aarch64_expand_epilogue (bool for_sibcall) 374 } 375 376 /* Pop return address from shadow call stack. */ 377- if (cfun->machine->frame.is_scs_enabled) 378+ if (frame.is_scs_enabled) 379 { 380 machine_mode mode = aarch64_reg_save_mode (R30_REGNUM); 381 rtx reg = gen_rtx_REG (mode, R30_REGNUM); 382@@ -12740,24 +12745,24 @@ aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to) 383 poly_int64 384 aarch64_initial_elimination_offset (unsigned from, unsigned to) 385 { 386+ aarch64_frame &frame = cfun->machine->frame; 387+ 388 if (to == HARD_FRAME_POINTER_REGNUM) 389 { 390 if (from == ARG_POINTER_REGNUM) 391- return cfun->machine->frame.hard_fp_offset; 392+ return frame.hard_fp_offset; 393 394 if (from == FRAME_POINTER_REGNUM) 395- return cfun->machine->frame.hard_fp_offset 396- - cfun->machine->frame.locals_offset; 397+ return frame.hard_fp_offset - frame.locals_offset; 398 } 399 400 if (to == STACK_POINTER_REGNUM) 401 { 402 if (from == FRAME_POINTER_REGNUM) 403- return cfun->machine->frame.frame_size 404- - cfun->machine->frame.locals_offset; 405+ return frame.frame_size - frame.locals_offset; 406 } 407 408- return cfun->machine->frame.frame_size; 409+ return frame.frame_size; 410 } 411 412 413-- 4142.34.1 415 416 417From 89a9fa287706c5011f61926eaf65e7b996b963a3 Mon Sep 17 00:00:00 2001 418From: Richard Sandiford <richard.sandiford@arm.com> 419Date: Tue, 12 Sep 2023 16:07:12 +0100 420Subject: [PATCH 02/19] aarch64: Avoid a use of callee_offset 421 422When we emit the frame chain, i.e. when we reach Here in this statement 423of aarch64_expand_prologue: 424 425 if (emit_frame_chain) 426 { 427 // Here 428 ... 
    }

the stack is in one of two states:

- We've allocated up to the frame chain, but no more.

- We've allocated the whole frame, and the frame chain is within easy
  reach of the new SP.

The offset of the frame chain from the current SP is available
in aarch64_frame as callee_offset. It is also available as the
chain_offset local variable, where the latter is calculated from other
data. (However, chain_offset is not always equal to callee_offset when
!emit_frame_chain, so chain_offset isn't redundant.)

In c600df9a4060da3c6121ff4d0b93f179eafd69d1 I switched to using
chain_offset for the initialisation of the hard frame pointer:

       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
-			  stack_pointer_rtx, callee_offset,
+			  stack_pointer_rtx, chain_offset,
			  tmp1_rtx, tmp0_rtx, frame_pointer_needed);

But the later REG_CFA_ADJUST_CFA handling still used callee_offset.

I think the difference is harmless, but it's more logical for the
CFA note to be in sync, and it's more convenient for later patches
if it uses chain_offset.

gcc/
	* config/aarch64/aarch64.cc (aarch64_expand_prologue): Use
	chain_offset rather than callee_offset.
---
 gcc/config/aarch64/aarch64.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 5d473d161d9..4f233c95140 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -9985,7 +9985,6 @@ aarch64_expand_prologue (void)
   poly_int64 initial_adjust = frame.initial_adjust;
   HOST_WIDE_INT callee_adjust = frame.callee_adjust;
   poly_int64 final_adjust = frame.final_adjust;
-  poly_int64 callee_offset = frame.callee_offset;
   poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
   poly_int64 below_hard_fp_saved_regs_size
     = frame.below_hard_fp_saved_regs_size;
@@ -10098,8 +10097,7 @@ aarch64_expand_prologue (void)
	 implicit.  */
       if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
	{
-	  rtx src = plus_constant (Pmode, stack_pointer_rtx,
-				   callee_offset);
+	  rtx src = plus_constant (Pmode, stack_pointer_rtx, chain_offset);
	  add_reg_note (insn, REG_CFA_ADJUST_CFA,
			gen_rtx_SET (hard_frame_pointer_rtx, src));
	}
-- 
2.34.1


From b36a2a78040722dab6124366c5d6baf8eaf80aef Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandiford@arm.com>
Date: Tue, 12 Sep 2023 16:07:13 +0100
Subject: [PATCH 03/19] aarch64: Explicitly handle frames with no saved
 registers

If a frame has no saved registers, it can be allocated in one go.
There is no need to treat the areas below and above the saved
registers as separate.

And if we allocate the frame in one go, it should be allocated
as the initial_adjust rather than the final_adjust. This allows the
frame size to grow to guard_size - guard_used_by_caller before a stack
probe is needed. (A frame with no register saves is necessarily a
leaf frame.)

This is a no-op as things stand, since a leaf function will have
no outgoing arguments, and so all the frame will be above where
the saved registers normally go.

gcc/
	* config/aarch64/aarch64.cc (aarch64_layout_frame): Explicitly
	allocate the frame in one go if there are no saved registers.
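To put rough numbers on the probing thresholds (an illustrative calculation
assuming the default values, which this patch does not restate):

  guard_size           = 64 KiB  (--param stack-clash-protection-guard-size=16)
  guard_used_by_caller =  1 KiB  (STACK_CLASH_CALLER_GUARD)

  allocated as initial_adjust: no probe needed until the adjustment
                               exceeds 64 KiB - 1 KiB = 63 KiB
  allocated as final_adjust:   probed once the adjustment exceeds 1 KiB

So moving the whole-frame allocation from final_adjust to initial_adjust
lets leaf frames with no saved registers grow much larger before any
probing is required.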
---
 gcc/config/aarch64/aarch64.cc | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 4f233c95140..37643041ffb 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -8639,9 +8639,11 @@ aarch64_layout_frame (void)
 
   HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
   HOST_WIDE_INT const_saved_regs_size;
-  if (frame.frame_size.is_constant (&const_size)
-      && const_size < max_push_offset
-      && known_eq (frame.hard_fp_offset, const_size))
+  if (known_eq (frame.saved_regs_size, 0))
+    frame.initial_adjust = frame.frame_size;
+  else if (frame.frame_size.is_constant (&const_size)
+	   && const_size < max_push_offset
+	   && known_eq (frame.hard_fp_offset, const_size))
     {
       /* Simple, small frame with no outgoing arguments:
 
-- 
2.34.1


From ada2ab0093596be707f23a3466ac82cff59fcffe Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandiford@arm.com>
Date: Tue, 12 Sep 2023 16:07:13 +0100
Subject: [PATCH 04/19] aarch64: Add bytes_below_saved_regs to frame info

The frame layout code currently hard-codes the assumption that
the number of bytes below the saved registers is equal to the
size of the outgoing arguments. This patch abstracts that
value into a new field of aarch64_frame.

gcc/
	* config/aarch64/aarch64.h (aarch64_frame::bytes_below_saved_regs): New
	field.
	* config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize it,
	and use it instead of crtl->outgoing_args_size.
	(aarch64_get_separate_components): Use bytes_below_saved_regs instead
	of outgoing_args_size.
	(aarch64_process_components): Likewise.
---
 gcc/config/aarch64/aarch64.cc | 71 ++++++++++++++++++-----------------
 gcc/config/aarch64/aarch64.h  |  5 +++
 2 files changed, 41 insertions(+), 35 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 37643041ffb..dacc2b0e4dd 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -8478,6 +8478,8 @@ aarch64_layout_frame (void)
   gcc_assert (crtl->is_leaf
	      || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
 
+  frame.bytes_below_saved_regs = crtl->outgoing_args_size;
+
   /* Now assign stack slots for the registers.  Start with the predicate
      registers, since predicate LDR and STR have a relatively small
      offset range.  These saves happen below the hard frame pointer.  */
@@ -8582,18 +8584,18 @@ aarch64_layout_frame (void)
 
   poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
 
-  poly_int64 above_outgoing_args
+  poly_int64 saved_regs_and_above
     = aligned_upper_bound (varargs_and_saved_regs_size
			   + get_frame_size (),
			   STACK_BOUNDARY / BITS_PER_UNIT);
 
   frame.hard_fp_offset
-    = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
+    = saved_regs_and_above - frame.below_hard_fp_saved_regs_size;
 
-  /* Both these values are already aligned.
*/ 592- gcc_assert (multiple_p (crtl->outgoing_args_size, 593+ gcc_assert (multiple_p (frame.bytes_below_saved_regs, 594 STACK_BOUNDARY / BITS_PER_UNIT)); 595- frame.frame_size = above_outgoing_args + crtl->outgoing_args_size; 596+ frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs; 597 598 frame.locals_offset = frame.saved_varargs_size; 599 600@@ -8637,7 +8639,7 @@ aarch64_layout_frame (void) 601 else if (frame.wb_pop_candidate1 != INVALID_REGNUM) 602 max_push_offset = 256; 603 604- HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset; 605+ HOST_WIDE_INT const_size, const_below_saved_regs, const_fp_offset; 606 HOST_WIDE_INT const_saved_regs_size; 607 if (known_eq (frame.saved_regs_size, 0)) 608 frame.initial_adjust = frame.frame_size; 609@@ -8645,31 +8647,31 @@ aarch64_layout_frame (void) 610 && const_size < max_push_offset 611 && known_eq (frame.hard_fp_offset, const_size)) 612 { 613- /* Simple, small frame with no outgoing arguments: 614+ /* Simple, small frame with no data below the saved registers. 615 616 stp reg1, reg2, [sp, -frame_size]! 617 stp reg3, reg4, [sp, 16] */ 618 frame.callee_adjust = const_size; 619 } 620- else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size) 621+ else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs) 622 && frame.saved_regs_size.is_constant (&const_saved_regs_size) 623- && const_outgoing_args_size + const_saved_regs_size < 512 624- /* We could handle this case even with outgoing args, provided 625- that the number of args left us with valid offsets for all 626- predicate and vector save slots. It's such a rare case that 627- it hardly seems worth the effort though. */ 628- && (!saves_below_hard_fp_p || const_outgoing_args_size == 0) 629+ && const_below_saved_regs + const_saved_regs_size < 512 630+ /* We could handle this case even with data below the saved 631+ registers, provided that that data left us with valid offsets 632+ for all predicate and vector save slots. It's such a rare 633+ case that it hardly seems worth the effort though. 
*/ 634+ && (!saves_below_hard_fp_p || const_below_saved_regs == 0) 635 && !(cfun->calls_alloca 636 && frame.hard_fp_offset.is_constant (&const_fp_offset) 637 && const_fp_offset < max_push_offset)) 638 { 639- /* Frame with small outgoing arguments: 640+ /* Frame with small area below the saved registers: 641 642 sub sp, sp, frame_size 643- stp reg1, reg2, [sp, outgoing_args_size] 644- stp reg3, reg4, [sp, outgoing_args_size + 16] */ 645+ stp reg1, reg2, [sp, bytes_below_saved_regs] 646+ stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */ 647 frame.initial_adjust = frame.frame_size; 648- frame.callee_offset = const_outgoing_args_size; 649+ frame.callee_offset = const_below_saved_regs; 650 } 651 else if (saves_below_hard_fp_p 652 && known_eq (frame.saved_regs_size, 653@@ -8679,30 +8681,29 @@ aarch64_layout_frame (void) 654 655 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size 656 save SVE registers relative to SP 657- sub sp, sp, outgoing_args_size */ 658+ sub sp, sp, bytes_below_saved_regs */ 659 frame.initial_adjust = (frame.hard_fp_offset 660 + frame.below_hard_fp_saved_regs_size); 661- frame.final_adjust = crtl->outgoing_args_size; 662+ frame.final_adjust = frame.bytes_below_saved_regs; 663 } 664 else if (frame.hard_fp_offset.is_constant (&const_fp_offset) 665 && const_fp_offset < max_push_offset) 666 { 667- /* Frame with large outgoing arguments or SVE saves, but with 668- a small local area: 669+ /* Frame with large area below the saved registers, or with SVE saves, 670+ but with a small area above: 671 672 stp reg1, reg2, [sp, -hard_fp_offset]! 673 stp reg3, reg4, [sp, 16] 674 [sub sp, sp, below_hard_fp_saved_regs_size] 675 [save SVE registers relative to SP] 676- sub sp, sp, outgoing_args_size */ 677+ sub sp, sp, bytes_below_saved_regs */ 678 frame.callee_adjust = const_fp_offset; 679 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; 680- frame.final_adjust = crtl->outgoing_args_size; 681+ frame.final_adjust = frame.bytes_below_saved_regs; 682 } 683 else 684 { 685- /* Frame with large local area and outgoing arguments or SVE saves, 686- using frame pointer: 687+ /* General case: 688 689 sub sp, sp, hard_fp_offset 690 stp x29, x30, [sp, 0] 691@@ -8710,10 +8711,10 @@ aarch64_layout_frame (void) 692 stp reg3, reg4, [sp, 16] 693 [sub sp, sp, below_hard_fp_saved_regs_size] 694 [save SVE registers relative to SP] 695- sub sp, sp, outgoing_args_size */ 696+ sub sp, sp, bytes_below_saved_regs */ 697 frame.initial_adjust = frame.hard_fp_offset; 698 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; 699- frame.final_adjust = crtl->outgoing_args_size; 700+ frame.final_adjust = frame.bytes_below_saved_regs; 701 } 702 703 /* Make sure the individual adjustments add up to the full frame size. */ 704@@ -9358,7 +9359,7 @@ aarch64_get_separate_components (void) 705 if (frame_pointer_needed) 706 offset -= frame.below_hard_fp_saved_regs_size; 707 else 708- offset += crtl->outgoing_args_size; 709+ offset += frame.bytes_below_saved_regs; 710 711 /* Check that we can access the stack slot of the register with one 712 direct load with no adjustments needed. 
*/ 713@@ -9507,7 +9508,7 @@ aarch64_process_components (sbitmap components, bool prologue_p) 714 if (frame_pointer_needed) 715 offset -= frame.below_hard_fp_saved_regs_size; 716 else 717- offset += crtl->outgoing_args_size; 718+ offset += frame.bytes_below_saved_regs; 719 720 rtx addr = plus_constant (Pmode, ptr_reg, offset); 721 rtx mem = gen_frame_mem (mode, addr); 722@@ -9561,7 +9562,7 @@ aarch64_process_components (sbitmap components, bool prologue_p) 723 if (frame_pointer_needed) 724 offset2 -= frame.below_hard_fp_saved_regs_size; 725 else 726- offset2 += crtl->outgoing_args_size; 727+ offset2 += frame.bytes_below_saved_regs; 728 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2); 729 rtx mem2 = gen_frame_mem (mode, addr2); 730 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2) 731@@ -9635,10 +9636,10 @@ aarch64_stack_clash_protection_alloca_probe_range (void) 732 registers. If POLY_SIZE is not large enough to require a probe this function 733 will only adjust the stack. When allocating the stack space 734 FRAME_RELATED_P is then used to indicate if the allocation is frame related. 735- FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing 736- arguments. If we are then we ensure that any allocation larger than the ABI 737- defined buffer needs a probe so that the invariant of having a 1KB buffer is 738- maintained. 739+ FINAL_ADJUSTMENT_P indicates whether we are allocating the area below 740+ the saved registers. If we are then we ensure that any allocation 741+ larger than the ABI defined buffer needs a probe so that the 742+ invariant of having a 1KB buffer is maintained. 743 744 We emit barriers after each stack adjustment to prevent optimizations from 745 breaking the invariant that we never drop the stack more than a page. This 746@@ -9847,7 +9848,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, 747 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to 748 be probed. This maintains the requirement that each page is probed at 749 least once. For initial probing we probe only if the allocation is 750- more than GUARD_SIZE - buffer, and for the outgoing arguments we probe 751+ more than GUARD_SIZE - buffer, and below the saved registers we probe 752 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer == 753 GUARD_SIZE. This works that for any allocation that is large enough to 754 trigger a probe here, we'll have at least one, and if they're not large 755diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h 756index 73b09e20508..0b6faa3ddf1 100644 757--- a/gcc/config/aarch64/aarch64.h 758+++ b/gcc/config/aarch64/aarch64.h 759@@ -777,6 +777,11 @@ struct GTY (()) aarch64_frame 760 /* The size of the callee-save registers with a slot in REG_OFFSET. */ 761 poly_int64 saved_regs_size; 762 763+ /* The number of bytes between the bottom of the static frame (the bottom 764+ of the outgoing arguments) and the bottom of the register save area. 765+ This value is always a multiple of STACK_BOUNDARY. */ 766+ poly_int64 bytes_below_saved_regs; 767+ 768 /* The size of the callee-save registers with a slot in REG_OFFSET that 769 are saved below the hard frame pointer. 
     */
   poly_int64 below_hard_fp_saved_regs_size;
-- 
2.34.1


From 82f6b3e1b596ef0f4e3ac3bb9c6e88fb4458f402 Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandiford@arm.com>
Date: Tue, 12 Sep 2023 16:07:14 +0100
Subject: [PATCH 05/19] aarch64: Add bytes_below_hard_fp to frame info

Following on from the previous bytes_below_saved_regs patch, this one
records the number of bytes that are below the hard frame pointer.
This eventually replaces below_hard_fp_saved_regs_size.

If a frame pointer is not needed, the epilogue adds final_adjust
to the stack pointer before restoring registers:

     aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);

Therefore, if the epilogue needs to restore the stack pointer from
the hard frame pointer, the directly corresponding offset is:

     -bytes_below_hard_fp + final_adjust

i.e. go from the hard frame pointer to the bottom of the frame,
then add the same amount as if we were using the stack pointer
from the outset.

gcc/
	* config/aarch64/aarch64.h (aarch64_frame::bytes_below_hard_fp): New
	field.
	* config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize it.
	(aarch64_expand_epilogue): Use it instead of
	below_hard_fp_saved_regs_size.
---
 gcc/config/aarch64/aarch64.cc | 6 +++---
 gcc/config/aarch64/aarch64.h  | 5 +++++
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index dacc2b0e4dd..a3f7aabcc59 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -8530,6 +8530,7 @@ aarch64_layout_frame (void)
      of the callee save area.  */
   bool saves_below_hard_fp_p = maybe_ne (offset, 0);
   frame.below_hard_fp_saved_regs_size = offset;
+  frame.bytes_below_hard_fp = offset + frame.bytes_below_saved_regs;
   if (frame.emit_frame_chain)
     {
       /* FP and LR are placed in the linkage record.  */
@@ -10171,8 +10172,7 @@ aarch64_expand_epilogue (bool for_sibcall)
   poly_int64 final_adjust = frame.final_adjust;
   poly_int64 callee_offset = frame.callee_offset;
   poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
-  poly_int64 below_hard_fp_saved_regs_size
-    = frame.below_hard_fp_saved_regs_size;
+  poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
   unsigned reg1 = frame.wb_pop_candidate1;
   unsigned reg2 = frame.wb_pop_candidate2;
   unsigned int last_gpr = (frame.is_scs_enabled
@@ -10230,7 +10230,7 @@ aarch64_expand_epilogue (bool for_sibcall)
	 is restored on the instruction doing the writeback.  */
     aarch64_add_offset (Pmode, stack_pointer_rtx,
			hard_frame_pointer_rtx,
-			-callee_offset - below_hard_fp_saved_regs_size,
+			-bytes_below_hard_fp + final_adjust,
			tmp1_rtx, tmp0_rtx, callee_adjust == 0);
   else
     /* The case where we need to re-use the register here is very rare, so
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 0b6faa3ddf1..4263d29d29d 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -786,6 +786,11 @@ struct GTY (()) aarch64_frame
      are saved below the hard frame pointer.  */
   poly_int64 below_hard_fp_saved_regs_size;
 
+  /* The number of bytes between the bottom of the static frame (the bottom
+     of the outgoing arguments) and the hard frame pointer.  This value is
+     always a multiple of STACK_BOUNDARY.  */
+  poly_int64 bytes_below_hard_fp;
+
   /* Offset from the base of the frame (incomming SP) to the
      top of the locals area.  This value is always a multiple of
      STACK_BOUNDARY.  */
   poly_int64 locals_offset;
-- 
2.34.1


From 86fa43e9fe4a8bf954f2919f07cbe3646d1d1df3 Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandiford@arm.com>
Date: Tue, 12 Sep 2023 16:07:14 +0100
Subject: [PATCH 06/19] aarch64: Tweak aarch64_save/restore_callee_saves

aarch64_save_callee_saves and aarch64_restore_callee_saves took
a parameter called start_offset that gives the offset of the
bottom of the saved register area from the current stack pointer.
However, it's more convenient for later patches if we use the
bottom of the entire frame as the reference point, rather than
the bottom of the saved registers.

Doing that removes the need for the callee_offset field.
Other than that, this is not a win on its own. It only really
makes sense in combination with the follow-on patches.

gcc/
	* config/aarch64/aarch64.h (aarch64_frame::callee_offset): Delete.
	* config/aarch64/aarch64.cc (aarch64_layout_frame): Remove
	callee_offset handling.
	(aarch64_save_callee_saves): Replace the start_offset parameter
	with a bytes_below_sp parameter.
	(aarch64_restore_callee_saves): Likewise.
	(aarch64_expand_prologue): Update accordingly.
	(aarch64_expand_epilogue): Likewise.
---
 gcc/config/aarch64/aarch64.cc | 56 +++++++++++++++++------------------
 gcc/config/aarch64/aarch64.h  |  4 ---
 2 files changed, 28 insertions(+), 32 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index a3f7aabcc59..46ae5cf7673 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -8604,7 +8604,6 @@ aarch64_layout_frame (void)
   frame.final_adjust = 0;
   frame.callee_adjust = 0;
   frame.sve_callee_adjust = 0;
-  frame.callee_offset = 0;
 
   frame.wb_pop_candidate1 = frame.wb_push_candidate1;
   frame.wb_pop_candidate2 = frame.wb_push_candidate2;
@@ -8672,7 +8671,6 @@ aarch64_layout_frame (void)
	   stp reg1, reg2, [sp, bytes_below_saved_regs]
	   stp reg3, reg4, [sp, bytes_below_saved_regs + 16]  */
       frame.initial_adjust = frame.frame_size;
-      frame.callee_offset = const_below_saved_regs;
     }
   else if (saves_below_hard_fp_p
	   && known_eq (frame.saved_regs_size,
@@ -9073,12 +9071,13 @@ aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
 }
 
 /* Emit code to save the callee-saved registers from register number START
-   to LIMIT to the stack at the location starting at offset START_OFFSET,
-   skipping any write-back candidates if SKIP_WB is true.  HARD_FP_VALID_P
-   is true if the hard frame pointer has been set up.  */
+   to LIMIT to the stack.  The stack pointer is currently BYTES_BELOW_SP
+   bytes above the bottom of the static frame.  Skip any write-back
+   candidates if SKIP_WB is true.  HARD_FP_VALID_P is true if the hard
+   frame pointer has been set up.
*/ 921 922 static void 923-aarch64_save_callee_saves (poly_int64 start_offset, 924+aarch64_save_callee_saves (poly_int64 bytes_below_sp, 925 unsigned start, unsigned limit, bool skip_wb, 926 bool hard_fp_valid_p) 927 { 928@@ -9106,7 +9105,9 @@ aarch64_save_callee_saves (poly_int64 start_offset, 929 930 machine_mode mode = aarch64_reg_save_mode (regno); 931 reg = gen_rtx_REG (mode, regno); 932- offset = start_offset + frame.reg_offset[regno]; 933+ offset = (frame.reg_offset[regno] 934+ + frame.bytes_below_saved_regs 935+ - bytes_below_sp); 936 rtx base_rtx = stack_pointer_rtx; 937 poly_int64 sp_offset = offset; 938 939@@ -9117,9 +9118,7 @@ aarch64_save_callee_saves (poly_int64 start_offset, 940 else if (GP_REGNUM_P (regno) 941 && (!offset.is_constant (&const_offset) || const_offset >= 512)) 942 { 943- gcc_assert (known_eq (start_offset, 0)); 944- poly_int64 fp_offset 945- = frame.below_hard_fp_saved_regs_size; 946+ poly_int64 fp_offset = frame.bytes_below_hard_fp - bytes_below_sp; 947 if (hard_fp_valid_p) 948 base_rtx = hard_frame_pointer_rtx; 949 else 950@@ -9183,12 +9182,13 @@ aarch64_save_callee_saves (poly_int64 start_offset, 951 } 952 953 /* Emit code to restore the callee registers from register number START 954- up to and including LIMIT. Restore from the stack offset START_OFFSET, 955- skipping any write-back candidates if SKIP_WB is true. Write the 956- appropriate REG_CFA_RESTORE notes into CFI_OPS. */ 957+ up to and including LIMIT. The stack pointer is currently BYTES_BELOW_SP 958+ bytes above the bottom of the static frame. Skip any write-back 959+ candidates if SKIP_WB is true. Write the appropriate REG_CFA_RESTORE 960+ notes into CFI_OPS. */ 961 962 static void 963-aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, 964+aarch64_restore_callee_saves (poly_int64 bytes_below_sp, unsigned start, 965 unsigned limit, bool skip_wb, rtx *cfi_ops) 966 { 967 aarch64_frame &frame = cfun->machine->frame; 968@@ -9214,7 +9214,9 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, 969 970 machine_mode mode = aarch64_reg_save_mode (regno); 971 reg = gen_rtx_REG (mode, regno); 972- offset = start_offset + frame.reg_offset[regno]; 973+ offset = (frame.reg_offset[regno] 974+ + frame.bytes_below_saved_regs 975+ - bytes_below_sp); 976 rtx base_rtx = stack_pointer_rtx; 977 if (mode == VNx2DImode && BYTES_BIG_ENDIAN) 978 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg, 979@@ -9990,8 +9992,6 @@ aarch64_expand_prologue (void) 980 HOST_WIDE_INT callee_adjust = frame.callee_adjust; 981 poly_int64 final_adjust = frame.final_adjust; 982 poly_int64 sve_callee_adjust = frame.sve_callee_adjust; 983- poly_int64 below_hard_fp_saved_regs_size 984- = frame.below_hard_fp_saved_regs_size; 985 unsigned reg1 = frame.wb_push_candidate1; 986 unsigned reg2 = frame.wb_push_candidate2; 987 bool emit_frame_chain = frame.emit_frame_chain; 988@@ -10067,8 +10067,8 @@ aarch64_expand_prologue (void) 989 - frame.hard_fp_offset); 990 gcc_assert (known_ge (chain_offset, 0)); 991 992- /* The offset of the bottom of the save area from the current SP. */ 993- poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size; 994+ /* The offset of the current SP from the bottom of the static frame. 
*/ 995+ poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust; 996 997 if (emit_frame_chain) 998 { 999@@ -10076,7 +10076,7 @@ aarch64_expand_prologue (void) 1000 { 1001 reg1 = R29_REGNUM; 1002 reg2 = R30_REGNUM; 1003- aarch64_save_callee_saves (saved_regs_offset, reg1, reg2, 1004+ aarch64_save_callee_saves (bytes_below_sp, reg1, reg2, 1005 false, false); 1006 } 1007 else 1008@@ -10116,7 +10116,7 @@ aarch64_expand_prologue (void) 1009 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx)); 1010 } 1011 1012- aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM, 1013+ aarch64_save_callee_saves (bytes_below_sp, R0_REGNUM, R30_REGNUM, 1014 callee_adjust != 0 || emit_frame_chain, 1015 emit_frame_chain); 1016 if (maybe_ne (sve_callee_adjust, 0)) 1017@@ -10126,16 +10126,17 @@ aarch64_expand_prologue (void) 1018 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, 1019 sve_callee_adjust, 1020 !frame_pointer_needed, false); 1021- saved_regs_offset += sve_callee_adjust; 1022+ bytes_below_sp -= sve_callee_adjust; 1023 } 1024- aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM, 1025+ aarch64_save_callee_saves (bytes_below_sp, P0_REGNUM, P15_REGNUM, 1026 false, emit_frame_chain); 1027- aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM, 1028+ aarch64_save_callee_saves (bytes_below_sp, V0_REGNUM, V31_REGNUM, 1029 callee_adjust != 0 || emit_frame_chain, 1030 emit_frame_chain); 1031 1032 /* We may need to probe the final adjustment if it is larger than the guard 1033 that is assumed by the called. */ 1034+ gcc_assert (known_eq (bytes_below_sp, final_adjust)); 1035 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust, 1036 !frame_pointer_needed, true); 1037 } 1038@@ -10170,7 +10171,6 @@ aarch64_expand_epilogue (bool for_sibcall) 1039 poly_int64 initial_adjust = frame.initial_adjust; 1040 HOST_WIDE_INT callee_adjust = frame.callee_adjust; 1041 poly_int64 final_adjust = frame.final_adjust; 1042- poly_int64 callee_offset = frame.callee_offset; 1043 poly_int64 sve_callee_adjust = frame.sve_callee_adjust; 1044 poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp; 1045 unsigned reg1 = frame.wb_pop_candidate1; 1046@@ -10240,9 +10240,9 @@ aarch64_expand_epilogue (bool for_sibcall) 1047 1048 /* Restore the vector registers before the predicate registers, 1049 so that we can use P4 as a temporary for big-endian SVE frames. */ 1050- aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM, 1051+ aarch64_restore_callee_saves (final_adjust, V0_REGNUM, V31_REGNUM, 1052 callee_adjust != 0, &cfi_ops); 1053- aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM, 1054+ aarch64_restore_callee_saves (final_adjust, P0_REGNUM, P15_REGNUM, 1055 false, &cfi_ops); 1056 if (maybe_ne (sve_callee_adjust, 0)) 1057 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true); 1058@@ -10250,7 +10250,7 @@ aarch64_expand_epilogue (bool for_sibcall) 1059 /* When shadow call stack is enabled, the scs_pop in the epilogue will 1060 restore x30, we don't need to restore x30 again in the traditional 1061 way. 
     */
-  aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
+  aarch64_restore_callee_saves (final_adjust + sve_callee_adjust,
				R0_REGNUM, last_gpr,
				callee_adjust != 0, &cfi_ops);
 
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 4263d29d29d..fd820b1be4e 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -813,10 +813,6 @@ struct GTY (()) aarch64_frame
      It is zero when no push is used.  */
   HOST_WIDE_INT callee_adjust;
 
-  /* The offset from SP to the callee-save registers after initial_adjust.
-     It may be non-zero if no push is used (ie. callee_adjust == 0).  */
-  poly_int64 callee_offset;
-
   /* The size of the stack adjustment before saving or after restoring
      SVE registers.  */
   poly_int64 sve_callee_adjust;
-- 
2.34.1


From 8ae9181426f2700c2e5a2909487fa630e6fa406b Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandiford@arm.com>
Date: Tue, 12 Sep 2023 16:07:15 +0100
Subject: [PATCH 07/19] aarch64: Only calculate chain_offset if there is a
 chain

After previous patches, it is no longer necessary to calculate
a chain_offset in cases where there is no chain record.

gcc/
	* config/aarch64/aarch64.cc (aarch64_expand_prologue): Move the
	calculation of chain_offset into the emit_frame_chain block.
---
 gcc/config/aarch64/aarch64.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 46ae5cf7673..0e9b9717c08 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -10062,16 +10062,16 @@ aarch64_expand_prologue (void)
   if (callee_adjust != 0)
     aarch64_push_regs (reg1, reg2, callee_adjust);
 
-  /* The offset of the frame chain record (if any) from the current SP.  */
-  poly_int64 chain_offset = (initial_adjust + callee_adjust
-			     - frame.hard_fp_offset);
-  gcc_assert (known_ge (chain_offset, 0));
-
   /* The offset of the current SP from the bottom of the static frame.  */
   poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;
 
   if (emit_frame_chain)
     {
+      /* The offset of the frame chain record (if any) from the current SP.  */
+      poly_int64 chain_offset = (initial_adjust + callee_adjust
+				 - frame.hard_fp_offset);
+      gcc_assert (known_ge (chain_offset, 0));
+
       if (callee_adjust == 0)
	{
	  reg1 = R29_REGNUM;
-- 
2.34.1


From 375794feb614cee1f41b710b9cc1b6f25da6c1cb Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandiford@arm.com>
Date: Tue, 12 Sep 2023 16:07:15 +0100
Subject: [PATCH 08/19] aarch64: Rename locals_offset to bytes_above_locals
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

locals_offset was described as:

  /* Offset from the base of the frame (incomming SP) to the
     top of the locals area.  This value is always a multiple of
     STACK_BOUNDARY.  */

This is implicitly an “upside down” view of the frame: the incoming
SP is at offset 0, and anything N bytes below the incoming SP is at
offset N (rather than -N).

However, reg_offset instead uses a “right way up” view; that is,
it views offsets in address terms.  Something above X is at a
positive offset from X and something below X is at a negative
offset from X.

Also, even on FRAME_GROWS_DOWNWARD targets like AArch64,
target-independent code views offsets in address terms too:
locals are allocated at negative offsets to virtual_stack_vars.

It seems confusing to have *_offset fields of the same structure
using different polarities like this. This patch tries to avoid
that by renaming locals_offset to bytes_above_locals.

gcc/
	* config/aarch64/aarch64.h (aarch64_frame::locals_offset): Rename to...
	(aarch64_frame::bytes_above_locals): ...this.
	* config/aarch64/aarch64.cc (aarch64_layout_frame)
	(aarch64_initial_elimination_offset): Update accordingly.
---
 gcc/config/aarch64/aarch64.cc | 6 +++---
 gcc/config/aarch64/aarch64.h  | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 0e9b9717c08..0a22f91520e 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -8598,7 +8598,7 @@ aarch64_layout_frame (void)
			  STACK_BOUNDARY / BITS_PER_UNIT));
   frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs;
 
-  frame.locals_offset = frame.saved_varargs_size;
+  frame.bytes_above_locals = frame.saved_varargs_size;
 
   frame.initial_adjust = 0;
   frame.final_adjust = 0;
@@ -12754,13 +12754,13 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to)
	return frame.hard_fp_offset;
 
       if (from == FRAME_POINTER_REGNUM)
-	return frame.hard_fp_offset - frame.locals_offset;
+	return frame.hard_fp_offset - frame.bytes_above_locals;
     }
 
   if (to == STACK_POINTER_REGNUM)
     {
       if (from == FRAME_POINTER_REGNUM)
-	return frame.frame_size - frame.locals_offset;
+	return frame.frame_size - frame.bytes_above_locals;
     }
 
   return frame.frame_size;
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index fd820b1be4e..7ae12d13e2b 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -791,10 +791,10 @@ struct GTY (()) aarch64_frame
      always a multiple of STACK_BOUNDARY.  */
   poly_int64 bytes_below_hard_fp;
 
-  /* Offset from the base of the frame (incomming SP) to the
-     top of the locals area.  This value is always a multiple of
+  /* The number of bytes between the top of the locals area and the top
+     of the frame (the incomming SP).  This value is always a multiple of
      STACK_BOUNDARY.  */
-  poly_int64 locals_offset;
+  poly_int64 bytes_above_locals;
 
   /* Offset from the base of the frame (incomming SP) to the
      hard_frame_pointer.  This value is always a multiple of
      STACK_BOUNDARY.  */
-- 
2.34.1


From 1a9ea1c45c75615ffbfabe652b3598a1d7be2168 Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandiford@arm.com>
Date: Tue, 12 Sep 2023 16:07:16 +0100
Subject: [PATCH 09/19] aarch64: Rename hard_fp_offset to bytes_above_hard_fp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Similarly to the previous locals_offset patch, hard_fp_offset
was described as:

  /* Offset from the base of the frame (incomming SP) to the
     hard_frame_pointer.  This value is always a multiple of
     STACK_BOUNDARY.  */
  poly_int64 hard_fp_offset;

which again took an “upside-down” view: higher offsets meant lower
addresses. This patch renames the field to bytes_above_hard_fp instead.

gcc/
	* config/aarch64/aarch64.h (aarch64_frame::hard_fp_offset): Rename
	to...
	(aarch64_frame::bytes_above_hard_fp): ...this.
	* config/aarch64/aarch64.cc (aarch64_layout_frame)
	(aarch64_expand_prologue): Update accordingly.
	(aarch64_initial_elimination_offset): Likewise.
---
 gcc/config/aarch64/aarch64.cc | 26 +++++++++++++-------------
 gcc/config/aarch64/aarch64.h  |  6 +++---
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 0a22f91520e..95499ae49ba 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -8590,7 +8590,7 @@ aarch64_layout_frame (void)
			   + get_frame_size (),
			   STACK_BOUNDARY / BITS_PER_UNIT);
 
-  frame.hard_fp_offset
+  frame.bytes_above_hard_fp
     = saved_regs_and_above - frame.below_hard_fp_saved_regs_size;
 
   /* Both these values are already aligned.  */
@@ -8639,13 +8639,13 @@ aarch64_layout_frame (void)
   else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
     max_push_offset = 256;
 
-  HOST_WIDE_INT const_size, const_below_saved_regs, const_fp_offset;
+  HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
   HOST_WIDE_INT const_saved_regs_size;
   if (known_eq (frame.saved_regs_size, 0))
     frame.initial_adjust = frame.frame_size;
   else if (frame.frame_size.is_constant (&const_size)
	   && const_size < max_push_offset
-	   && known_eq (frame.hard_fp_offset, const_size))
+	   && known_eq (frame.bytes_above_hard_fp, const_size))
     {
       /* Simple, small frame with no data below the saved registers.
 
@@ -8662,8 +8662,8 @@ aarch64_layout_frame (void)
	 case that it hardly seems worth the effort though.
*/ 1286 && (!saves_below_hard_fp_p || const_below_saved_regs == 0) 1287 && !(cfun->calls_alloca 1288- && frame.hard_fp_offset.is_constant (&const_fp_offset) 1289- && const_fp_offset < max_push_offset)) 1290+ && frame.bytes_above_hard_fp.is_constant (&const_above_fp) 1291+ && const_above_fp < max_push_offset)) 1292 { 1293 /* Frame with small area below the saved registers: 1294 1295@@ -8681,12 +8681,12 @@ aarch64_layout_frame (void) 1296 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size 1297 save SVE registers relative to SP 1298 sub sp, sp, bytes_below_saved_regs */ 1299- frame.initial_adjust = (frame.hard_fp_offset 1300+ frame.initial_adjust = (frame.bytes_above_hard_fp 1301 + frame.below_hard_fp_saved_regs_size); 1302 frame.final_adjust = frame.bytes_below_saved_regs; 1303 } 1304- else if (frame.hard_fp_offset.is_constant (&const_fp_offset) 1305- && const_fp_offset < max_push_offset) 1306+ else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp) 1307+ && const_above_fp < max_push_offset) 1308 { 1309 /* Frame with large area below the saved registers, or with SVE saves, 1310 but with a small area above: 1311@@ -8696,7 +8696,7 @@ aarch64_layout_frame (void) 1312 [sub sp, sp, below_hard_fp_saved_regs_size] 1313 [save SVE registers relative to SP] 1314 sub sp, sp, bytes_below_saved_regs */ 1315- frame.callee_adjust = const_fp_offset; 1316+ frame.callee_adjust = const_above_fp; 1317 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; 1318 frame.final_adjust = frame.bytes_below_saved_regs; 1319 } 1320@@ -8711,7 +8711,7 @@ aarch64_layout_frame (void) 1321 [sub sp, sp, below_hard_fp_saved_regs_size] 1322 [save SVE registers relative to SP] 1323 sub sp, sp, bytes_below_saved_regs */ 1324- frame.initial_adjust = frame.hard_fp_offset; 1325+ frame.initial_adjust = frame.bytes_above_hard_fp; 1326 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; 1327 frame.final_adjust = frame.bytes_below_saved_regs; 1328 } 1329@@ -10069,7 +10069,7 @@ aarch64_expand_prologue (void) 1330 { 1331 /* The offset of the frame chain record (if any) from the current SP. */ 1332 poly_int64 chain_offset = (initial_adjust + callee_adjust 1333- - frame.hard_fp_offset); 1334+ - frame.bytes_above_hard_fp); 1335 gcc_assert (known_ge (chain_offset, 0)); 1336 1337 if (callee_adjust == 0) 1338@@ -12751,10 +12751,10 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to) 1339 if (to == HARD_FRAME_POINTER_REGNUM) 1340 { 1341 if (from == ARG_POINTER_REGNUM) 1342- return frame.hard_fp_offset; 1343+ return frame.bytes_above_hard_fp; 1344 1345 if (from == FRAME_POINTER_REGNUM) 1346- return frame.hard_fp_offset - frame.bytes_above_locals; 1347+ return frame.bytes_above_hard_fp - frame.bytes_above_locals; 1348 } 1349 1350 if (to == STACK_POINTER_REGNUM) 1351diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h 1352index 7ae12d13e2b..3808f49e9ca 100644 1353--- a/gcc/config/aarch64/aarch64.h 1354+++ b/gcc/config/aarch64/aarch64.h 1355@@ -796,10 +796,10 @@ struct GTY (()) aarch64_frame 1356 STACK_BOUNDARY. */ 1357 poly_int64 bytes_above_locals; 1358 1359- /* Offset from the base of the frame (incomming SP) to the 1360- hard_frame_pointer. This value is always a multiple of 1361+ /* The number of bytes between the hard_frame_pointer and the top of 1362+ the frame (the incomming SP). This value is always a multiple of 1363 STACK_BOUNDARY. */ 1364- poly_int64 hard_fp_offset; 1365+ poly_int64 bytes_above_hard_fp; 1366 1367 /* The size of the frame. 
This value is the offset from base of the 1368 frame (incomming SP) to the stack_pointer. This value is always 1369-- 13702.34.1 1371 1372 1373From d202ce1ecf60a36a3e1009917dd76109248ce9be Mon Sep 17 00:00:00 2001 1374From: Richard Sandiford <richard.sandiford@arm.com> 1375Date: Tue, 12 Sep 2023 16:07:16 +0100 1376Subject: [PATCH 10/19] aarch64: Tweak frame_size comment 1377MIME-Version: 1.0 1378Content-Type: text/plain; charset=UTF-8 1379Content-Transfer-Encoding: 8bit 1380 1381This patch fixes another case in which a value was described with 1382an “upside-down” view. 1383 1384gcc/ 1385 * config/aarch64/aarch64.h (aarch64_frame::frame_size): Tweak comment. 1386--- 1387 gcc/config/aarch64/aarch64.h | 4 ++-- 1388 1 file changed, 2 insertions(+), 2 deletions(-) 1389 1390diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h 1391index 3808f49e9ca..108a5731b0d 100644 1392--- a/gcc/config/aarch64/aarch64.h 1393+++ b/gcc/config/aarch64/aarch64.h 1394@@ -801,8 +801,8 @@ struct GTY (()) aarch64_frame 1395 STACK_BOUNDARY. */ 1396 poly_int64 bytes_above_hard_fp; 1397 1398- /* The size of the frame. This value is the offset from base of the 1399- frame (incomming SP) to the stack_pointer. This value is always 1400+ /* The size of the frame, i.e. the number of bytes between the bottom 1401+ of the outgoing arguments and the incoming SP. This value is always 1402 a multiple of STACK_BOUNDARY. */ 1403 poly_int64 frame_size; 1404 1405-- 14062.34.1 1407 1408 1409From f2b585375205b0a1802d79c682ba33766ecd1f0f Mon Sep 17 00:00:00 2001 1410From: Richard Sandiford <richard.sandiford@arm.com> 1411Date: Tue, 12 Sep 2023 16:07:17 +0100 1412Subject: [PATCH 11/19] aarch64: Measure reg_offset from the bottom of the 1413 frame 1414 1415reg_offset was measured from the bottom of the saved register area. 1416This made perfect sense with the original layout, since the bottom 1417of the saved register area was also the hard frame pointer address. 1418It became slightly less obvious with SVE, since we save SVE 1419registers below the hard frame pointer, but it still made sense. 1420 1421However, if we want to allow different frame layouts, it's more 1422convenient and obvious to measure reg_offset from the bottom of 1423the frame. After previous patches, it's also a slight simplification 1424in its own right. 1425 1426gcc/ 1427 * config/aarch64/aarch64.h (aarch64_frame): Add comment above 1428 reg_offset. 1429 * config/aarch64/aarch64.cc (aarch64_layout_frame): Walk offsets 1430 from the bottom of the frame, rather than the bottom of the saved 1431 register area. Measure reg_offset from the bottom of the frame 1432 rather than the bottom of the saved register area. 1433 (aarch64_save_callee_saves): Update accordingly. 1434 (aarch64_restore_callee_saves): Likewise. 1435 (aarch64_get_separate_components): Likewise. 1436 (aarch64_process_components): Likewise. 
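
As an illustrative sketch (the numbers here are invented for this note,
not taken from the patch): with 16 bytes of outgoing arguments and no
SVE saves, a slot that the old scheme recorded as:

  reg_offset[R30_REGNUM] == 0    (bottom of the saved-register area)

is now recorded as:

  reg_offset[R30_REGNUM] == 16   (bytes_below_saved_regs + the old value)

so code such as aarch64_save_callee_saves no longer needs to add
bytes_below_saved_regs itself.
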
1437--- 1438 gcc/config/aarch64/aarch64.cc | 53 ++++++++++++++++------------------- 1439 gcc/config/aarch64/aarch64.h | 3 ++ 1440 2 files changed, 27 insertions(+), 29 deletions(-) 1441 1442diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc 1443index 95499ae49ba..af99807ef8a 100644 1444--- a/gcc/config/aarch64/aarch64.cc 1445+++ b/gcc/config/aarch64/aarch64.cc 1446@@ -8400,7 +8400,6 @@ aarch64_needs_frame_chain (void) 1447 static void 1448 aarch64_layout_frame (void) 1449 { 1450- poly_int64 offset = 0; 1451 int regno, last_fp_reg = INVALID_REGNUM; 1452 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM); 1453 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode); 1454@@ -8478,7 +8477,9 @@ aarch64_layout_frame (void) 1455 gcc_assert (crtl->is_leaf 1456 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED)); 1457 1458- frame.bytes_below_saved_regs = crtl->outgoing_args_size; 1459+ poly_int64 offset = crtl->outgoing_args_size; 1460+ gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT)); 1461+ frame.bytes_below_saved_regs = offset; 1462 1463 /* Now assign stack slots for the registers. Start with the predicate 1464 registers, since predicate LDR and STR have a relatively small 1465@@ -8490,7 +8491,8 @@ aarch64_layout_frame (void) 1466 offset += BYTES_PER_SVE_PRED; 1467 } 1468 1469- if (maybe_ne (offset, 0)) 1470+ poly_int64 saved_prs_size = offset - frame.bytes_below_saved_regs; 1471+ if (maybe_ne (saved_prs_size, 0)) 1472 { 1473 /* If we have any vector registers to save above the predicate registers, 1474 the offset of the vector register save slots need to be a multiple 1475@@ -8508,10 +8510,10 @@ aarch64_layout_frame (void) 1476 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); 1477 else 1478 { 1479- if (known_le (offset, vector_save_size)) 1480- offset = vector_save_size; 1481- else if (known_le (offset, vector_save_size * 2)) 1482- offset = vector_save_size * 2; 1483+ if (known_le (saved_prs_size, vector_save_size)) 1484+ offset = frame.bytes_below_saved_regs + vector_save_size; 1485+ else if (known_le (saved_prs_size, vector_save_size * 2)) 1486+ offset = frame.bytes_below_saved_regs + vector_save_size * 2; 1487 else 1488 gcc_unreachable (); 1489 } 1490@@ -8528,9 +8530,10 @@ aarch64_layout_frame (void) 1491 1492 /* OFFSET is now the offset of the hard frame pointer from the bottom 1493 of the callee save area. */ 1494- bool saves_below_hard_fp_p = maybe_ne (offset, 0); 1495- frame.below_hard_fp_saved_regs_size = offset; 1496- frame.bytes_below_hard_fp = offset + frame.bytes_below_saved_regs; 1497+ frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs; 1498+ bool saves_below_hard_fp_p 1499+ = maybe_ne (frame.below_hard_fp_saved_regs_size, 0); 1500+ frame.bytes_below_hard_fp = offset; 1501 if (frame.emit_frame_chain) 1502 { 1503 /* FP and LR are placed in the linkage record. 
*/ 1504@@ -8581,9 +8584,10 @@ aarch64_layout_frame (void) 1505 1506 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); 1507 1508- frame.saved_regs_size = offset; 1509+ frame.saved_regs_size = offset - frame.bytes_below_saved_regs; 1510 1511- poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size; 1512+ poly_int64 varargs_and_saved_regs_size 1513+ = frame.saved_regs_size + frame.saved_varargs_size; 1514 1515 poly_int64 saved_regs_and_above 1516 = aligned_upper_bound (varargs_and_saved_regs_size 1517@@ -9105,9 +9109,7 @@ aarch64_save_callee_saves (poly_int64 bytes_below_sp, 1518 1519 machine_mode mode = aarch64_reg_save_mode (regno); 1520 reg = gen_rtx_REG (mode, regno); 1521- offset = (frame.reg_offset[regno] 1522- + frame.bytes_below_saved_regs 1523- - bytes_below_sp); 1524+ offset = frame.reg_offset[regno] - bytes_below_sp; 1525 rtx base_rtx = stack_pointer_rtx; 1526 poly_int64 sp_offset = offset; 1527 1528@@ -9214,9 +9216,7 @@ aarch64_restore_callee_saves (poly_int64 bytes_below_sp, unsigned start, 1529 1530 machine_mode mode = aarch64_reg_save_mode (regno); 1531 reg = gen_rtx_REG (mode, regno); 1532- offset = (frame.reg_offset[regno] 1533- + frame.bytes_below_saved_regs 1534- - bytes_below_sp); 1535+ offset = frame.reg_offset[regno] - bytes_below_sp; 1536 rtx base_rtx = stack_pointer_rtx; 1537 if (mode == VNx2DImode && BYTES_BIG_ENDIAN) 1538 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg, 1539@@ -9355,14 +9355,12 @@ aarch64_get_separate_components (void) 1540 it as a stack probe for -fstack-clash-protection. */ 1541 if (flag_stack_clash_protection 1542 && maybe_ne (frame.below_hard_fp_saved_regs_size, 0) 1543- && known_eq (offset, 0)) 1544+ && known_eq (offset, frame.bytes_below_saved_regs)) 1545 continue; 1546 1547 /* Get the offset relative to the register we'll use. */ 1548 if (frame_pointer_needed) 1549- offset -= frame.below_hard_fp_saved_regs_size; 1550- else 1551- offset += frame.bytes_below_saved_regs; 1552+ offset -= frame.bytes_below_hard_fp; 1553 1554 /* Check that we can access the stack slot of the register with one 1555 direct load with no adjustments needed. */ 1556@@ -9509,9 +9507,7 @@ aarch64_process_components (sbitmap components, bool prologue_p) 1557 rtx reg = gen_rtx_REG (mode, regno); 1558 poly_int64 offset = frame.reg_offset[regno]; 1559 if (frame_pointer_needed) 1560- offset -= frame.below_hard_fp_saved_regs_size; 1561- else 1562- offset += frame.bytes_below_saved_regs; 1563+ offset -= frame.bytes_below_hard_fp; 1564 1565 rtx addr = plus_constant (Pmode, ptr_reg, offset); 1566 rtx mem = gen_frame_mem (mode, addr); 1567@@ -9563,9 +9559,7 @@ aarch64_process_components (sbitmap components, bool prologue_p) 1568 /* REGNO2 can be saved/restored in a pair with REGNO. */ 1569 rtx reg2 = gen_rtx_REG (mode, regno2); 1570 if (frame_pointer_needed) 1571- offset2 -= frame.below_hard_fp_saved_regs_size; 1572- else 1573- offset2 += frame.bytes_below_saved_regs; 1574+ offset2 -= frame.bytes_below_hard_fp; 1575 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2); 1576 rtx mem2 = gen_frame_mem (mode, addr2); 1577 rtx set2 = prologue_p ? 
gen_rtx_SET (mem2, reg2) 1578@@ -9681,7 +9675,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, 1579 if (final_adjustment_p 1580 && known_eq (frame.below_hard_fp_saved_regs_size, 0)) 1581 { 1582- poly_int64 lr_offset = frame.reg_offset[LR_REGNUM]; 1583+ poly_int64 lr_offset = (frame.reg_offset[LR_REGNUM] 1584+ - frame.bytes_below_saved_regs); 1585 if (known_ge (lr_offset, 0)) 1586 min_probe_threshold -= lr_offset.to_constant (); 1587 else 1588diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h 1589index 108a5731b0d..c8becb098c8 100644 1590--- a/gcc/config/aarch64/aarch64.h 1591+++ b/gcc/config/aarch64/aarch64.h 1592@@ -766,6 +766,9 @@ extern enum aarch64_processor aarch64_tune; 1593 #ifdef HAVE_POLY_INT_H 1594 struct GTY (()) aarch64_frame 1595 { 1596+ /* The offset from the bottom of the static frame (the bottom of the 1597+ outgoing arguments) of each register save slot, or -2 if no save is 1598+ needed. */ 1599 poly_int64 reg_offset[LAST_SAVED_REGNUM + 1]; 1600 1601 /* The number of extra stack bytes taken up by register varargs. 1602-- 16032.34.1 1604 1605 1606From 79faabda181d0d9fd29a3cf5726ba65bdee945b5 Mon Sep 17 00:00:00 2001 1607From: Richard Sandiford <richard.sandiford@arm.com> 1608Date: Tue, 12 Sep 2023 16:07:17 +0100 1609Subject: [PATCH 12/19] aarch64: Simplify top of frame allocation 1610 1611After previous patches, it no longer really makes sense to allocate 1612the top of the frame in terms of varargs_and_saved_regs_size and 1613saved_regs_and_above. 1614 1615gcc/ 1616 * config/aarch64/aarch64.cc (aarch64_layout_frame): Simplify 1617 the allocation of the top of the frame. 1618--- 1619 gcc/config/aarch64/aarch64.cc | 23 ++++++++--------------- 1620 1 file changed, 8 insertions(+), 15 deletions(-) 1621 1622diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc 1623index af99807ef8a..31b00094c2a 100644 1624--- a/gcc/config/aarch64/aarch64.cc 1625+++ b/gcc/config/aarch64/aarch64.cc 1626@@ -8586,23 +8586,16 @@ aarch64_layout_frame (void) 1627 1628 frame.saved_regs_size = offset - frame.bytes_below_saved_regs; 1629 1630- poly_int64 varargs_and_saved_regs_size 1631- = frame.saved_regs_size + frame.saved_varargs_size; 1632- 1633- poly_int64 saved_regs_and_above 1634- = aligned_upper_bound (varargs_and_saved_regs_size 1635- + get_frame_size (), 1636- STACK_BOUNDARY / BITS_PER_UNIT); 1637- 1638- frame.bytes_above_hard_fp 1639- = saved_regs_and_above - frame.below_hard_fp_saved_regs_size; 1640+ offset += get_frame_size (); 1641+ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); 1642+ auto top_of_locals = offset; 1643 1644- /* Both these values are already aligned. 
*/ 1645- gcc_assert (multiple_p (frame.bytes_below_saved_regs, 1646- STACK_BOUNDARY / BITS_PER_UNIT)); 1647- frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs; 1648+ offset += frame.saved_varargs_size; 1649+ gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT)); 1650+ frame.frame_size = offset; 1651 1652- frame.bytes_above_locals = frame.saved_varargs_size; 1653+ frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp; 1654+ frame.bytes_above_locals = frame.frame_size - top_of_locals; 1655 1656 frame.initial_adjust = 0; 1657 frame.final_adjust = 0; 1658-- 16592.34.1 1660 1661 1662From 4e62049e403b141e6f916176160dac8cbd65fe47 Mon Sep 17 00:00:00 2001 1663From: Richard Sandiford <richard.sandiford@arm.com> 1664Date: Tue, 12 Sep 2023 16:07:18 +0100 1665Subject: [PATCH 13/19] aarch64: Minor initial adjustment tweak 1666 1667This patch just changes a calculation of initial_adjust 1668to one that makes it slightly more obvious that the total 1669adjustment is frame.frame_size. 1670 1671gcc/ 1672 * config/aarch64/aarch64.cc (aarch64_layout_frame): Tweak 1673 calculation of initial_adjust for frames in which all saves 1674 are SVE saves. 1675--- 1676 gcc/config/aarch64/aarch64.cc | 5 ++--- 1677 1 file changed, 2 insertions(+), 3 deletions(-) 1678 1679diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc 1680index 31b00094c2a..1aa79da0673 100644 1681--- a/gcc/config/aarch64/aarch64.cc 1682+++ b/gcc/config/aarch64/aarch64.cc 1683@@ -8675,11 +8675,10 @@ aarch64_layout_frame (void) 1684 { 1685 /* Frame in which all saves are SVE saves: 1686 1687- sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size 1688+ sub sp, sp, frame_size - bytes_below_saved_regs 1689 save SVE registers relative to SP 1690 sub sp, sp, bytes_below_saved_regs */ 1691- frame.initial_adjust = (frame.bytes_above_hard_fp 1692- + frame.below_hard_fp_saved_regs_size); 1693+ frame.initial_adjust = frame.frame_size - frame.bytes_below_saved_regs; 1694 frame.final_adjust = frame.bytes_below_saved_regs; 1695 } 1696 else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp) 1697-- 16982.34.1 1699 1700 1701From aaa1a0a5912d9e5d571e5f1c6f09ceac99544ab5 Mon Sep 17 00:00:00 2001 1702From: Richard Sandiford <richard.sandiford@arm.com> 1703Date: Tue, 12 Sep 2023 16:07:18 +0100 1704Subject: [PATCH 14/19] aarch64: Tweak stack clash boundary condition 1705 1706The AArch64 ABI says that, when stack clash protection is used, 1707there can be a maximum of 1KiB of unprobed space at sp on entry 1708to a function. Therefore, we need to probe when allocating 1709>= guard_size - 1KiB of data (>= rather than >). This is what 1710GCC does. 1711 1712If an allocation is exactly guard_size bytes, it is enough to allocate 1713those bytes and probe once at offset 1024. It isn't possible to use a 1714single probe at any other offset: higher would conmplicate later code, 1715by leaving more unprobed space than usual, while lower would risk 1716leaving an entire page unprobed. For simplicity, the code probes all 1717allocations at offset 1024. 1718 1719Some register saves also act as probes. If we need to allocate 1720more space below the last such register save probe, we need to 1721probe the allocation if it is > 1KiB. Again, this allocation is 1722then sometimes (but not always) probed at offset 1024. This sort of 1723allocation is currently only used for outgoing arguments, which are 1724rarely this big. 
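
As a worked example, with the 4KiB guard size used by the new test
(--param stack-clash-protection-guard-size=12) and the usual 1KiB
caller guard:

  initial allocation:              probe needed once the size reaches
                                   4096 - 1024 = 3072 bytes
  final outgoing-args allocation:  probe needed only when the size is
                                   greater than 1024 bytes, so the
                                   1040-byte allocation in the new test
                                   is probed while the 1024-byte one
                                   is not
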
1725 1726However, the code also probed if this final outgoing-arguments 1727allocation was == 1KiB, rather than just > 1KiB. This isn't 1728necessary, since the register save then probes at offset 1024 1729as required. Continuing to probe allocations of exactly 1KiB 1730would complicate later patches. 1731 1732gcc/ 1733 * config/aarch64/aarch64.cc (aarch64_allocate_and_probe_stack_space): 1734 Don't probe final allocations that are exactly 1KiB in size (after 1735 unprobed space above the final allocation has been deducted). 1736 1737gcc/testsuite/ 1738 * gcc.target/aarch64/stack-check-prologue-17.c: New test. 1739--- 1740 gcc/config/aarch64/aarch64.cc | 4 +- 1741 .../aarch64/stack-check-prologue-17.c | 55 +++++++++++++++++++ 1742 2 files changed, 58 insertions(+), 1 deletion(-) 1743 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c 1744 1745diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc 1746index 1aa79da0673..5cad847977a 100644 1747--- a/gcc/config/aarch64/aarch64.cc 1748+++ b/gcc/config/aarch64/aarch64.cc 1749@@ -9648,9 +9648,11 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, 1750 HOST_WIDE_INT guard_size 1751 = 1 << param_stack_clash_protection_guard_size; 1752 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD; 1753+ HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT; 1754+ gcc_assert (multiple_p (poly_size, byte_sp_alignment)); 1755 HOST_WIDE_INT min_probe_threshold 1756 = (final_adjustment_p 1757- ? guard_used_by_caller 1758+ ? guard_used_by_caller + byte_sp_alignment 1759 : guard_size - guard_used_by_caller); 1760 /* When doing the final adjustment for the outgoing arguments, take into 1761 account any unprobed space there is above the current SP. There are 1762diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c 1763new file mode 100644 1764index 00000000000..0d8a25d73a2 1765--- /dev/null 1766+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c 1767@@ -0,0 +1,55 @@ 1768+/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12" } */ 1769+/* { dg-final { check-function-bodies "**" "" } } */ 1770+ 1771+void f(int, ...); 1772+void g(); 1773+ 1774+/* 1775+** test1: 1776+** ... 1777+** str x30, \[sp\] 1778+** sub sp, sp, #1024 1779+** cbnz w0, .* 1780+** bl g 1781+** ... 1782+*/ 1783+int test1(int z) { 1784+ __uint128_t x = 0; 1785+ int y[0x400]; 1786+ if (z) 1787+ { 1788+ f(0, 0, 0, 0, 0, 0, 0, &y, 1789+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 1790+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 1791+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 1792+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); 1793+ } 1794+ g(); 1795+ return 1; 1796+} 1797+ 1798+/* 1799+** test2: 1800+** ... 1801+** str x30, \[sp\] 1802+** sub sp, sp, #1040 1803+** str xzr, \[sp\] 1804+** cbnz w0, .* 1805+** bl g 1806+** ... 
1807+*/ 1808+int test2(int z) { 1809+ __uint128_t x = 0; 1810+ int y[0x400]; 1811+ if (z) 1812+ { 1813+ f(0, 0, 0, 0, 0, 0, 0, &y, 1814+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 1815+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 1816+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 1817+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 1818+ x); 1819+ } 1820+ g(); 1821+ return 1; 1822+} 1823-- 18242.34.1 1825 1826 1827From 8433953434a7b58c0923140d39eb3c5988c1d097 Mon Sep 17 00:00:00 2001 1828From: Richard Sandiford <richard.sandiford@arm.com> 1829Date: Tue, 12 Sep 2023 16:07:19 +0100 1830Subject: [PATCH 15/19] aarch64: Put LR save probe in first 16 bytes 1831 1832-fstack-clash-protection uses the save of LR as a probe for the next 1833allocation. The next allocation could be: 1834 1835* another part of the static frame, e.g. when allocating SVE save slots 1836 or outgoing arguments 1837 1838* an alloca in the same function 1839 1840* an allocation made by a callee function 1841 1842However, when -fomit-frame-pointer is used, the LR save slot is placed 1843above the other GPR save slots. It could therefore be up to 80 bytes 1844above the base of the GPR save area (which is also the hard fp address). 1845 1846aarch64_allocate_and_probe_stack_space took this into account when 1847deciding how much subsequent space could be allocated without needing 1848a probe. However, it interacted badly with: 1849 1850 /* If doing a small final adjustment, we always probe at offset 0. 1851 This is done to avoid issues when LR is not at position 0 or when 1852 the final adjustment is smaller than the probing offset. */ 1853 else if (final_adjustment_p && rounded_size == 0) 1854 residual_probe_offset = 0; 1855 1856which forces any allocation that is smaller than the guard page size 1857to be probed at offset 0 rather than the usual offset 1024. It was 1858therefore possible to construct cases in which we had: 1859 1860* a probe using LR at SP + 80 bytes (or some other value >= 16) 1861* an allocation of the guard page size - 16 bytes 1862* a probe at SP + 0 1863 1864which allocates guard page size + 64 consecutive unprobed bytes. 1865 1866This patch requires the LR probe to be in the first 16 bytes of the 1867save area when stack clash protection is active. Doing it 1868unconditionally would cause code-quality regressions. 1869 1870Putting LR before other registers prevents push/pop allocation 1871when shadow call stacks are enabled, since LR is restored 1872separately from the other callee-saved registers. 1873 1874The new comment doesn't say that the probe register is required 1875to be LR, since a later patch removes that restriction. 1876 1877gcc/ 1878 * config/aarch64/aarch64.cc (aarch64_layout_frame): Ensure that 1879 the LR save slot is in the first 16 bytes of the register save area. 1880 Only form STP/LDP push/pop candidates if both registers are valid. 1881 (aarch64_allocate_and_probe_stack_space): Remove workaround for 1882 when LR was not in the first 16 bytes. 1883 1884gcc/testsuite/ 1885 * gcc.target/aarch64/stack-check-prologue-18.c: New test. 1886 * gcc.target/aarch64/stack-check-prologue-19.c: Likewise. 1887 * gcc.target/aarch64/stack-check-prologue-20.c: Likewise. 
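
To make the unprobed-gap arithmetic above concrete (illustrative
numbers, assuming a 4KiB guard page): with LR saved at SP + 80 and a
later allocation of 4096 - 16 = 4080 bytes that is probed at its new
SP + 0, the two probes are separated by:

  80 + 4080 = 4160 = 4096 + 64 bytes

of consecutive unprobed stack, which exceeds the guard page.
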
1888--- 1889 gcc/config/aarch64/aarch64.cc | 72 ++++++------- 1890 .../aarch64/stack-check-prologue-18.c | 100 ++++++++++++++++++ 1891 .../aarch64/stack-check-prologue-19.c | 100 ++++++++++++++++++ 1892 .../aarch64/stack-check-prologue-20.c | 3 + 1893 4 files changed, 233 insertions(+), 42 deletions(-) 1894 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c 1895 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c 1896 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c 1897 1898diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc 1899index 5cad847977a..a765f92329d 100644 1900--- a/gcc/config/aarch64/aarch64.cc 1901+++ b/gcc/config/aarch64/aarch64.cc 1902@@ -8534,26 +8534,34 @@ aarch64_layout_frame (void) 1903 bool saves_below_hard_fp_p 1904 = maybe_ne (frame.below_hard_fp_saved_regs_size, 0); 1905 frame.bytes_below_hard_fp = offset; 1906+ 1907+ auto allocate_gpr_slot = [&](unsigned int regno) 1908+ { 1909+ frame.reg_offset[regno] = offset; 1910+ if (frame.wb_push_candidate1 == INVALID_REGNUM) 1911+ frame.wb_push_candidate1 = regno; 1912+ else if (frame.wb_push_candidate2 == INVALID_REGNUM) 1913+ frame.wb_push_candidate2 = regno; 1914+ offset += UNITS_PER_WORD; 1915+ }; 1916+ 1917 if (frame.emit_frame_chain) 1918 { 1919 /* FP and LR are placed in the linkage record. */ 1920- frame.reg_offset[R29_REGNUM] = offset; 1921- frame.wb_push_candidate1 = R29_REGNUM; 1922- frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD; 1923- frame.wb_push_candidate2 = R30_REGNUM; 1924- offset += 2 * UNITS_PER_WORD; 1925+ allocate_gpr_slot (R29_REGNUM); 1926+ allocate_gpr_slot (R30_REGNUM); 1927 } 1928+ else if (flag_stack_clash_protection 1929+ && known_eq (frame.reg_offset[R30_REGNUM], SLOT_REQUIRED)) 1930+ /* Put the LR save slot first, since it makes a good choice of probe 1931+ for stack clash purposes. The idea is that the link register usually 1932+ has to be saved before a call anyway, and so we lose little by 1933+ stopping it from being individually shrink-wrapped. */ 1934+ allocate_gpr_slot (R30_REGNUM); 1935 1936 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++) 1937 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) 1938- { 1939- frame.reg_offset[regno] = offset; 1940- if (frame.wb_push_candidate1 == INVALID_REGNUM) 1941- frame.wb_push_candidate1 = regno; 1942- else if (frame.wb_push_candidate2 == INVALID_REGNUM) 1943- frame.wb_push_candidate2 = regno; 1944- offset += UNITS_PER_WORD; 1945- } 1946+ allocate_gpr_slot (regno); 1947 1948 poly_int64 max_int_offset = offset; 1949 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); 1950@@ -8631,10 +8639,13 @@ aarch64_layout_frame (void) 1951 max_push_offset to 0, because no registers are popped at this time, 1952 so callee_adjust cannot be adjusted. */ 1953 HOST_WIDE_INT max_push_offset = 0; 1954- if (frame.wb_pop_candidate2 != INVALID_REGNUM) 1955- max_push_offset = 512; 1956- else if (frame.wb_pop_candidate1 != INVALID_REGNUM) 1957- max_push_offset = 256; 1958+ if (frame.wb_pop_candidate1 != INVALID_REGNUM) 1959+ { 1960+ if (frame.wb_pop_candidate2 != INVALID_REGNUM) 1961+ max_push_offset = 512; 1962+ else 1963+ max_push_offset = 256; 1964+ } 1965 1966 HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp; 1967 HOST_WIDE_INT const_saved_regs_size; 1968@@ -9654,29 +9665,6 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, 1969 = (final_adjustment_p 1970 ? 
guard_used_by_caller + byte_sp_alignment 1971 : guard_size - guard_used_by_caller); 1972- /* When doing the final adjustment for the outgoing arguments, take into 1973- account any unprobed space there is above the current SP. There are 1974- two cases: 1975- 1976- - When saving SVE registers below the hard frame pointer, we force 1977- the lowest save to take place in the prologue before doing the final 1978- adjustment (i.e. we don't allow the save to be shrink-wrapped). 1979- This acts as a probe at SP, so there is no unprobed space. 1980- 1981- - When there are no SVE register saves, we use the store of the link 1982- register as a probe. We can't assume that LR was saved at position 0 1983- though, so treat any space below it as unprobed. */ 1984- if (final_adjustment_p 1985- && known_eq (frame.below_hard_fp_saved_regs_size, 0)) 1986- { 1987- poly_int64 lr_offset = (frame.reg_offset[LR_REGNUM] 1988- - frame.bytes_below_saved_regs); 1989- if (known_ge (lr_offset, 0)) 1990- min_probe_threshold -= lr_offset.to_constant (); 1991- else 1992- gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0)); 1993- } 1994- 1995 poly_int64 frame_size = frame.frame_size; 1996 1997 /* We should always have a positive probe threshold. */ 1998@@ -9856,8 +9844,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, 1999 if (final_adjustment_p && rounded_size != 0) 2000 min_probe_threshold = 0; 2001 /* If doing a small final adjustment, we always probe at offset 0. 2002- This is done to avoid issues when LR is not at position 0 or when 2003- the final adjustment is smaller than the probing offset. */ 2004+ This is done to avoid issues when the final adjustment is smaller 2005+ than the probing offset. */ 2006 else if (final_adjustment_p && rounded_size == 0) 2007 residual_probe_offset = 0; 2008 2009diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c 2010new file mode 100644 2011index 00000000000..82447d20fff 2012--- /dev/null 2013+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c 2014@@ -0,0 +1,100 @@ 2015+/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12" } */ 2016+/* { dg-final { check-function-bodies "**" "" } } */ 2017+ 2018+void f(int, ...); 2019+void g(); 2020+ 2021+/* 2022+** test1: 2023+** ... 2024+** str x30, \[sp\] 2025+** sub sp, sp, #4064 2026+** str xzr, \[sp\] 2027+** cbnz w0, .* 2028+** bl g 2029+** ... 2030+** str x26, \[sp, #?4128\] 2031+** ... 
2032+*/ 2033+int test1(int z) { 2034+ __uint128_t x = 0; 2035+ int y[0x400]; 2036+ if (z) 2037+ { 2038+ asm volatile ("" ::: 2039+ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"); 2040+ f(0, 0, 0, 0, 0, 0, 0, &y, 2041+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2042+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2043+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2044+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2045+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2046+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2047+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2048+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2049+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2050+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2051+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2052+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2053+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2054+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2055+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2056+ x, x, x, x, x, x, x, x, x, x, x, x, x, x); 2057+ } 2058+ g(); 2059+ return 1; 2060+} 2061+ 2062+/* 2063+** test2: 2064+** ... 2065+** str x30, \[sp\] 2066+** sub sp, sp, #1040 2067+** str xzr, \[sp\] 2068+** cbnz w0, .* 2069+** bl g 2070+** ... 2071+*/ 2072+int test2(int z) { 2073+ __uint128_t x = 0; 2074+ int y[0x400]; 2075+ if (z) 2076+ { 2077+ asm volatile ("" ::: 2078+ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"); 2079+ f(0, 0, 0, 0, 0, 0, 0, &y, 2080+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2081+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2082+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2083+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2084+ x); 2085+ } 2086+ g(); 2087+ return 1; 2088+} 2089+ 2090+/* 2091+** test3: 2092+** ... 2093+** str x30, \[sp\] 2094+** sub sp, sp, #1024 2095+** cbnz w0, .* 2096+** bl g 2097+** ... 2098+*/ 2099+int test3(int z) { 2100+ __uint128_t x = 0; 2101+ int y[0x400]; 2102+ if (z) 2103+ { 2104+ asm volatile ("" ::: 2105+ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"); 2106+ f(0, 0, 0, 0, 0, 0, 0, &y, 2107+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2108+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2109+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2110+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); 2111+ } 2112+ g(); 2113+ return 1; 2114+} 2115diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c 2116new file mode 100644 2117index 00000000000..73ac3e4e4eb 2118--- /dev/null 2119+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c 2120@@ -0,0 +1,100 @@ 2121+/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12 -fsanitize=shadow-call-stack -ffixed-x18" } */ 2122+/* { dg-final { check-function-bodies "**" "" } } */ 2123+ 2124+void f(int, ...); 2125+void g(); 2126+ 2127+/* 2128+** test1: 2129+** ... 2130+** str x30, \[sp\] 2131+** sub sp, sp, #4064 2132+** str xzr, \[sp\] 2133+** cbnz w0, .* 2134+** bl g 2135+** ... 2136+** str x26, \[sp, #?4128\] 2137+** ... 
2138+*/ 2139+int test1(int z) { 2140+ __uint128_t x = 0; 2141+ int y[0x400]; 2142+ if (z) 2143+ { 2144+ asm volatile ("" ::: 2145+ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"); 2146+ f(0, 0, 0, 0, 0, 0, 0, &y, 2147+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2148+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2149+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2150+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2151+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2152+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2153+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2154+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2155+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2156+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2157+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2158+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2159+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2160+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2161+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2162+ x, x, x, x, x, x, x, x, x, x, x, x, x, x); 2163+ } 2164+ g(); 2165+ return 1; 2166+} 2167+ 2168+/* 2169+** test2: 2170+** ... 2171+** str x30, \[sp\] 2172+** sub sp, sp, #1040 2173+** str xzr, \[sp\] 2174+** cbnz w0, .* 2175+** bl g 2176+** ... 2177+*/ 2178+int test2(int z) { 2179+ __uint128_t x = 0; 2180+ int y[0x400]; 2181+ if (z) 2182+ { 2183+ asm volatile ("" ::: 2184+ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"); 2185+ f(0, 0, 0, 0, 0, 0, 0, &y, 2186+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2187+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2188+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2189+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2190+ x); 2191+ } 2192+ g(); 2193+ return 1; 2194+} 2195+ 2196+/* 2197+** test3: 2198+** ... 2199+** str x30, \[sp\] 2200+** sub sp, sp, #1024 2201+** cbnz w0, .* 2202+** bl g 2203+** ... 2204+*/ 2205+int test3(int z) { 2206+ __uint128_t x = 0; 2207+ int y[0x400]; 2208+ if (z) 2209+ { 2210+ asm volatile ("" ::: 2211+ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"); 2212+ f(0, 0, 0, 0, 0, 0, 0, &y, 2213+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2214+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2215+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, 2216+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); 2217+ } 2218+ g(); 2219+ return 1; 2220+} 2221diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c 2222new file mode 100644 2223index 00000000000..690aae8dfd5 2224--- /dev/null 2225+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c 2226@@ -0,0 +1,3 @@ 2227+/* { dg-options "-O2 -fstack-protector-all -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12 -fsanitize=shadow-call-stack -ffixed-x18" } */ 2228+ 2229+#include "stack-check-prologue-19.c" 2230-- 22312.34.1 2232 2233 2234From eea1759073e09dd1aefbc9a881601ab1eebfdd18 Mon Sep 17 00:00:00 2001 2235From: Richard Sandiford <richard.sandiford@arm.com> 2236Date: Tue, 12 Sep 2023 16:07:19 +0100 2237Subject: [PATCH 16/19] aarch64: Simplify probe of final frame allocation 2238 2239Previous patches ensured that the final frame allocation only needs 2240a probe when the size is strictly greater than 1KiB. It's therefore 2241safe to use the normal 1024 probe offset in all cases. 2242 2243The main motivation for doing this is to simplify the code and 2244remove the number of special cases. 
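
For example, the smallest final allocation that needs a probe under
16-byte SP alignment is 1040 bytes.  Probing it at offset 1024:

  sub sp, sp, #1040
  str xzr, [sp, #1024]

keeps the probe inside the allocation and leaves exactly the 1KiB of
unprobed space below it that the AArch64 ABI permits, which is what the
updated tests now expect.
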
2245 2246gcc/ 2247 * config/aarch64/aarch64.cc (aarch64_allocate_and_probe_stack_space): 2248 Always probe the residual allocation at offset 1024, asserting 2249 that that is in range. 2250 2251gcc/testsuite/ 2252 * gcc.target/aarch64/stack-check-prologue-17.c: Expect the probe 2253 to be at offset 1024 rather than offset 0. 2254 * gcc.target/aarch64/stack-check-prologue-18.c: Likewise. 2255 * gcc.target/aarch64/stack-check-prologue-19.c: Likewise. 2256--- 2257 gcc/config/aarch64/aarch64.cc | 12 ++++-------- 2258 .../gcc.target/aarch64/stack-check-prologue-17.c | 2 +- 2259 .../gcc.target/aarch64/stack-check-prologue-18.c | 4 ++-- 2260 .../gcc.target/aarch64/stack-check-prologue-19.c | 4 ++-- 2261 4 files changed, 9 insertions(+), 13 deletions(-) 2262 2263diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc 2264index a765f92329d..37809a306f7 100644 2265--- a/gcc/config/aarch64/aarch64.cc 2266+++ b/gcc/config/aarch64/aarch64.cc 2267@@ -9838,16 +9838,12 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, 2268 are still safe. */ 2269 if (residual) 2270 { 2271- HOST_WIDE_INT residual_probe_offset = guard_used_by_caller; 2272+ gcc_assert (guard_used_by_caller + byte_sp_alignment <= size); 2273+ 2274 /* If we're doing final adjustments, and we've done any full page 2275 allocations then any residual needs to be probed. */ 2276 if (final_adjustment_p && rounded_size != 0) 2277 min_probe_threshold = 0; 2278- /* If doing a small final adjustment, we always probe at offset 0. 2279- This is done to avoid issues when the final adjustment is smaller 2280- than the probing offset. */ 2281- else if (final_adjustment_p && rounded_size == 0) 2282- residual_probe_offset = 0; 2283 2284 aarch64_sub_sp (temp1, temp2, residual, frame_related_p); 2285 if (residual >= min_probe_threshold) 2286@@ -9858,8 +9854,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, 2287 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required." 2288 "\n", residual); 2289 2290- emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, 2291- residual_probe_offset)); 2292+ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, 2293+ guard_used_by_caller)); 2294 emit_insn (gen_blockage ()); 2295 } 2296 } 2297diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c 2298index 0d8a25d73a2..f0ec1389771 100644 2299--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c 2300+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c 2301@@ -33,7 +33,7 @@ int test1(int z) { 2302 ** ... 2303 ** str x30, \[sp\] 2304 ** sub sp, sp, #1040 2305-** str xzr, \[sp\] 2306+** str xzr, \[sp, #?1024\] 2307 ** cbnz w0, .* 2308 ** bl g 2309 ** ... 2310diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c 2311index 82447d20fff..6383bec5ebc 100644 2312--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c 2313+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c 2314@@ -9,7 +9,7 @@ void g(); 2315 ** ... 2316 ** str x30, \[sp\] 2317 ** sub sp, sp, #4064 2318-** str xzr, \[sp\] 2319+** str xzr, \[sp, #?1024\] 2320 ** cbnz w0, .* 2321 ** bl g 2322 ** ... 2323@@ -50,7 +50,7 @@ int test1(int z) { 2324 ** ... 2325 ** str x30, \[sp\] 2326 ** sub sp, sp, #1040 2327-** str xzr, \[sp\] 2328+** str xzr, \[sp, #?1024\] 2329 ** cbnz w0, .* 2330 ** bl g 2331 ** ... 
2332diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c 2333index 73ac3e4e4eb..562039b5e9b 100644 2334--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c 2335+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c 2336@@ -9,7 +9,7 @@ void g(); 2337 ** ... 2338 ** str x30, \[sp\] 2339 ** sub sp, sp, #4064 2340-** str xzr, \[sp\] 2341+** str xzr, \[sp, #?1024\] 2342 ** cbnz w0, .* 2343 ** bl g 2344 ** ... 2345@@ -50,7 +50,7 @@ int test1(int z) { 2346 ** ... 2347 ** str x30, \[sp\] 2348 ** sub sp, sp, #1040 2349-** str xzr, \[sp\] 2350+** str xzr, \[sp, #?1024\] 2351 ** cbnz w0, .* 2352 ** bl g 2353 ** ... 2354-- 23552.34.1 2356 2357 2358From 96d85187c3b9c9a7efc2fd698c3d452e80d8aa47 Mon Sep 17 00:00:00 2001 2359From: Richard Sandiford <richard.sandiford@arm.com> 2360Date: Tue, 12 Sep 2023 16:07:20 +0100 2361Subject: [PATCH 17/19] aarch64: Explicitly record probe registers in frame 2362 info 2363 2364The stack frame is currently divided into three areas: 2365 2366A: the area above the hard frame pointer 2367B: the SVE saves below the hard frame pointer 2368C: the outgoing arguments 2369 2370If the stack frame is allocated in one chunk, the allocation needs a 2371probe if the frame size is >= guard_size - 1KiB. In addition, if the 2372function is not a leaf function, it must probe an address no more than 23731KiB above the outgoing SP. We ensured the second condition by 2374 2375(1) using single-chunk allocations for non-leaf functions only if 2376 the link register save slot is within 512 bytes of the bottom 2377 of the frame; and 2378 2379(2) using the link register save as a probe (meaning, for instance, 2380 that it can't be individually shrink wrapped) 2381 2382If instead the stack is allocated in multiple chunks, then: 2383 2384* an allocation involving only the outgoing arguments (C above) requires 2385 a probe if the allocation size is > 1KiB 2386 2387* any other allocation requires a probe if the allocation size 2388 is >= guard_size - 1KiB 2389 2390* second and subsequent allocations require the previous allocation 2391 to probe at the bottom of the allocated area, regardless of the size 2392 of that previous allocation 2393 2394The final point means that, unlike for single allocations, 2395it can be necessary to have both a non-SVE register probe and 2396an SVE register probe. For example: 2397 2398* allocate A, probe using a non-SVE register save 2399* allocate B, probe using an SVE register save 2400* allocate C 2401 2402The non-SVE register used in this case was again the link register. 2403It was previously used even if the link register save slot was some 2404bytes above the bottom of the non-SVE register saves, but an earlier 2405patch avoided that by putting the link register save slot first. 2406 2407As a belt-and-braces fix, this patch explicitly records which 2408probe registers we're using and allows the non-SVE probe to be 2409whichever register comes first (as for SVE). 2410 2411The patch also avoids unnecessary probes in sve/pcs/stack_clash_3.c. 2412 2413gcc/ 2414 * config/aarch64/aarch64.h (aarch64_frame::sve_save_and_probe) 2415 (aarch64_frame::hard_fp_save_and_probe): New fields. 2416 * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize them. 2417 Rather than asserting that a leaf function saves LR, instead assert 2418 that a leaf function saves something. 
2419 (aarch64_get_separate_components): Prevent the chosen probe 2420 registers from being individually shrink-wrapped. 2421 (aarch64_allocate_and_probe_stack_space): Remove workaround for 2422 probe registers that aren't at the bottom of the previous allocation. 2423 2424gcc/testsuite/ 2425 * gcc.target/aarch64/sve/pcs/stack_clash_3.c: Avoid redundant probes. 2426--- 2427 gcc/config/aarch64/aarch64.cc | 68 +++++++++++++++---- 2428 gcc/config/aarch64/aarch64.h | 8 +++ 2429 .../aarch64/sve/pcs/stack_clash_3.c | 6 +- 2430 3 files changed, 64 insertions(+), 18 deletions(-) 2431 2432diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc 2433index 37809a306f7..6c59c39a639 100644 2434--- a/gcc/config/aarch64/aarch64.cc 2435+++ b/gcc/config/aarch64/aarch64.cc 2436@@ -8471,15 +8471,11 @@ aarch64_layout_frame (void) 2437 && !crtl->abi->clobbers_full_reg_p (regno)) 2438 frame.reg_offset[regno] = SLOT_REQUIRED; 2439 2440- /* With stack-clash, LR must be saved in non-leaf functions. The saving of 2441- LR counts as an implicit probe which allows us to maintain the invariant 2442- described in the comment at expand_prologue. */ 2443- gcc_assert (crtl->is_leaf 2444- || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED)); 2445 2446 poly_int64 offset = crtl->outgoing_args_size; 2447 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT)); 2448 frame.bytes_below_saved_regs = offset; 2449+ frame.sve_save_and_probe = INVALID_REGNUM; 2450 2451 /* Now assign stack slots for the registers. Start with the predicate 2452 registers, since predicate LDR and STR have a relatively small 2453@@ -8487,6 +8483,8 @@ aarch64_layout_frame (void) 2454 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++) 2455 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) 2456 { 2457+ if (frame.sve_save_and_probe == INVALID_REGNUM) 2458+ frame.sve_save_and_probe = regno; 2459 frame.reg_offset[regno] = offset; 2460 offset += BYTES_PER_SVE_PRED; 2461 } 2462@@ -8524,6 +8522,8 @@ aarch64_layout_frame (void) 2463 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) 2464 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) 2465 { 2466+ if (frame.sve_save_and_probe == INVALID_REGNUM) 2467+ frame.sve_save_and_probe = regno; 2468 frame.reg_offset[regno] = offset; 2469 offset += vector_save_size; 2470 } 2471@@ -8533,10 +8533,18 @@ aarch64_layout_frame (void) 2472 frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs; 2473 bool saves_below_hard_fp_p 2474 = maybe_ne (frame.below_hard_fp_saved_regs_size, 0); 2475+ gcc_assert (!saves_below_hard_fp_p 2476+ || (frame.sve_save_and_probe != INVALID_REGNUM 2477+ && known_eq (frame.reg_offset[frame.sve_save_and_probe], 2478+ frame.bytes_below_saved_regs))); 2479+ 2480 frame.bytes_below_hard_fp = offset; 2481+ frame.hard_fp_save_and_probe = INVALID_REGNUM; 2482 2483 auto allocate_gpr_slot = [&](unsigned int regno) 2484 { 2485+ if (frame.hard_fp_save_and_probe == INVALID_REGNUM) 2486+ frame.hard_fp_save_and_probe = regno; 2487 frame.reg_offset[regno] = offset; 2488 if (frame.wb_push_candidate1 == INVALID_REGNUM) 2489 frame.wb_push_candidate1 = regno; 2490@@ -8570,6 +8578,8 @@ aarch64_layout_frame (void) 2491 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) 2492 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) 2493 { 2494+ if (frame.hard_fp_save_and_probe == INVALID_REGNUM) 2495+ frame.hard_fp_save_and_probe = regno; 2496 /* If there is an alignment gap between integer and fp callee-saves, 2497 allocate the last fp register to it if 
possible. */ 2498 if (regno == last_fp_reg 2499@@ -8593,6 +8603,17 @@ aarch64_layout_frame (void) 2500 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); 2501 2502 frame.saved_regs_size = offset - frame.bytes_below_saved_regs; 2503+ gcc_assert (known_eq (frame.saved_regs_size, 2504+ frame.below_hard_fp_saved_regs_size) 2505+ || (frame.hard_fp_save_and_probe != INVALID_REGNUM 2506+ && known_eq (frame.reg_offset[frame.hard_fp_save_and_probe], 2507+ frame.bytes_below_hard_fp))); 2508+ 2509+ /* With stack-clash, a register must be saved in non-leaf functions. 2510+ The saving of the bottommost register counts as an implicit probe, 2511+ which allows us to maintain the invariant described in the comment 2512+ at expand_prologue. */ 2513+ gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0)); 2514 2515 offset += get_frame_size (); 2516 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); 2517@@ -8723,6 +8744,25 @@ aarch64_layout_frame (void) 2518 frame.final_adjust = frame.bytes_below_saved_regs; 2519 } 2520 2521+ /* The frame is allocated in pieces, with each non-final piece 2522+ including a register save at offset 0 that acts as a probe for 2523+ the following piece. In addition, the save of the bottommost register 2524+ acts as a probe for callees and allocas. Roll back any probes that 2525+ aren't needed. 2526+ 2527+ A probe isn't needed if it is associated with the final allocation 2528+ (including callees and allocas) that happens before the epilogue is 2529+ executed. */ 2530+ if (crtl->is_leaf 2531+ && !cfun->calls_alloca 2532+ && known_eq (frame.final_adjust, 0)) 2533+ { 2534+ if (maybe_ne (frame.sve_callee_adjust, 0)) 2535+ frame.sve_save_and_probe = INVALID_REGNUM; 2536+ else 2537+ frame.hard_fp_save_and_probe = INVALID_REGNUM; 2538+ } 2539+ 2540 /* Make sure the individual adjustments add up to the full frame size. */ 2541 gcc_assert (known_eq (frame.initial_adjust 2542 + frame.callee_adjust 2543@@ -9354,13 +9394,6 @@ aarch64_get_separate_components (void) 2544 2545 poly_int64 offset = frame.reg_offset[regno]; 2546 2547- /* If the register is saved in the first SVE save slot, we use 2548- it as a stack probe for -fstack-clash-protection. */ 2549- if (flag_stack_clash_protection 2550- && maybe_ne (frame.below_hard_fp_saved_regs_size, 0) 2551- && known_eq (offset, frame.bytes_below_saved_regs)) 2552- continue; 2553- 2554 /* Get the offset relative to the register we'll use. */ 2555 if (frame_pointer_needed) 2556 offset -= frame.bytes_below_hard_fp; 2557@@ -9395,6 +9428,13 @@ aarch64_get_separate_components (void) 2558 2559 bitmap_clear_bit (components, LR_REGNUM); 2560 bitmap_clear_bit (components, SP_REGNUM); 2561+ if (flag_stack_clash_protection) 2562+ { 2563+ if (frame.sve_save_and_probe != INVALID_REGNUM) 2564+ bitmap_clear_bit (components, frame.sve_save_and_probe); 2565+ if (frame.hard_fp_save_and_probe != INVALID_REGNUM) 2566+ bitmap_clear_bit (components, frame.hard_fp_save_and_probe); 2567+ } 2568 2569 return components; 2570 } 2571@@ -9931,8 +9971,8 @@ aarch64_epilogue_uses (int regno) 2572 When probing is needed, we emit a probe at the start of the prologue 2573 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter. 2574 2575- We have to track how much space has been allocated and the only stores 2576- to the stack we track as implicit probes are the FP/LR stores. 2577+ We can also use register saves as probes. These are stored in 2578+ sve_save_and_probe and hard_fp_save_and_probe. 
2579 2580 For outgoing arguments we probe if the size is larger than 1KB, such that 2581 the ABI specified buffer is maintained for the next callee. 2582diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h 2583index c8becb098c8..fbfb73545ba 100644 2584--- a/gcc/config/aarch64/aarch64.h 2585+++ b/gcc/config/aarch64/aarch64.h 2586@@ -863,6 +863,14 @@ struct GTY (()) aarch64_frame 2587 This is the register they should use. */ 2588 unsigned spare_pred_reg; 2589 2590+ /* An SVE register that is saved below the hard frame pointer and that acts 2591+ as a probe for later allocations, or INVALID_REGNUM if none. */ 2592+ unsigned sve_save_and_probe; 2593+ 2594+ /* A register that is saved at the hard frame pointer and that acts 2595+ as a probe for later allocations, or INVALID_REGNUM if none. */ 2596+ unsigned hard_fp_save_and_probe; 2597+ 2598 bool laid_out; 2599 2600 /* True if shadow call stack should be enabled for the current function. */ 2601diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c 2602index 3e01ec36c3a..3530a0d504b 100644 2603--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c 2604+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c 2605@@ -11,11 +11,10 @@ 2606 ** mov x11, sp 2607 ** ... 2608 ** sub sp, sp, x13 2609-** str p4, \[sp\] 2610 ** cbz w0, [^\n]* 2611+** str p4, \[sp\] 2612 ** ... 2613 ** ptrue p0\.b, all 2614-** ldr p4, \[sp\] 2615 ** addvl sp, sp, #1 2616 ** ldr x24, \[sp\], 32 2617 ** ret 2618@@ -39,13 +38,12 @@ test_1 (int n) 2619 ** mov x11, sp 2620 ** ... 2621 ** sub sp, sp, x13 2622-** str p4, \[sp\] 2623 ** cbz w0, [^\n]* 2624+** str p4, \[sp\] 2625 ** str p5, \[sp, #1, mul vl\] 2626 ** str p6, \[sp, #2, mul vl\] 2627 ** ... 2628 ** ptrue p0\.b, all 2629-** ldr p4, \[sp\] 2630 ** addvl sp, sp, #1 2631 ** ldr x24, \[sp\], 32 2632 ** ret 2633-- 26342.34.1 2635 2636 2637From 56df065080950bb30dda9c260f71be54269bdda5 Mon Sep 17 00:00:00 2001 2638From: Richard Sandiford <richard.sandiford@arm.com> 2639Date: Tue, 12 Sep 2023 16:07:20 +0100 2640Subject: [PATCH 18/19] aarch64: Remove below_hard_fp_saved_regs_size 2641 2642After previous patches, it's no longer necessary to store 2643saved_regs_size and below_hard_fp_saved_regs_size in the frame info. 2644All measurements instead use the top or bottom of the frame as 2645reference points. 2646 2647gcc/ 2648 * config/aarch64/aarch64.h (aarch64_frame::saved_regs_size) 2649 (aarch64_frame::below_hard_fp_saved_regs_size): Delete. 2650 * config/aarch64/aarch64.cc (aarch64_layout_frame): Update accordingly. 2651--- 2652 gcc/config/aarch64/aarch64.cc | 45 ++++++++++++++++------------------- 2653 gcc/config/aarch64/aarch64.h | 7 ------ 2654 2 files changed, 21 insertions(+), 31 deletions(-) 2655 2656diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc 2657index 6c59c39a639..b95e805a8cc 100644 2658--- a/gcc/config/aarch64/aarch64.cc 2659+++ b/gcc/config/aarch64/aarch64.cc 2660@@ -8530,9 +8530,8 @@ aarch64_layout_frame (void) 2661 2662 /* OFFSET is now the offset of the hard frame pointer from the bottom 2663 of the callee save area. 
*/ 2664- frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs; 2665- bool saves_below_hard_fp_p 2666- = maybe_ne (frame.below_hard_fp_saved_regs_size, 0); 2667+ auto below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs; 2668+ bool saves_below_hard_fp_p = maybe_ne (below_hard_fp_saved_regs_size, 0); 2669 gcc_assert (!saves_below_hard_fp_p 2670 || (frame.sve_save_and_probe != INVALID_REGNUM 2671 && known_eq (frame.reg_offset[frame.sve_save_and_probe], 2672@@ -8602,9 +8601,8 @@ aarch64_layout_frame (void) 2673 2674 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); 2675 2676- frame.saved_regs_size = offset - frame.bytes_below_saved_regs; 2677- gcc_assert (known_eq (frame.saved_regs_size, 2678- frame.below_hard_fp_saved_regs_size) 2679+ auto saved_regs_size = offset - frame.bytes_below_saved_regs; 2680+ gcc_assert (known_eq (saved_regs_size, below_hard_fp_saved_regs_size) 2681 || (frame.hard_fp_save_and_probe != INVALID_REGNUM 2682 && known_eq (frame.reg_offset[frame.hard_fp_save_and_probe], 2683 frame.bytes_below_hard_fp))); 2684@@ -8613,7 +8611,7 @@ aarch64_layout_frame (void) 2685 The saving of the bottommost register counts as an implicit probe, 2686 which allows us to maintain the invariant described in the comment 2687 at expand_prologue. */ 2688- gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0)); 2689+ gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0)); 2690 2691 offset += get_frame_size (); 2692 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); 2693@@ -8670,7 +8668,7 @@ aarch64_layout_frame (void) 2694 2695 HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp; 2696 HOST_WIDE_INT const_saved_regs_size; 2697- if (known_eq (frame.saved_regs_size, 0)) 2698+ if (known_eq (saved_regs_size, 0)) 2699 frame.initial_adjust = frame.frame_size; 2700 else if (frame.frame_size.is_constant (&const_size) 2701 && const_size < max_push_offset 2702@@ -8683,7 +8681,7 @@ aarch64_layout_frame (void) 2703 frame.callee_adjust = const_size; 2704 } 2705 else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs) 2706- && frame.saved_regs_size.is_constant (&const_saved_regs_size) 2707+ && saved_regs_size.is_constant (&const_saved_regs_size) 2708 && const_below_saved_regs + const_saved_regs_size < 512 2709 /* We could handle this case even with data below the saved 2710 registers, provided that that data left us with valid offsets 2711@@ -8702,8 +8700,7 @@ aarch64_layout_frame (void) 2712 frame.initial_adjust = frame.frame_size; 2713 } 2714 else if (saves_below_hard_fp_p 2715- && known_eq (frame.saved_regs_size, 2716- frame.below_hard_fp_saved_regs_size)) 2717+ && known_eq (saved_regs_size, below_hard_fp_saved_regs_size)) 2718 { 2719 /* Frame in which all saves are SVE saves: 2720 2721@@ -8725,7 +8722,7 @@ aarch64_layout_frame (void) 2722 [save SVE registers relative to SP] 2723 sub sp, sp, bytes_below_saved_regs */ 2724 frame.callee_adjust = const_above_fp; 2725- frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; 2726+ frame.sve_callee_adjust = below_hard_fp_saved_regs_size; 2727 frame.final_adjust = frame.bytes_below_saved_regs; 2728 } 2729 else 2730@@ -8740,7 +8737,7 @@ aarch64_layout_frame (void) 2731 [save SVE registers relative to SP] 2732 sub sp, sp, bytes_below_saved_regs */ 2733 frame.initial_adjust = frame.bytes_above_hard_fp; 2734- frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; 2735+ frame.sve_callee_adjust = below_hard_fp_saved_regs_size; 
2736 frame.final_adjust = frame.bytes_below_saved_regs; 2737 } 2738 2739@@ -9936,17 +9933,17 @@ aarch64_epilogue_uses (int regno) 2740 | local variables | <-- frame_pointer_rtx 2741 | | 2742 +-------------------------------+ 2743- | padding | \ 2744- +-------------------------------+ | 2745- | callee-saved registers | | frame.saved_regs_size 2746- +-------------------------------+ | 2747- | LR' | | 2748- +-------------------------------+ | 2749- | FP' | | 2750- +-------------------------------+ |<- hard_frame_pointer_rtx (aligned) 2751- | SVE vector registers | | \ 2752- +-------------------------------+ | | below_hard_fp_saved_regs_size 2753- | SVE predicate registers | / / 2754+ | padding | 2755+ +-------------------------------+ 2756+ | callee-saved registers | 2757+ +-------------------------------+ 2758+ | LR' | 2759+ +-------------------------------+ 2760+ | FP' | 2761+ +-------------------------------+ <-- hard_frame_pointer_rtx (aligned) 2762+ | SVE vector registers | 2763+ +-------------------------------+ 2764+ | SVE predicate registers | 2765 +-------------------------------+ 2766 | dynamic allocation | 2767 +-------------------------------+ 2768diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h 2769index fbfb73545ba..cfeaf4657ab 100644 2770--- a/gcc/config/aarch64/aarch64.h 2771+++ b/gcc/config/aarch64/aarch64.h 2772@@ -777,18 +777,11 @@ struct GTY (()) aarch64_frame 2773 STACK_BOUNDARY. */ 2774 HOST_WIDE_INT saved_varargs_size; 2775 2776- /* The size of the callee-save registers with a slot in REG_OFFSET. */ 2777- poly_int64 saved_regs_size; 2778- 2779 /* The number of bytes between the bottom of the static frame (the bottom 2780 of the outgoing arguments) and the bottom of the register save area. 2781 This value is always a multiple of STACK_BOUNDARY. */ 2782 poly_int64 bytes_below_saved_regs; 2783 2784- /* The size of the callee-save registers with a slot in REG_OFFSET that 2785- are saved below the hard frame pointer. */ 2786- poly_int64 below_hard_fp_saved_regs_size; 2787- 2788 /* The number of bytes between the bottom of the static frame (the bottom 2789 of the outgoing arguments) and the hard frame pointer. This value is 2790 always a multiple of STACK_BOUNDARY. */ 2791-- 27922.34.1 2793 2794 2795From b96e66fd4ef3e36983969fb8cdd1956f551a074b Mon Sep 17 00:00:00 2001 2796From: Richard Sandiford <richard.sandiford@arm.com> 2797Date: Tue, 12 Sep 2023 16:07:21 +0100 2798Subject: [PATCH 19/19] aarch64: Make stack smash canary protect saved 2799 registers 2800 2801AArch64 normally puts the saved registers near the bottom of the frame, 2802immediately above any dynamic allocations. But this means that a 2803stack-smash attack on those dynamic allocations could overwrite the 2804saved registers without needing to reach as far as the stack smash 2805canary. 2806 2807The same thing could also happen for variable-sized arguments that are 2808passed by value, since those are allocated before a call and popped on 2809return. 2810 2811This patch avoids that by putting the locals (and thus the canary) below 2812the saved registers when stack smash protection is active. 2813 2814The patch fixes CVE-2023-4039. 2815 2816gcc/ 2817 * config/aarch64/aarch64.cc (aarch64_save_regs_above_locals_p): 2818 New function. 2819 (aarch64_layout_frame): Use it to decide whether locals should 2820 go above or below the saved registers. 2821 (aarch64_expand_prologue): Update stack layout comment. 2822 Emit a stack tie after the final adjustment. 

gcc/testsuite/
        * gcc.target/aarch64/stack-protector-8.c: New test.
        * gcc.target/aarch64/stack-protector-9.c: Likewise.
---
 gcc/config/aarch64/aarch64.cc                 | 46 +++++++--
 .../gcc.target/aarch64/stack-protector-8.c    | 95 +++++++++++++++++++
 .../gcc.target/aarch64/stack-protector-9.c    | 33 +++++++
 3 files changed, 168 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-9.c

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index b95e805a8cc..389c0e29353 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -8394,6 +8394,20 @@ aarch64_needs_frame_chain (void)
   return aarch64_use_frame_pointer;
 }
 
+/* Return true if the current function should save registers above
+   the locals area, rather than below it.  */
+
+static bool
+aarch64_save_regs_above_locals_p ()
+{
+  /* When using stack smash protection, make sure that the canary slot
+     comes between the locals and the saved registers.  Otherwise,
+     it would be possible for a carefully sized smash attack to change
+     the saved registers (particularly LR and FP) without reaching the
+     canary.  */
+  return crtl->stack_protect_guard;
+}
+
 /* Mark the registers that need to be saved by the callee and calculate
    the size of the callee-saved registers area and frame record (both FP
    and LR may be omitted).  */
@@ -8405,6 +8419,7 @@ aarch64_layout_frame (void)
   poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
   bool frame_related_fp_reg_p = false;
   aarch64_frame &frame = cfun->machine->frame;
+  poly_int64 top_of_locals = -1;
 
   frame.emit_frame_chain = aarch64_needs_frame_chain ();
 
@@ -8471,9 +8486,16 @@ aarch64_layout_frame (void)
        && !crtl->abi->clobbers_full_reg_p (regno))
      frame.reg_offset[regno] = SLOT_REQUIRED;
 
+  bool regs_at_top_p = aarch64_save_regs_above_locals_p ();
 
   poly_int64 offset = crtl->outgoing_args_size;
   gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
+  if (regs_at_top_p)
+    {
+      offset += get_frame_size ();
+      offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
+      top_of_locals = offset;
+    }
   frame.bytes_below_saved_regs = offset;
   frame.sve_save_and_probe = INVALID_REGNUM;
 
@@ -8613,15 +8635,18 @@ aarch64_layout_frame (void)
      at expand_prologue.  */
   gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
 
-  offset += get_frame_size ();
-  offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
-  auto top_of_locals = offset;
-
+  if (!regs_at_top_p)
+    {
+      offset += get_frame_size ();
+      offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
+      top_of_locals = offset;
+    }
   offset += frame.saved_varargs_size;
   gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
   frame.frame_size = offset;
 
   frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp;
+  gcc_assert (known_ge (top_of_locals, 0));
   frame.bytes_above_locals = frame.frame_size - top_of_locals;
 
   frame.initial_adjust = 0;
@@ -9930,10 +9955,10 @@ aarch64_epilogue_uses (int regno)
        |  for register varargs         |
        |                               |
        +-------------------------------+
-       |  local variables              | <-- frame_pointer_rtx
+       |  local variables (1)          | <-- frame_pointer_rtx
        |                               |
        +-------------------------------+
-       |  padding                      |
+       |  padding (1)                  |
        +-------------------------------+
        |  callee-saved registers       |
        +-------------------------------+
@@ -9945,6 +9970,10 @@ aarch64_epilogue_uses (int regno)
        +-------------------------------+
        |  SVE predicate registers      |
        +-------------------------------+
+       |  local variables (2)          |
+       +-------------------------------+
+       |  padding (2)                  |
+       +-------------------------------+
        |  dynamic allocation           |
        +-------------------------------+
        |  padding                      |
@@ -9954,6 +9983,9 @@ aarch64_epilogue_uses (int regno)
        +-------------------------------+
        |                               | <-- stack_pointer_rtx (aligned)
 
+   The regions marked (1) and (2) are mutually exclusive.  (2) is used
+   when aarch64_save_regs_above_locals_p is true.
+
    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
    but leave frame_pointer_rtx and hard_frame_pointer_rtx
    unchanged.
@@ -10149,6 +10181,8 @@ aarch64_expand_prologue (void)
   gcc_assert (known_eq (bytes_below_sp, final_adjust));
   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
                                           !frame_pointer_needed, true);
+  if (emit_frame_chain && maybe_ne (final_adjust, 0))
+    emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
 }
 
 /* Return TRUE if we can use a simple_return insn.
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
new file mode 100644
index 00000000000..e71d820e365
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
@@ -0,0 +1,95 @@
+/* { dg-options " -O -fstack-protector-strong -mstack-protector-guard=sysreg -mstack-protector-guard-reg=tpidr2_el0 -mstack-protector-guard-offset=16" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+void g(void *);
+__SVBool_t *h(void *);
+
+/*
+** test1:
+** sub sp, sp, #288
+** stp x29, x30, \[sp, #?272\]
+** add x29, sp, #?272
+** mrs (x[0-9]+), tpidr2_el0
+** ldr (x[0-9]+), \[\1, #?16\]
+** str \2, \[sp, #?264\]
+** mov \2, #?0
+** add x0, sp, #?8
+** bl g
+** ...
+** mrs .*
+** ...
+** bne .*
+** ...
+** ldp x29, x30, \[sp, #?272\]
+** add sp, sp, #?288
+** ret
+** bl __stack_chk_fail
+*/
+int test1() {
+  int y[0x40];
+  g(y);
+  return 1;
+}
+
+/*
+** test2:
+** stp x29, x30, \[sp, #?-16\]!
+** mov x29, sp
+** sub sp, sp, #1040
+** mrs (x[0-9]+), tpidr2_el0
+** ldr (x[0-9]+), \[\1, #?16\]
+** str \2, \[sp, #?1032\]
+** mov \2, #?0
+** add x0, sp, #?8
+** bl g
+** ...
+** mrs .*
+** ...
+** bne .*
+** ...
+** add sp, sp, #?1040
+** ldp x29, x30, \[sp\], #?16
+** ret
+** bl __stack_chk_fail
+*/
+int test2() {
+  int y[0x100];
+  g(y);
+  return 1;
+}
+
+#pragma GCC target "+sve"
+
+/*
+** test3:
+** stp x29, x30, \[sp, #?-16\]!
+** mov x29, sp
+** addvl sp, sp, #-18
+** ...
+** str p4, \[sp\]
+** ...
+** sub sp, sp, #272
+** mrs (x[0-9]+), tpidr2_el0
+** ldr (x[0-9]+), \[\1, #?16\]
+** str \2, \[sp, #?264\]
+** mov \2, #?0
+** add x0, sp, #?8
+** bl h
+** ...
+** mrs .*
+** ...
+** bne .*
+** ...
+** add sp, sp, #?272
+** ...
+** ldr p4, \[sp\]
+** ...
+** addvl sp, sp, #18
+** ldp x29, x30, \[sp\], #?16
+** ret
+** bl __stack_chk_fail
+*/
+__SVBool_t test3() {
+  int y[0x40];
+  return *h(y);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
new file mode 100644
index 00000000000..58f322aa480
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
@@ -0,0 +1,33 @@
+/* { dg-options "-O2 -mcpu=neoverse-v1 -fstack-protector-all" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+/*
+** main:
+** ...
+** stp x29, x30, \[sp, #?-[0-9]+\]!
+** ...
+** sub sp, sp, #[0-9]+
+** ...
+** str x[0-9]+, \[x29, #?-8\]
+** ...
+*/
+int f(const char *);
+void g(void *);
+int main(int argc, char* argv[])
+{
+  int a;
+  int b;
+  char c[2+f(argv[1])];
+  int d[0x100];
+  char y;
+
+  y=42; a=4; b=10;
+  c[0] = 'h'; c[1] = '\0';
+
+  c[f(argv[2])] = '\0';
+
+  __builtin_printf("%d %d\n%s\n", a, b, c);
+  g(d);
+
+  return 0;
+}
-- 
2.34.1

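As a purely illustrative footnote to the series (the function names and sizes
below are made up for illustration and do not come from the patches): the code
the new layout hardens is a frame that mixes ordinary locals and saved FP/LR
with dynamically sized stack data.  Before the change, a carefully sized
overflow of the dynamic block could reach the saved registers without ever
crossing the canary; with the saved registers now placed above the locals, the
canary lies in the way.  A minimal C sketch:

/* Illustrative only; not part of the GCC patches above.  Built with
   -fstack-protector-strong on aarch64, the canary slot now sits between
   the locals and the saved FP/LR.  */
void consume (void *);

void
example (int n)
{
  char fixed[64];                    /* ordinary local, guarded by the canary */
  void *dyn = __builtin_alloca (n);  /* dynamically allocated stack space */
  consume (fixed);
  consume (dyn);
}

Compiling such a function with -S -fstack-protector-strong and inspecting the
prologue shows the canary stored at the top of the locals area, below the saved
registers, which is essentially the property the new stack-protector-8.c and
stack-protector-9.c tests check.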