1From: Richard Sandiford <richard.sandiford@arm.com>
2Subject: [PATCH 00/19] aarch64: Fix -fstack-protector issue
3Date: Tue, 12 Sep 2023 16:25:10 +0100
4
5This series of patches fixes deficiencies in GCC's -fstack-protector
6implementation for AArch64 when using dynamically allocated stack space.
7This is CVE-2023-4039.  See:
8
9https://developer.arm.com/Arm%20Security%20Center/GCC%20Stack%20Protector%20Vulnerability%20AArch64
10https://github.com/metaredteam/external-disclosures/security/advisories/GHSA-x7ch-h5rf-w2mf
11
12for more details.
13
14The fix is to put the saved registers above the locals area when
15-fstack-protector is used.
16
17The series also fixes a stack-clash problem that I found while working
18on the CVE.  In unpatched sources, the stack-clash problem would only
19trigger for unrealistic numbers of arguments (8K 64-bit arguments, or an
20equivalent).  But it would be a more significant issue with the new
21-fstack-protector frame layout.  It's therefore important that both
22problems are fixed together.
23
24Some reorganisation of the code seemed necessary to fix the problems in a
25cleanish way.  The series is therefore quite long, but only a handful of
26patches should have any effect on code generation.
27
28See the individual patches for a detailed description.
29
30Tested on aarch64-linux-gnu. Pushed to trunk and to all active branches.
31I've also pushed backports to GCC 7+ to vendors/ARM/heads/CVE-2023-4039.
32
33CVE: CVE-2023-4039
34Upstream-Status: Backport
35Signed-off-by: Ross Burton <ross.burton@arm.com>
36
37
38From 71a2aa2127283f450c623d3604dbcabe0e14a8d4 Mon Sep 17 00:00:00 2001
39From: Richard Sandiford <richard.sandiford@arm.com>
40Date: Tue, 12 Sep 2023 16:07:12 +0100
41Subject: [PATCH 01/19] aarch64: Use local frame vars in shrink-wrapping code
42
43aarch64_layout_frame uses a shorthand for referring to
44cfun->machine->frame:
45
46  aarch64_frame &frame = cfun->machine->frame;
47
48This patch does the same for some other heavy users of the structure.
49No functional change intended.
50
51gcc/
52	* config/aarch64/aarch64.cc (aarch64_save_callee_saves): Use
53	a local shorthand for cfun->machine->frame.
54	(aarch64_restore_callee_saves, aarch64_get_separate_components):
55	(aarch64_process_components): Likewise.
56	(aarch64_allocate_and_probe_stack_space): Likewise.
57	(aarch64_expand_prologue, aarch64_expand_epilogue): Likewise.
58	(aarch64_layout_frame): Use existing shorthand for one more case.
59---
60 gcc/config/aarch64/aarch64.cc | 123 ++++++++++++++++++----------------
61 1 file changed, 64 insertions(+), 59 deletions(-)
62
63diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
64index 822a2b49a46..5d473d161d9 100644
65--- a/gcc/config/aarch64/aarch64.cc
66+++ b/gcc/config/aarch64/aarch64.cc
67@@ -8612,7 +8612,7 @@ aarch64_layout_frame (void)
68   frame.is_scs_enabled
69     = (!crtl->calls_eh_return
70        && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
71-       && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0));
72+       && known_ge (frame.reg_offset[LR_REGNUM], 0));
73
74   /* When shadow call stack is enabled, the scs_pop in the epilogue will
75      restore x30, and we don't need to pop x30 again in the traditional
76@@ -9078,6 +9078,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
77 			   unsigned start, unsigned limit, bool skip_wb,
78 			   bool hard_fp_valid_p)
79 {
80+  aarch64_frame &frame = cfun->machine->frame;
81   rtx_insn *insn;
82   unsigned regno;
83   unsigned regno2;
84@@ -9092,8 +9093,8 @@ aarch64_save_callee_saves (poly_int64 start_offset,
85       bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
86
87       if (skip_wb
88-	  && (regno == cfun->machine->frame.wb_push_candidate1
89-	      || regno == cfun->machine->frame.wb_push_candidate2))
90+	  && (regno == frame.wb_push_candidate1
91+	      || regno == frame.wb_push_candidate2))
92 	continue;
93
94       if (cfun->machine->reg_is_wrapped_separately[regno])
95@@ -9101,7 +9102,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
96
97       machine_mode mode = aarch64_reg_save_mode (regno);
98       reg = gen_rtx_REG (mode, regno);
99-      offset = start_offset + cfun->machine->frame.reg_offset[regno];
100+      offset = start_offset + frame.reg_offset[regno];
101       rtx base_rtx = stack_pointer_rtx;
102       poly_int64 sp_offset = offset;
103
104@@ -9114,7 +9115,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
105 	{
106 	  gcc_assert (known_eq (start_offset, 0));
107 	  poly_int64 fp_offset
108-	    = cfun->machine->frame.below_hard_fp_saved_regs_size;
109+	    = frame.below_hard_fp_saved_regs_size;
110 	  if (hard_fp_valid_p)
111 	    base_rtx = hard_frame_pointer_rtx;
112 	  else
113@@ -9136,8 +9137,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
114 	  && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
115 	  && !cfun->machine->reg_is_wrapped_separately[regno2]
116 	  && known_eq (GET_MODE_SIZE (mode),
117-		       cfun->machine->frame.reg_offset[regno2]
118-		       - cfun->machine->frame.reg_offset[regno]))
119+		       frame.reg_offset[regno2] - frame.reg_offset[regno]))
120 	{
121 	  rtx reg2 = gen_rtx_REG (mode, regno2);
122 	  rtx mem2;
123@@ -9187,6 +9187,7 @@ static void
124 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
125 			      unsigned limit, bool skip_wb, rtx *cfi_ops)
126 {
127+  aarch64_frame &frame = cfun->machine->frame;
128   unsigned regno;
129   unsigned regno2;
130   poly_int64 offset;
131@@ -9203,13 +9204,13 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
132       rtx reg, mem;
133
134       if (skip_wb
135-	  && (regno == cfun->machine->frame.wb_pop_candidate1
136-	      || regno == cfun->machine->frame.wb_pop_candidate2))
137+	  && (regno == frame.wb_pop_candidate1
138+	      || regno == frame.wb_pop_candidate2))
139 	continue;
140
141       machine_mode mode = aarch64_reg_save_mode (regno);
142       reg = gen_rtx_REG (mode, regno);
143-      offset = start_offset + cfun->machine->frame.reg_offset[regno];
144+      offset = start_offset + frame.reg_offset[regno];
145       rtx base_rtx = stack_pointer_rtx;
146       if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
147 	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
148@@ -9220,8 +9221,7 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
149 	  && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
150 	  && !cfun->machine->reg_is_wrapped_separately[regno2]
151 	  && known_eq (GET_MODE_SIZE (mode),
152-		       cfun->machine->frame.reg_offset[regno2]
153-		       - cfun->machine->frame.reg_offset[regno]))
154+		       frame.reg_offset[regno2] - frame.reg_offset[regno]))
155 	{
156 	  rtx reg2 = gen_rtx_REG (mode, regno2);
157 	  rtx mem2;
158@@ -9326,6 +9326,7 @@ offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
159 static sbitmap
160 aarch64_get_separate_components (void)
161 {
162+  aarch64_frame &frame = cfun->machine->frame;
163   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
164   bitmap_clear (components);
165
166@@ -9342,18 +9343,18 @@ aarch64_get_separate_components (void)
167 	if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
168 	  continue;
169
170-	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
171+	poly_int64 offset = frame.reg_offset[regno];
172
173 	/* If the register is saved in the first SVE save slot, we use
174 	   it as a stack probe for -fstack-clash-protection.  */
175 	if (flag_stack_clash_protection
176-	    && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
177+	    && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
178 	    && known_eq (offset, 0))
179 	  continue;
180
181 	/* Get the offset relative to the register we'll use.  */
182 	if (frame_pointer_needed)
183-	  offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
184+	  offset -= frame.below_hard_fp_saved_regs_size;
185 	else
186 	  offset += crtl->outgoing_args_size;
187
188@@ -9372,11 +9373,11 @@ aarch64_get_separate_components (void)
189   /* If the spare predicate register used by big-endian SVE code
190      is call-preserved, it must be saved in the main prologue
191      before any saves that use it.  */
192-  if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
193-    bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
194+  if (frame.spare_pred_reg != INVALID_REGNUM)
195+    bitmap_clear_bit (components, frame.spare_pred_reg);
196
197-  unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
198-  unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
199+  unsigned reg1 = frame.wb_push_candidate1;
200+  unsigned reg2 = frame.wb_push_candidate2;
201   /* If registers have been chosen to be stored/restored with
202      writeback don't interfere with them to avoid having to output explicit
203      stack adjustment instructions.  */
204@@ -9485,6 +9486,7 @@ aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
205 static void
206 aarch64_process_components (sbitmap components, bool prologue_p)
207 {
208+  aarch64_frame &frame = cfun->machine->frame;
209   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
210 			     ? HARD_FRAME_POINTER_REGNUM
211 			     : STACK_POINTER_REGNUM);
212@@ -9499,9 +9501,9 @@ aarch64_process_components (sbitmap components, bool prologue_p)
213       machine_mode mode = aarch64_reg_save_mode (regno);
214
215       rtx reg = gen_rtx_REG (mode, regno);
216-      poly_int64 offset = cfun->machine->frame.reg_offset[regno];
217+      poly_int64 offset = frame.reg_offset[regno];
218       if (frame_pointer_needed)
219-	offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
220+	offset -= frame.below_hard_fp_saved_regs_size;
221       else
222 	offset += crtl->outgoing_args_size;
223
224@@ -9526,14 +9528,14 @@ aarch64_process_components (sbitmap components, bool prologue_p)
225 	  break;
226 	}
227
228-      poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
229+      poly_int64 offset2 = frame.reg_offset[regno2];
230       /* The next register is not of the same class or its offset is not
231 	 mergeable with the current one into a pair.  */
232       if (aarch64_sve_mode_p (mode)
233 	  || !satisfies_constraint_Ump (mem)
234 	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
235 	  || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
236-	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
237+	  || maybe_ne ((offset2 - frame.reg_offset[regno]),
238 		       GET_MODE_SIZE (mode)))
239 	{
240 	  insn = emit_insn (set);
241@@ -9555,7 +9557,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
242       /* REGNO2 can be saved/restored in a pair with REGNO.  */
243       rtx reg2 = gen_rtx_REG (mode, regno2);
244       if (frame_pointer_needed)
245-	offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
246+	offset2 -= frame.below_hard_fp_saved_regs_size;
247       else
248 	offset2 += crtl->outgoing_args_size;
249       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
250@@ -9650,6 +9652,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
251 					bool frame_related_p,
252 					bool final_adjustment_p)
253 {
254+  aarch64_frame &frame = cfun->machine->frame;
255   HOST_WIDE_INT guard_size
256     = 1 << param_stack_clash_protection_guard_size;
257   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
258@@ -9670,25 +9673,25 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
259        register as a probe.  We can't assume that LR was saved at position 0
260        though, so treat any space below it as unprobed.  */
261   if (final_adjustment_p
262-      && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
263+      && known_eq (frame.below_hard_fp_saved_regs_size, 0))
264     {
265-      poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
266+      poly_int64 lr_offset = frame.reg_offset[LR_REGNUM];
267       if (known_ge (lr_offset, 0))
268 	min_probe_threshold -= lr_offset.to_constant ();
269       else
270 	gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
271     }
272
273-  poly_int64 frame_size = cfun->machine->frame.frame_size;
274+  poly_int64 frame_size = frame.frame_size;
275
276   /* We should always have a positive probe threshold.  */
277   gcc_assert (min_probe_threshold > 0);
278
279   if (flag_stack_clash_protection && !final_adjustment_p)
280     {
281-      poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
282-      poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
283-      poly_int64 final_adjust = cfun->machine->frame.final_adjust;
284+      poly_int64 initial_adjust = frame.initial_adjust;
285+      poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
286+      poly_int64 final_adjust = frame.final_adjust;
287
288       if (known_eq (frame_size, 0))
289 	{
290@@ -9977,17 +9980,18 @@ aarch64_epilogue_uses (int regno)
291 void
292 aarch64_expand_prologue (void)
293 {
294-  poly_int64 frame_size = cfun->machine->frame.frame_size;
295-  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
296-  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
297-  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
298-  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
299-  poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
300+  aarch64_frame &frame = cfun->machine->frame;
301+  poly_int64 frame_size = frame.frame_size;
302+  poly_int64 initial_adjust = frame.initial_adjust;
303+  HOST_WIDE_INT callee_adjust = frame.callee_adjust;
304+  poly_int64 final_adjust = frame.final_adjust;
305+  poly_int64 callee_offset = frame.callee_offset;
306+  poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
307   poly_int64 below_hard_fp_saved_regs_size
308-    = cfun->machine->frame.below_hard_fp_saved_regs_size;
309-  unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
310-  unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
311-  bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
312+    = frame.below_hard_fp_saved_regs_size;
313+  unsigned reg1 = frame.wb_push_candidate1;
314+  unsigned reg2 = frame.wb_push_candidate2;
315+  bool emit_frame_chain = frame.emit_frame_chain;
316   rtx_insn *insn;
317
318   if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
319@@ -10018,7 +10022,7 @@ aarch64_expand_prologue (void)
320     }
321
322   /* Push return address to shadow call stack.  */
323-  if (cfun->machine->frame.is_scs_enabled)
324+  if (frame.is_scs_enabled)
325     emit_insn (gen_scs_push ());
326
327   if (flag_stack_usage_info)
328@@ -10057,7 +10061,7 @@ aarch64_expand_prologue (void)
329
330   /* The offset of the frame chain record (if any) from the current SP.  */
331   poly_int64 chain_offset = (initial_adjust + callee_adjust
332-			     - cfun->machine->frame.hard_fp_offset);
333+			     - frame.hard_fp_offset);
334   gcc_assert (known_ge (chain_offset, 0));
335
336   /* The offset of the bottom of the save area from the current SP.  */
337@@ -10160,16 +10164,17 @@ aarch64_use_return_insn_p (void)
338 void
339 aarch64_expand_epilogue (bool for_sibcall)
340 {
341-  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
342-  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
343-  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
344-  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
345-  poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
346+  aarch64_frame &frame = cfun->machine->frame;
347+  poly_int64 initial_adjust = frame.initial_adjust;
348+  HOST_WIDE_INT callee_adjust = frame.callee_adjust;
349+  poly_int64 final_adjust = frame.final_adjust;
350+  poly_int64 callee_offset = frame.callee_offset;
351+  poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
352   poly_int64 below_hard_fp_saved_regs_size
353-    = cfun->machine->frame.below_hard_fp_saved_regs_size;
354-  unsigned reg1 = cfun->machine->frame.wb_pop_candidate1;
355-  unsigned reg2 = cfun->machine->frame.wb_pop_candidate2;
356-  unsigned int last_gpr = (cfun->machine->frame.is_scs_enabled
357+    = frame.below_hard_fp_saved_regs_size;
358+  unsigned reg1 = frame.wb_pop_candidate1;
359+  unsigned reg2 = frame.wb_pop_candidate2;
360+  unsigned int last_gpr = (frame.is_scs_enabled
361 			   ? R29_REGNUM : R30_REGNUM);
362   rtx cfi_ops = NULL;
363   rtx_insn *insn;
364@@ -10203,7 +10208,7 @@ aarch64_expand_epilogue (bool for_sibcall)
365   /* We need to add memory barrier to prevent read from deallocated stack.  */
366   bool need_barrier_p
367     = maybe_ne (get_frame_size ()
368-		+ cfun->machine->frame.saved_varargs_size, 0);
369+		+ frame.saved_varargs_size, 0);
370
371   /* Emit a barrier to prevent loads from a deallocated stack.  */
372   if (maybe_gt (final_adjust, crtl->outgoing_args_size)
373@@ -10284,7 +10289,7 @@ aarch64_expand_epilogue (bool for_sibcall)
374     }
375
376   /* Pop return address from shadow call stack.  */
377-  if (cfun->machine->frame.is_scs_enabled)
378+  if (frame.is_scs_enabled)
379     {
380       machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
381       rtx reg = gen_rtx_REG (mode, R30_REGNUM);
382@@ -12740,24 +12745,24 @@ aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
383 poly_int64
384 aarch64_initial_elimination_offset (unsigned from, unsigned to)
385 {
386+  aarch64_frame &frame = cfun->machine->frame;
387+
388   if (to == HARD_FRAME_POINTER_REGNUM)
389     {
390       if (from == ARG_POINTER_REGNUM)
391-	return cfun->machine->frame.hard_fp_offset;
392+	return frame.hard_fp_offset;
393
394       if (from == FRAME_POINTER_REGNUM)
395-	return cfun->machine->frame.hard_fp_offset
396-	       - cfun->machine->frame.locals_offset;
397+	return frame.hard_fp_offset - frame.locals_offset;
398     }
399
400   if (to == STACK_POINTER_REGNUM)
401     {
402       if (from == FRAME_POINTER_REGNUM)
403-	  return cfun->machine->frame.frame_size
404-		 - cfun->machine->frame.locals_offset;
405+	return frame.frame_size - frame.locals_offset;
406     }
407
408-  return cfun->machine->frame.frame_size;
409+  return frame.frame_size;
410 }
411
412
413--
4142.34.1
415
416
417From 89a9fa287706c5011f61926eaf65e7b996b963a3 Mon Sep 17 00:00:00 2001
418From: Richard Sandiford <richard.sandiford@arm.com>
419Date: Tue, 12 Sep 2023 16:07:12 +0100
420Subject: [PATCH 02/19] aarch64: Avoid a use of callee_offset
421
422When we emit the frame chain, i.e. when we reach Here in this statement
423of aarch64_expand_prologue:
424
425  if (emit_frame_chain)
426    {
427      // Here
428      ...
429    }
430
431the stack is in one of two states:
432
433- We've allocated up to the frame chain, but no more.
434
435- We've allocated the whole frame, and the frame chain is within easy
436  reach of the new SP.
437
438The offset of the frame chain from the current SP is available
439in aarch64_frame as callee_offset.  It is also available as the
440chain_offset local variable, where the latter is calculated from other
441data.  (However, chain_offset is not always equal to callee_offset when
442!emit_frame_chain, so chain_offset isn't redundant.)
443
444In c600df9a4060da3c6121ff4d0b93f179eafd69d1 I switched to using
445chain_offset for the initialisation of the hard frame pointer:
446
447       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
448-                         stack_pointer_rtx, callee_offset,
449+                         stack_pointer_rtx, chain_offset,
450                          tmp1_rtx, tmp0_rtx, frame_pointer_needed);
451
452But the later REG_CFA_ADJUST_CFA handling still used callee_offset.
453
454I think the difference is harmless, but it's more logical for the
455CFA note to be in sync, and it's more convenient for later patches
456if it uses chain_offset.
457
458gcc/
459	* config/aarch64/aarch64.cc (aarch64_expand_prologue): Use
460	chain_offset rather than callee_offset.
461---
462 gcc/config/aarch64/aarch64.cc | 4 +---
463 1 file changed, 1 insertion(+), 3 deletions(-)
464
465diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
466index 5d473d161d9..4f233c95140 100644
467--- a/gcc/config/aarch64/aarch64.cc
468+++ b/gcc/config/aarch64/aarch64.cc
469@@ -9985,7 +9985,6 @@ aarch64_expand_prologue (void)
470   poly_int64 initial_adjust = frame.initial_adjust;
471   HOST_WIDE_INT callee_adjust = frame.callee_adjust;
472   poly_int64 final_adjust = frame.final_adjust;
473-  poly_int64 callee_offset = frame.callee_offset;
474   poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
475   poly_int64 below_hard_fp_saved_regs_size
476     = frame.below_hard_fp_saved_regs_size;
477@@ -10098,8 +10097,7 @@ aarch64_expand_prologue (void)
478 	     implicit.  */
479 	  if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
480 	    {
481-	      rtx src = plus_constant (Pmode, stack_pointer_rtx,
482-				       callee_offset);
483+	      rtx src = plus_constant (Pmode, stack_pointer_rtx, chain_offset);
484 	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
485 			    gen_rtx_SET (hard_frame_pointer_rtx, src));
486 	    }
487--
4882.34.1
489
490
491From b36a2a78040722dab6124366c5d6baf8eaf80aef Mon Sep 17 00:00:00 2001
492From: Richard Sandiford <richard.sandiford@arm.com>
493Date: Tue, 12 Sep 2023 16:07:13 +0100
494Subject: [PATCH 03/19] aarch64: Explicitly handle frames with no saved
495 registers
496
497If a frame has no saved registers, it can be allocated in one go.
498There is no need to treat the areas below and above the saved
499registers as separate.
500
501And if we allocate the frame in one go, it should be allocated
502as the initial_adjust rather than the final_adjust.  This allows the
503frame size to grow to guard_size - guard_used_by_caller before a stack
504probe is needed.  (A frame with no register saves is necessarily a
505leaf frame.)
506
507This is a no-op as things stand, since a leaf function will have
508no outgoing arguments, and so all the frame will be above where
509the saved registers normally go.
510
511gcc/
512	* config/aarch64/aarch64.cc (aarch64_layout_frame): Explicitly
513	allocate the frame in one go if there are no saved registers.
514---
515 gcc/config/aarch64/aarch64.cc | 8 +++++---
516 1 file changed, 5 insertions(+), 3 deletions(-)
517
518diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
519index 4f233c95140..37643041ffb 100644
520--- a/gcc/config/aarch64/aarch64.cc
521+++ b/gcc/config/aarch64/aarch64.cc
522@@ -8639,9 +8639,11 @@ aarch64_layout_frame (void)
523
524   HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
525   HOST_WIDE_INT const_saved_regs_size;
526-  if (frame.frame_size.is_constant (&const_size)
527-      && const_size < max_push_offset
528-      && known_eq (frame.hard_fp_offset, const_size))
529+  if (known_eq (frame.saved_regs_size, 0))
530+    frame.initial_adjust = frame.frame_size;
531+  else if (frame.frame_size.is_constant (&const_size)
532+	   && const_size < max_push_offset
533+	   && known_eq (frame.hard_fp_offset, const_size))
534     {
535       /* Simple, small frame with no outgoing arguments:
536
537--
5382.34.1
539
540
541From ada2ab0093596be707f23a3466ac82cff59fcffe Mon Sep 17 00:00:00 2001
542From: Richard Sandiford <richard.sandiford@arm.com>
543Date: Tue, 12 Sep 2023 16:07:13 +0100
544Subject: [PATCH 04/19] aarch64: Add bytes_below_saved_regs to frame info
545
546The frame layout code currently hard-codes the assumption that
547the number of bytes below the saved registers is equal to the
548size of the outgoing arguments.  This patch abstracts that
549value into a new field of aarch64_frame.
550
551gcc/
552	* config/aarch64/aarch64.h (aarch64_frame::bytes_below_saved_regs): New
553	field.
554	* config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize it,
555	and use it instead of crtl->outgoing_args_size.
556	(aarch64_get_separate_components): Use bytes_below_saved_regs instead
557	of outgoing_args_size.
558	(aarch64_process_components): Likewise.
559---
560 gcc/config/aarch64/aarch64.cc | 71 ++++++++++++++++++-----------------
561 gcc/config/aarch64/aarch64.h  |  5 +++
562 2 files changed, 41 insertions(+), 35 deletions(-)
563
564diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
565index 37643041ffb..dacc2b0e4dd 100644
566--- a/gcc/config/aarch64/aarch64.cc
567+++ b/gcc/config/aarch64/aarch64.cc
568@@ -8478,6 +8478,8 @@ aarch64_layout_frame (void)
569   gcc_assert (crtl->is_leaf
570 	      || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
571
572+  frame.bytes_below_saved_regs = crtl->outgoing_args_size;
573+
574   /* Now assign stack slots for the registers.  Start with the predicate
575      registers, since predicate LDR and STR have a relatively small
576      offset range.  These saves happen below the hard frame pointer.  */
577@@ -8582,18 +8584,18 @@ aarch64_layout_frame (void)
578
579   poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
580
581-  poly_int64 above_outgoing_args
582+  poly_int64 saved_regs_and_above
583     = aligned_upper_bound (varargs_and_saved_regs_size
584 			   + get_frame_size (),
585 			   STACK_BOUNDARY / BITS_PER_UNIT);
586
587   frame.hard_fp_offset
588-    = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
589+    = saved_regs_and_above - frame.below_hard_fp_saved_regs_size;
590
591   /* Both these values are already aligned.  */
592-  gcc_assert (multiple_p (crtl->outgoing_args_size,
593+  gcc_assert (multiple_p (frame.bytes_below_saved_regs,
594 			  STACK_BOUNDARY / BITS_PER_UNIT));
595-  frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
596+  frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs;
597
598   frame.locals_offset = frame.saved_varargs_size;
599
600@@ -8637,7 +8639,7 @@ aarch64_layout_frame (void)
601   else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
602     max_push_offset = 256;
603
604-  HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
605+  HOST_WIDE_INT const_size, const_below_saved_regs, const_fp_offset;
606   HOST_WIDE_INT const_saved_regs_size;
607   if (known_eq (frame.saved_regs_size, 0))
608     frame.initial_adjust = frame.frame_size;
609@@ -8645,31 +8647,31 @@ aarch64_layout_frame (void)
610 	   && const_size < max_push_offset
611 	   && known_eq (frame.hard_fp_offset, const_size))
612     {
613-      /* Simple, small frame with no outgoing arguments:
614+      /* Simple, small frame with no data below the saved registers.
615
616 	 stp reg1, reg2, [sp, -frame_size]!
617 	 stp reg3, reg4, [sp, 16]  */
618       frame.callee_adjust = const_size;
619     }
620-  else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
621+  else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs)
622 	   && frame.saved_regs_size.is_constant (&const_saved_regs_size)
623-	   && const_outgoing_args_size + const_saved_regs_size < 512
624-	   /* We could handle this case even with outgoing args, provided
625-	      that the number of args left us with valid offsets for all
626-	      predicate and vector save slots.  It's such a rare case that
627-	      it hardly seems worth the effort though.  */
628-	   && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
629+	   && const_below_saved_regs + const_saved_regs_size < 512
630+	   /* We could handle this case even with data below the saved
631+	      registers, provided that that data left us with valid offsets
632+	      for all predicate and vector save slots.  It's such a rare
633+	      case that it hardly seems worth the effort though.  */
634+	   && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
635 	   && !(cfun->calls_alloca
636 		&& frame.hard_fp_offset.is_constant (&const_fp_offset)
637 		&& const_fp_offset < max_push_offset))
638     {
639-      /* Frame with small outgoing arguments:
640+      /* Frame with small area below the saved registers:
641
642 	 sub sp, sp, frame_size
643-	 stp reg1, reg2, [sp, outgoing_args_size]
644-	 stp reg3, reg4, [sp, outgoing_args_size + 16]  */
645+	 stp reg1, reg2, [sp, bytes_below_saved_regs]
646+	 stp reg3, reg4, [sp, bytes_below_saved_regs + 16]  */
647       frame.initial_adjust = frame.frame_size;
648-      frame.callee_offset = const_outgoing_args_size;
649+      frame.callee_offset = const_below_saved_regs;
650     }
651   else if (saves_below_hard_fp_p
652 	   && known_eq (frame.saved_regs_size,
653@@ -8679,30 +8681,29 @@ aarch64_layout_frame (void)
654
655 	 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
656 	 save SVE registers relative to SP
657-	 sub sp, sp, outgoing_args_size  */
658+	 sub sp, sp, bytes_below_saved_regs  */
659       frame.initial_adjust = (frame.hard_fp_offset
660 			      + frame.below_hard_fp_saved_regs_size);
661-      frame.final_adjust = crtl->outgoing_args_size;
662+      frame.final_adjust = frame.bytes_below_saved_regs;
663     }
664   else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
665 	   && const_fp_offset < max_push_offset)
666     {
667-      /* Frame with large outgoing arguments or SVE saves, but with
668-	 a small local area:
669+      /* Frame with large area below the saved registers, or with SVE saves,
670+	 but with a small area above:
671
672 	 stp reg1, reg2, [sp, -hard_fp_offset]!
673 	 stp reg3, reg4, [sp, 16]
674 	 [sub sp, sp, below_hard_fp_saved_regs_size]
675 	 [save SVE registers relative to SP]
676-	 sub sp, sp, outgoing_args_size  */
677+	 sub sp, sp, bytes_below_saved_regs  */
678       frame.callee_adjust = const_fp_offset;
679       frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
680-      frame.final_adjust = crtl->outgoing_args_size;
681+      frame.final_adjust = frame.bytes_below_saved_regs;
682     }
683   else
684     {
685-      /* Frame with large local area and outgoing arguments or SVE saves,
686-	 using frame pointer:
687+      /* General case:
688
689 	 sub sp, sp, hard_fp_offset
690 	 stp x29, x30, [sp, 0]
691@@ -8710,10 +8711,10 @@ aarch64_layout_frame (void)
692 	 stp reg3, reg4, [sp, 16]
693 	 [sub sp, sp, below_hard_fp_saved_regs_size]
694 	 [save SVE registers relative to SP]
695-	 sub sp, sp, outgoing_args_size  */
696+	 sub sp, sp, bytes_below_saved_regs  */
697       frame.initial_adjust = frame.hard_fp_offset;
698       frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
699-      frame.final_adjust = crtl->outgoing_args_size;
700+      frame.final_adjust = frame.bytes_below_saved_regs;
701     }
702
703   /* Make sure the individual adjustments add up to the full frame size.  */
704@@ -9358,7 +9359,7 @@ aarch64_get_separate_components (void)
705 	if (frame_pointer_needed)
706 	  offset -= frame.below_hard_fp_saved_regs_size;
707 	else
708-	  offset += crtl->outgoing_args_size;
709+	  offset += frame.bytes_below_saved_regs;
710
711 	/* Check that we can access the stack slot of the register with one
712 	   direct load with no adjustments needed.  */
713@@ -9507,7 +9508,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
714       if (frame_pointer_needed)
715 	offset -= frame.below_hard_fp_saved_regs_size;
716       else
717-	offset += crtl->outgoing_args_size;
718+	offset += frame.bytes_below_saved_regs;
719
720       rtx addr = plus_constant (Pmode, ptr_reg, offset);
721       rtx mem = gen_frame_mem (mode, addr);
722@@ -9561,7 +9562,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
723       if (frame_pointer_needed)
724 	offset2 -= frame.below_hard_fp_saved_regs_size;
725       else
726-	offset2 += crtl->outgoing_args_size;
727+	offset2 += frame.bytes_below_saved_regs;
728       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
729       rtx mem2 = gen_frame_mem (mode, addr2);
730       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
731@@ -9635,10 +9636,10 @@ aarch64_stack_clash_protection_alloca_probe_range (void)
732    registers.  If POLY_SIZE is not large enough to require a probe this function
733    will only adjust the stack.  When allocating the stack space
734    FRAME_RELATED_P is then used to indicate if the allocation is frame related.
735-   FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
736-   arguments.  If we are then we ensure that any allocation larger than the ABI
737-   defined buffer needs a probe so that the invariant of having a 1KB buffer is
738-   maintained.
739+   FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
740+   the saved registers.  If we are then we ensure that any allocation
741+   larger than the ABI defined buffer needs a probe so that the
742+   invariant of having a 1KB buffer is maintained.
743
744    We emit barriers after each stack adjustment to prevent optimizations from
745    breaking the invariant that we never drop the stack more than a page.  This
746@@ -9847,7 +9848,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
747   /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
748      be probed.  This maintains the requirement that each page is probed at
749      least once.  For initial probing we probe only if the allocation is
750-     more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
751+     more than GUARD_SIZE - buffer, and below the saved registers we probe
752      if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
753      GUARD_SIZE.  This works that for any allocation that is large enough to
754      trigger a probe here, we'll have at least one, and if they're not large
755diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
756index 73b09e20508..0b6faa3ddf1 100644
757--- a/gcc/config/aarch64/aarch64.h
758+++ b/gcc/config/aarch64/aarch64.h
759@@ -777,6 +777,11 @@ struct GTY (()) aarch64_frame
760   /* The size of the callee-save registers with a slot in REG_OFFSET.  */
761   poly_int64 saved_regs_size;
762
763+  /* The number of bytes between the bottom of the static frame (the bottom
764+     of the outgoing arguments) and the bottom of the register save area.
765+     This value is always a multiple of STACK_BOUNDARY.  */
766+  poly_int64 bytes_below_saved_regs;
767+
768   /* The size of the callee-save registers with a slot in REG_OFFSET that
769      are saved below the hard frame pointer.  */
770   poly_int64 below_hard_fp_saved_regs_size;
771--
7722.34.1
773
774
775From 82f6b3e1b596ef0f4e3ac3bb9c6e88fb4458f402 Mon Sep 17 00:00:00 2001
776From: Richard Sandiford <richard.sandiford@arm.com>
777Date: Tue, 12 Sep 2023 16:07:14 +0100
778Subject: [PATCH 05/19] aarch64: Add bytes_below_hard_fp to frame info
779
780Following on from the previous bytes_below_saved_regs patch, this one
781records the number of bytes that are below the hard frame pointer.
782This eventually replaces below_hard_fp_saved_regs_size.
783
784If a frame pointer is not needed, the epilogue adds final_adjust
785to the stack pointer before restoring registers:
786
787     aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
788
789Therefore, if the epilogue needs to restore the stack pointer from
790the hard frame pointer, the directly corresponding offset is:
791
792     -bytes_below_hard_fp + final_adjust
793
794i.e. go from the hard frame pointer to the bottom of the frame,
795then add the same amount as if we were using the stack pointer
796from the outset.
797
798gcc/
799	* config/aarch64/aarch64.h (aarch64_frame::bytes_below_hard_fp): New
800	field.
801	* config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize it.
802	(aarch64_expand_epilogue): Use it instead of
803	below_hard_fp_saved_regs_size.
804---
805 gcc/config/aarch64/aarch64.cc | 6 +++---
806 gcc/config/aarch64/aarch64.h  | 5 +++++
807 2 files changed, 8 insertions(+), 3 deletions(-)
808
809diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
810index dacc2b0e4dd..a3f7aabcc59 100644
811--- a/gcc/config/aarch64/aarch64.cc
812+++ b/gcc/config/aarch64/aarch64.cc
813@@ -8530,6 +8530,7 @@ aarch64_layout_frame (void)
814      of the callee save area.  */
815   bool saves_below_hard_fp_p = maybe_ne (offset, 0);
816   frame.below_hard_fp_saved_regs_size = offset;
817+  frame.bytes_below_hard_fp = offset + frame.bytes_below_saved_regs;
818   if (frame.emit_frame_chain)
819     {
820       /* FP and LR are placed in the linkage record.  */
821@@ -10171,8 +10172,7 @@ aarch64_expand_epilogue (bool for_sibcall)
822   poly_int64 final_adjust = frame.final_adjust;
823   poly_int64 callee_offset = frame.callee_offset;
824   poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
825-  poly_int64 below_hard_fp_saved_regs_size
826-    = frame.below_hard_fp_saved_regs_size;
827+  poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
828   unsigned reg1 = frame.wb_pop_candidate1;
829   unsigned reg2 = frame.wb_pop_candidate2;
830   unsigned int last_gpr = (frame.is_scs_enabled
831@@ -10230,7 +10230,7 @@ aarch64_expand_epilogue (bool for_sibcall)
832        is restored on the instruction doing the writeback.  */
833     aarch64_add_offset (Pmode, stack_pointer_rtx,
834 			hard_frame_pointer_rtx,
835-			-callee_offset - below_hard_fp_saved_regs_size,
836+			-bytes_below_hard_fp + final_adjust,
837 			tmp1_rtx, tmp0_rtx, callee_adjust == 0);
838   else
839      /* The case where we need to re-use the register here is very rare, so
840diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
841index 0b6faa3ddf1..4263d29d29d 100644
842--- a/gcc/config/aarch64/aarch64.h
843+++ b/gcc/config/aarch64/aarch64.h
844@@ -786,6 +786,11 @@ struct GTY (()) aarch64_frame
845      are saved below the hard frame pointer.  */
846   poly_int64 below_hard_fp_saved_regs_size;
847
848+  /* The number of bytes between the bottom of the static frame (the bottom
849+     of the outgoing arguments) and the hard frame pointer.  This value is
850+     always a multiple of STACK_BOUNDARY.  */
851+  poly_int64 bytes_below_hard_fp;
852+
853   /* Offset from the base of the frame (incomming SP) to the
854      top of the locals area.  This value is always a multiple of
855      STACK_BOUNDARY.  */
856--
8572.34.1
858
859
860From 86fa43e9fe4a8bf954f2919f07cbe3646d1d1df3 Mon Sep 17 00:00:00 2001
861From: Richard Sandiford <richard.sandiford@arm.com>
862Date: Tue, 12 Sep 2023 16:07:14 +0100
863Subject: [PATCH 06/19] aarch64: Tweak aarch64_save/restore_callee_saves
864
865aarch64_save_callee_saves and aarch64_restore_callee_saves took
866a parameter called start_offset that gives the offset of the
867bottom of the saved register area from the current stack pointer.
868However, it's more convenient for later patches if we use the
869bottom of the entire frame as the reference point, rather than
870the bottom of the saved registers.
871
872Doing that removes the need for the callee_offset field.
873Other than that, this is not a win on its own.  It only really
874makes sense in combination with the follow-on patches.
875
876gcc/
877	* config/aarch64/aarch64.h (aarch64_frame::callee_offset): Delete.
878	* config/aarch64/aarch64.cc (aarch64_layout_frame): Remove
879	callee_offset handling.
880	(aarch64_save_callee_saves): Replace the start_offset parameter
881	with a bytes_below_sp parameter.
882	(aarch64_restore_callee_saves): Likewise.
883	(aarch64_expand_prologue): Update accordingly.
884	(aarch64_expand_epilogue): Likewise.
885---
886 gcc/config/aarch64/aarch64.cc | 56 +++++++++++++++++------------------
887 gcc/config/aarch64/aarch64.h  |  4 ---
888 2 files changed, 28 insertions(+), 32 deletions(-)
889
890diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
891index a3f7aabcc59..46ae5cf7673 100644
892--- a/gcc/config/aarch64/aarch64.cc
893+++ b/gcc/config/aarch64/aarch64.cc
894@@ -8604,7 +8604,6 @@ aarch64_layout_frame (void)
895   frame.final_adjust = 0;
896   frame.callee_adjust = 0;
897   frame.sve_callee_adjust = 0;
898-  frame.callee_offset = 0;
899
900   frame.wb_pop_candidate1 = frame.wb_push_candidate1;
901   frame.wb_pop_candidate2 = frame.wb_push_candidate2;
902@@ -8672,7 +8671,6 @@ aarch64_layout_frame (void)
903 	 stp reg1, reg2, [sp, bytes_below_saved_regs]
904 	 stp reg3, reg4, [sp, bytes_below_saved_regs + 16]  */
905       frame.initial_adjust = frame.frame_size;
906-      frame.callee_offset = const_below_saved_regs;
907     }
908   else if (saves_below_hard_fp_p
909 	   && known_eq (frame.saved_regs_size,
910@@ -9073,12 +9071,13 @@ aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
911 }
912
913 /* Emit code to save the callee-saved registers from register number START
914-   to LIMIT to the stack at the location starting at offset START_OFFSET,
915-   skipping any write-back candidates if SKIP_WB is true.  HARD_FP_VALID_P
916-   is true if the hard frame pointer has been set up.  */
917+   to LIMIT to the stack.  The stack pointer is currently BYTES_BELOW_SP
918+   bytes above the bottom of the static frame.  Skip any write-back
919+   candidates if SKIP_WB is true.  HARD_FP_VALID_P is true if the hard
920+   frame pointer has been set up.  */
921
922 static void
923-aarch64_save_callee_saves (poly_int64 start_offset,
924+aarch64_save_callee_saves (poly_int64 bytes_below_sp,
925 			   unsigned start, unsigned limit, bool skip_wb,
926 			   bool hard_fp_valid_p)
927 {
928@@ -9106,7 +9105,9 @@ aarch64_save_callee_saves (poly_int64 start_offset,
929
930       machine_mode mode = aarch64_reg_save_mode (regno);
931       reg = gen_rtx_REG (mode, regno);
932-      offset = start_offset + frame.reg_offset[regno];
933+      offset = (frame.reg_offset[regno]
934+		+ frame.bytes_below_saved_regs
935+		- bytes_below_sp);
936       rtx base_rtx = stack_pointer_rtx;
937       poly_int64 sp_offset = offset;
938
939@@ -9117,9 +9118,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
940       else if (GP_REGNUM_P (regno)
941 	       && (!offset.is_constant (&const_offset) || const_offset >= 512))
942 	{
943-	  gcc_assert (known_eq (start_offset, 0));
944-	  poly_int64 fp_offset
945-	    = frame.below_hard_fp_saved_regs_size;
946+	  poly_int64 fp_offset = frame.bytes_below_hard_fp - bytes_below_sp;
947 	  if (hard_fp_valid_p)
948 	    base_rtx = hard_frame_pointer_rtx;
949 	  else
950@@ -9183,12 +9182,13 @@ aarch64_save_callee_saves (poly_int64 start_offset,
951 }
952
953 /* Emit code to restore the callee registers from register number START
954-   up to and including LIMIT.  Restore from the stack offset START_OFFSET,
955-   skipping any write-back candidates if SKIP_WB is true.  Write the
956-   appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
957+   up to and including LIMIT.  The stack pointer is currently BYTES_BELOW_SP
958+   bytes above the bottom of the static frame.  Skip any write-back
959+   candidates if SKIP_WB is true.  Write the appropriate REG_CFA_RESTORE
960+   notes into CFI_OPS.  */
961
962 static void
963-aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
964+aarch64_restore_callee_saves (poly_int64 bytes_below_sp, unsigned start,
965 			      unsigned limit, bool skip_wb, rtx *cfi_ops)
966 {
967   aarch64_frame &frame = cfun->machine->frame;
968@@ -9214,7 +9214,9 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
969
970       machine_mode mode = aarch64_reg_save_mode (regno);
971       reg = gen_rtx_REG (mode, regno);
972-      offset = start_offset + frame.reg_offset[regno];
973+      offset = (frame.reg_offset[regno]
974+		+ frame.bytes_below_saved_regs
975+		- bytes_below_sp);
976       rtx base_rtx = stack_pointer_rtx;
977       if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
978 	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
979@@ -9990,8 +9992,6 @@ aarch64_expand_prologue (void)
980   HOST_WIDE_INT callee_adjust = frame.callee_adjust;
981   poly_int64 final_adjust = frame.final_adjust;
982   poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
983-  poly_int64 below_hard_fp_saved_regs_size
984-    = frame.below_hard_fp_saved_regs_size;
985   unsigned reg1 = frame.wb_push_candidate1;
986   unsigned reg2 = frame.wb_push_candidate2;
987   bool emit_frame_chain = frame.emit_frame_chain;
988@@ -10067,8 +10067,8 @@ aarch64_expand_prologue (void)
989 			     - frame.hard_fp_offset);
990   gcc_assert (known_ge (chain_offset, 0));
991
992-  /* The offset of the bottom of the save area from the current SP.  */
993-  poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
994+  /* The offset of the current SP from the bottom of the static frame.  */
995+  poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;
996
997   if (emit_frame_chain)
998     {
999@@ -10076,7 +10076,7 @@ aarch64_expand_prologue (void)
1000 	{
1001 	  reg1 = R29_REGNUM;
1002 	  reg2 = R30_REGNUM;
1003-	  aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
1004+	  aarch64_save_callee_saves (bytes_below_sp, reg1, reg2,
1005 				     false, false);
1006 	}
1007       else
1008@@ -10116,7 +10116,7 @@ aarch64_expand_prologue (void)
1009       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
1010     }
1011
1012-  aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
1013+  aarch64_save_callee_saves (bytes_below_sp, R0_REGNUM, R30_REGNUM,
1014 			     callee_adjust != 0 || emit_frame_chain,
1015 			     emit_frame_chain);
1016   if (maybe_ne (sve_callee_adjust, 0))
1017@@ -10126,16 +10126,17 @@ aarch64_expand_prologue (void)
1018       aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
1019 					      sve_callee_adjust,
1020 					      !frame_pointer_needed, false);
1021-      saved_regs_offset += sve_callee_adjust;
1022+      bytes_below_sp -= sve_callee_adjust;
1023     }
1024-  aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
1025+  aarch64_save_callee_saves (bytes_below_sp, P0_REGNUM, P15_REGNUM,
1026 			     false, emit_frame_chain);
1027-  aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
1028+  aarch64_save_callee_saves (bytes_below_sp, V0_REGNUM, V31_REGNUM,
1029 			     callee_adjust != 0 || emit_frame_chain,
1030 			     emit_frame_chain);
1031
1032   /* We may need to probe the final adjustment if it is larger than the guard
1033      that is assumed by the called.  */
1034+  gcc_assert (known_eq (bytes_below_sp, final_adjust));
1035   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
1036 					  !frame_pointer_needed, true);
1037 }
1038@@ -10170,7 +10171,6 @@ aarch64_expand_epilogue (bool for_sibcall)
1039   poly_int64 initial_adjust = frame.initial_adjust;
1040   HOST_WIDE_INT callee_adjust = frame.callee_adjust;
1041   poly_int64 final_adjust = frame.final_adjust;
1042-  poly_int64 callee_offset = frame.callee_offset;
1043   poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
1044   poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
1045   unsigned reg1 = frame.wb_pop_candidate1;
1046@@ -10240,9 +10240,9 @@ aarch64_expand_epilogue (bool for_sibcall)
1047
1048   /* Restore the vector registers before the predicate registers,
1049      so that we can use P4 as a temporary for big-endian SVE frames.  */
1050-  aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
1051+  aarch64_restore_callee_saves (final_adjust, V0_REGNUM, V31_REGNUM,
1052 				callee_adjust != 0, &cfi_ops);
1053-  aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
1054+  aarch64_restore_callee_saves (final_adjust, P0_REGNUM, P15_REGNUM,
1055 				false, &cfi_ops);
1056   if (maybe_ne (sve_callee_adjust, 0))
1057     aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
1058@@ -10250,7 +10250,7 @@ aarch64_expand_epilogue (bool for_sibcall)
1059   /* When shadow call stack is enabled, the scs_pop in the epilogue will
1060      restore x30, we don't need to restore x30 again in the traditional
1061      way.  */
1062-  aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
1063+  aarch64_restore_callee_saves (final_adjust + sve_callee_adjust,
1064 				R0_REGNUM, last_gpr,
1065 				callee_adjust != 0, &cfi_ops);
1066
1067diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
1068index 4263d29d29d..fd820b1be4e 100644
1069--- a/gcc/config/aarch64/aarch64.h
1070+++ b/gcc/config/aarch64/aarch64.h
1071@@ -813,10 +813,6 @@ struct GTY (()) aarch64_frame
1072      It is zero when no push is used.  */
1073   HOST_WIDE_INT callee_adjust;
1074
1075-  /* The offset from SP to the callee-save registers after initial_adjust.
1076-     It may be non-zero if no push is used (ie. callee_adjust == 0).  */
1077-  poly_int64 callee_offset;
1078-
1079   /* The size of the stack adjustment before saving or after restoring
1080      SVE registers.  */
1081   poly_int64 sve_callee_adjust;
1082--
10832.34.1
1084
1085
1086From 8ae9181426f2700c2e5a2909487fa630e6fa406b Mon Sep 17 00:00:00 2001
1087From: Richard Sandiford <richard.sandiford@arm.com>
1088Date: Tue, 12 Sep 2023 16:07:15 +0100
1089Subject: [PATCH 07/19] aarch64: Only calculate chain_offset if there is a
1090 chain
1091
1092After previous patches, it is no longer necessary to calculate
1093a chain_offset in cases where there is no chain record.
1094
1095gcc/
1096	* config/aarch64/aarch64.cc (aarch64_expand_prologue): Move the
1097	calculation of chain_offset into the emit_frame_chain block.
1098---
1099 gcc/config/aarch64/aarch64.cc | 10 +++++-----
1100 1 file changed, 5 insertions(+), 5 deletions(-)
1101
1102diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
1103index 46ae5cf7673..0e9b9717c08 100644
1104--- a/gcc/config/aarch64/aarch64.cc
1105+++ b/gcc/config/aarch64/aarch64.cc
1106@@ -10062,16 +10062,16 @@ aarch64_expand_prologue (void)
1107   if (callee_adjust != 0)
1108     aarch64_push_regs (reg1, reg2, callee_adjust);
1109
1110-  /* The offset of the frame chain record (if any) from the current SP.  */
1111-  poly_int64 chain_offset = (initial_adjust + callee_adjust
1112-			     - frame.hard_fp_offset);
1113-  gcc_assert (known_ge (chain_offset, 0));
1114-
1115   /* The offset of the current SP from the bottom of the static frame.  */
1116   poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;
1117
1118   if (emit_frame_chain)
1119     {
1120+      /* The offset of the frame chain record (if any) from the current SP.  */
1121+      poly_int64 chain_offset = (initial_adjust + callee_adjust
1122+				 - frame.hard_fp_offset);
1123+      gcc_assert (known_ge (chain_offset, 0));
1124+
1125       if (callee_adjust == 0)
1126 	{
1127 	  reg1 = R29_REGNUM;
1128--
11292.34.1
1130
1131
1132From 375794feb614cee1f41b710b9cc1b6f25da6c1cb Mon Sep 17 00:00:00 2001
1133From: Richard Sandiford <richard.sandiford@arm.com>
1134Date: Tue, 12 Sep 2023 16:07:15 +0100
1135Subject: [PATCH 08/19] aarch64: Rename locals_offset to bytes_above_locals
1136MIME-Version: 1.0
1137Content-Type: text/plain; charset=UTF-8
1138Content-Transfer-Encoding: 8bit
1139
1140locals_offset was described as:
1141
1142  /* Offset from the base of the frame (incomming SP) to the
1143     top of the locals area.  This value is always a multiple of
1144     STACK_BOUNDARY.  */
1145
1146This is implicitly an “upside down” view of the frame: the incoming
1147SP is at offset 0, and anything N bytes below the incoming SP is at
1148offset N (rather than -N).
1149
1150However, reg_offset instead uses a “right way up” view; that is,
1151it views offsets in address terms.  Something above X is at a
1152positive offset from X and something below X is at a negative
1153offset from X.
1154
1155Also, even on FRAME_GROWS_DOWNWARD targets like AArch64,
1156target-independent code views offsets in address terms too:
1157locals are allocated at negative offsets to virtual_stack_vars.
1158
1159It seems confusing to have *_offset fields of the same structure
1160using different polarities like this.  This patch tries to avoid
1161that by renaming locals_offset to bytes_above_locals.
1162
1163gcc/
1164	* config/aarch64/aarch64.h (aarch64_frame::locals_offset): Rename to...
1165	(aarch64_frame::bytes_above_locals): ...this.
1166	* config/aarch64/aarch64.cc (aarch64_layout_frame)
1167	(aarch64_initial_elimination_offset): Update accordingly.
1168---
1169 gcc/config/aarch64/aarch64.cc | 6 +++---
1170 gcc/config/aarch64/aarch64.h  | 6 +++---
1171 2 files changed, 6 insertions(+), 6 deletions(-)
1172
1173diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
1174index 0e9b9717c08..0a22f91520e 100644
1175--- a/gcc/config/aarch64/aarch64.cc
1176+++ b/gcc/config/aarch64/aarch64.cc
1177@@ -8598,7 +8598,7 @@ aarch64_layout_frame (void)
1178 			  STACK_BOUNDARY / BITS_PER_UNIT));
1179   frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs;
1180
1181-  frame.locals_offset = frame.saved_varargs_size;
1182+  frame.bytes_above_locals = frame.saved_varargs_size;
1183
1184   frame.initial_adjust = 0;
1185   frame.final_adjust = 0;
1186@@ -12754,13 +12754,13 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to)
1187 	return frame.hard_fp_offset;
1188
1189       if (from == FRAME_POINTER_REGNUM)
1190-	return frame.hard_fp_offset - frame.locals_offset;
1191+	return frame.hard_fp_offset - frame.bytes_above_locals;
1192     }
1193
1194   if (to == STACK_POINTER_REGNUM)
1195     {
1196       if (from == FRAME_POINTER_REGNUM)
1197-	return frame.frame_size - frame.locals_offset;
1198+	return frame.frame_size - frame.bytes_above_locals;
1199     }
1200
1201   return frame.frame_size;
1202diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
1203index fd820b1be4e..7ae12d13e2b 100644
1204--- a/gcc/config/aarch64/aarch64.h
1205+++ b/gcc/config/aarch64/aarch64.h
1206@@ -791,10 +791,10 @@ struct GTY (()) aarch64_frame
1207      always a multiple of STACK_BOUNDARY.  */
1208   poly_int64 bytes_below_hard_fp;
1209
1210-  /* Offset from the base of the frame (incomming SP) to the
1211-     top of the locals area.  This value is always a multiple of
1212+  /* The number of bytes between the top of the locals area and the top
1213+     of the frame (the incomming SP).  This value is always a multiple of
1214      STACK_BOUNDARY.  */
1215-  poly_int64 locals_offset;
1216+  poly_int64 bytes_above_locals;
1217
1218   /* Offset from the base of the frame (incomming SP) to the
1219      hard_frame_pointer.  This value is always a multiple of
1220--
12212.34.1
1222
1223
1224From 1a9ea1c45c75615ffbfabe652b3598a1d7be2168 Mon Sep 17 00:00:00 2001
1225From: Richard Sandiford <richard.sandiford@arm.com>
1226Date: Tue, 12 Sep 2023 16:07:16 +0100
1227Subject: [PATCH 09/19] aarch64: Rename hard_fp_offset to bytes_above_hard_fp
1228MIME-Version: 1.0
1229Content-Type: text/plain; charset=UTF-8
1230Content-Transfer-Encoding: 8bit
1231
1232Similarly to the previous locals_offset patch, hard_fp_offset
1233was described as:
1234
1235  /* Offset from the base of the frame (incomming SP) to the
1236     hard_frame_pointer.  This value is always a multiple of
1237     STACK_BOUNDARY.  */
1238  poly_int64 hard_fp_offset;
1239
1240which again took an “upside-down” view: higher offsets meant lower
1241addresses.  This patch renames the field to bytes_above_hard_fp instead.
1242
1243gcc/
1244	* config/aarch64/aarch64.h (aarch64_frame::hard_fp_offset): Rename
1245	to...
1246	(aarch64_frame::bytes_above_hard_fp): ...this.
1247	* config/aarch64/aarch64.cc (aarch64_layout_frame)
1248	(aarch64_expand_prologue): Update accordingly.
1249	(aarch64_initial_elimination_offset): Likewise.
1250---
1251 gcc/config/aarch64/aarch64.cc | 26 +++++++++++++-------------
1252 gcc/config/aarch64/aarch64.h  |  6 +++---
1253 2 files changed, 16 insertions(+), 16 deletions(-)
1254
1255diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
1256index 0a22f91520e..95499ae49ba 100644
1257--- a/gcc/config/aarch64/aarch64.cc
1258+++ b/gcc/config/aarch64/aarch64.cc
1259@@ -8590,7 +8590,7 @@ aarch64_layout_frame (void)
1260 			   + get_frame_size (),
1261 			   STACK_BOUNDARY / BITS_PER_UNIT);
1262
1263-  frame.hard_fp_offset
1264+  frame.bytes_above_hard_fp
1265     = saved_regs_and_above - frame.below_hard_fp_saved_regs_size;
1266
1267   /* Both these values are already aligned.  */
1268@@ -8639,13 +8639,13 @@ aarch64_layout_frame (void)
1269   else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
1270     max_push_offset = 256;
1271
1272-  HOST_WIDE_INT const_size, const_below_saved_regs, const_fp_offset;
1273+  HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
1274   HOST_WIDE_INT const_saved_regs_size;
1275   if (known_eq (frame.saved_regs_size, 0))
1276     frame.initial_adjust = frame.frame_size;
1277   else if (frame.frame_size.is_constant (&const_size)
1278 	   && const_size < max_push_offset
1279-	   && known_eq (frame.hard_fp_offset, const_size))
1280+	   && known_eq (frame.bytes_above_hard_fp, const_size))
1281     {
1282       /* Simple, small frame with no data below the saved registers.
1283
1284@@ -8662,8 +8662,8 @@ aarch64_layout_frame (void)
1285 	      case that it hardly seems worth the effort though.  */
1286 	   && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
1287 	   && !(cfun->calls_alloca
1288-		&& frame.hard_fp_offset.is_constant (&const_fp_offset)
1289-		&& const_fp_offset < max_push_offset))
1290+		&& frame.bytes_above_hard_fp.is_constant (&const_above_fp)
1291+		&& const_above_fp < max_push_offset))
1292     {
1293       /* Frame with small area below the saved registers:
1294
1295@@ -8681,12 +8681,12 @@ aarch64_layout_frame (void)
1296 	 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
1297 	 save SVE registers relative to SP
1298 	 sub sp, sp, bytes_below_saved_regs  */
1299-      frame.initial_adjust = (frame.hard_fp_offset
1300+      frame.initial_adjust = (frame.bytes_above_hard_fp
1301 			      + frame.below_hard_fp_saved_regs_size);
1302       frame.final_adjust = frame.bytes_below_saved_regs;
1303     }
1304-  else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
1305-	   && const_fp_offset < max_push_offset)
1306+  else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp)
1307+	   && const_above_fp < max_push_offset)
1308     {
1309       /* Frame with large area below the saved registers, or with SVE saves,
1310 	 but with a small area above:
1311@@ -8696,7 +8696,7 @@ aarch64_layout_frame (void)
1312 	 [sub sp, sp, below_hard_fp_saved_regs_size]
1313 	 [save SVE registers relative to SP]
1314 	 sub sp, sp, bytes_below_saved_regs  */
1315-      frame.callee_adjust = const_fp_offset;
1316+      frame.callee_adjust = const_above_fp;
1317       frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
1318       frame.final_adjust = frame.bytes_below_saved_regs;
1319     }
1320@@ -8711,7 +8711,7 @@ aarch64_layout_frame (void)
1321 	 [sub sp, sp, below_hard_fp_saved_regs_size]
1322 	 [save SVE registers relative to SP]
1323 	 sub sp, sp, bytes_below_saved_regs  */
1324-      frame.initial_adjust = frame.hard_fp_offset;
1325+      frame.initial_adjust = frame.bytes_above_hard_fp;
1326       frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
1327       frame.final_adjust = frame.bytes_below_saved_regs;
1328     }
1329@@ -10069,7 +10069,7 @@ aarch64_expand_prologue (void)
1330     {
1331       /* The offset of the frame chain record (if any) from the current SP.  */
1332       poly_int64 chain_offset = (initial_adjust + callee_adjust
1333-				 - frame.hard_fp_offset);
1334+				 - frame.bytes_above_hard_fp);
1335       gcc_assert (known_ge (chain_offset, 0));
1336
1337       if (callee_adjust == 0)
1338@@ -12751,10 +12751,10 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to)
1339   if (to == HARD_FRAME_POINTER_REGNUM)
1340     {
1341       if (from == ARG_POINTER_REGNUM)
1342-	return frame.hard_fp_offset;
1343+	return frame.bytes_above_hard_fp;
1344
1345       if (from == FRAME_POINTER_REGNUM)
1346-	return frame.hard_fp_offset - frame.bytes_above_locals;
1347+	return frame.bytes_above_hard_fp - frame.bytes_above_locals;
1348     }
1349
1350   if (to == STACK_POINTER_REGNUM)
1351diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
1352index 7ae12d13e2b..3808f49e9ca 100644
1353--- a/gcc/config/aarch64/aarch64.h
1354+++ b/gcc/config/aarch64/aarch64.h
1355@@ -796,10 +796,10 @@ struct GTY (()) aarch64_frame
1356      STACK_BOUNDARY.  */
1357   poly_int64 bytes_above_locals;
1358
1359-  /* Offset from the base of the frame (incomming SP) to the
1360-     hard_frame_pointer.  This value is always a multiple of
1361+  /* The number of bytes between the hard_frame_pointer and the top of
1362+     the frame (the incomming SP).  This value is always a multiple of
1363      STACK_BOUNDARY.  */
1364-  poly_int64 hard_fp_offset;
1365+  poly_int64 bytes_above_hard_fp;
1366
1367   /* The size of the frame.  This value is the offset from base of the
1368      frame (incomming SP) to the stack_pointer.  This value is always
1369--
13702.34.1
1371
1372
1373From d202ce1ecf60a36a3e1009917dd76109248ce9be Mon Sep 17 00:00:00 2001
1374From: Richard Sandiford <richard.sandiford@arm.com>
1375Date: Tue, 12 Sep 2023 16:07:16 +0100
1376Subject: [PATCH 10/19] aarch64: Tweak frame_size comment
1377MIME-Version: 1.0
1378Content-Type: text/plain; charset=UTF-8
1379Content-Transfer-Encoding: 8bit
1380
1381This patch fixes another case in which a value was described with
1382an “upside-down” view.
1383
1384gcc/
1385	* config/aarch64/aarch64.h (aarch64_frame::frame_size): Tweak comment.
1386---
1387 gcc/config/aarch64/aarch64.h | 4 ++--
1388 1 file changed, 2 insertions(+), 2 deletions(-)
1389
1390diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
1391index 3808f49e9ca..108a5731b0d 100644
1392--- a/gcc/config/aarch64/aarch64.h
1393+++ b/gcc/config/aarch64/aarch64.h
1394@@ -801,8 +801,8 @@ struct GTY (()) aarch64_frame
1395      STACK_BOUNDARY.  */
1396   poly_int64 bytes_above_hard_fp;
1397
1398-  /* The size of the frame.  This value is the offset from base of the
1399-     frame (incomming SP) to the stack_pointer.  This value is always
1400+  /* The size of the frame, i.e. the number of bytes between the bottom
1401+     of the outgoing arguments and the incoming SP.  This value is always
1402      a multiple of STACK_BOUNDARY.  */
1403   poly_int64 frame_size;
1404
1405--
14062.34.1
1407
1408
1409From f2b585375205b0a1802d79c682ba33766ecd1f0f Mon Sep 17 00:00:00 2001
1410From: Richard Sandiford <richard.sandiford@arm.com>
1411Date: Tue, 12 Sep 2023 16:07:17 +0100
1412Subject: [PATCH 11/19] aarch64: Measure reg_offset from the bottom of the
1413 frame
1414
1415reg_offset was measured from the bottom of the saved register area.
1416This made perfect sense with the original layout, since the bottom
1417of the saved register area was also the hard frame pointer address.
1418It became slightly less obvious with SVE, since we save SVE
1419registers below the hard frame pointer, but it still made sense.
1420
1421However, if we want to allow different frame layouts, it's more
1422convenient and obvious to measure reg_offset from the bottom of
1423the frame.  After previous patches, it's also a slight simplification
1424in its own right.
1425
1426gcc/
1427	* config/aarch64/aarch64.h (aarch64_frame): Add comment above
1428	reg_offset.
1429	* config/aarch64/aarch64.cc (aarch64_layout_frame): Walk offsets
1430	from the bottom of the frame, rather than the bottom of the saved
1431	register area.  Measure reg_offset from the bottom of the frame
1432	rather than the bottom of the saved register area.
1433	(aarch64_save_callee_saves): Update accordingly.
1434	(aarch64_restore_callee_saves): Likewise.
1435	(aarch64_get_separate_components): Likewise.
1436	(aarch64_process_components): Likewise.
1437---
1438 gcc/config/aarch64/aarch64.cc | 53 ++++++++++++++++-------------------
1439 gcc/config/aarch64/aarch64.h  |  3 ++
1440 2 files changed, 27 insertions(+), 29 deletions(-)
1441
1442diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
1443index 95499ae49ba..af99807ef8a 100644
1444--- a/gcc/config/aarch64/aarch64.cc
1445+++ b/gcc/config/aarch64/aarch64.cc
1446@@ -8400,7 +8400,6 @@ aarch64_needs_frame_chain (void)
1447 static void
1448 aarch64_layout_frame (void)
1449 {
1450-  poly_int64 offset = 0;
1451   int regno, last_fp_reg = INVALID_REGNUM;
1452   machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
1453   poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
1454@@ -8478,7 +8477,9 @@ aarch64_layout_frame (void)
1455   gcc_assert (crtl->is_leaf
1456 	      || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
1457
1458-  frame.bytes_below_saved_regs = crtl->outgoing_args_size;
1459+  poly_int64 offset = crtl->outgoing_args_size;
1460+  gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
1461+  frame.bytes_below_saved_regs = offset;
1462
1463   /* Now assign stack slots for the registers.  Start with the predicate
1464      registers, since predicate LDR and STR have a relatively small
1465@@ -8490,7 +8491,8 @@ aarch64_layout_frame (void)
1466 	offset += BYTES_PER_SVE_PRED;
1467       }
1468
1469-  if (maybe_ne (offset, 0))
1470+  poly_int64 saved_prs_size = offset - frame.bytes_below_saved_regs;
1471+  if (maybe_ne (saved_prs_size, 0))
1472     {
1473       /* If we have any vector registers to save above the predicate registers,
1474 	 the offset of the vector register save slots need to be a multiple
1475@@ -8508,10 +8510,10 @@ aarch64_layout_frame (void)
1476 	offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1477       else
1478 	{
1479-	  if (known_le (offset, vector_save_size))
1480-	    offset = vector_save_size;
1481-	  else if (known_le (offset, vector_save_size * 2))
1482-	    offset = vector_save_size * 2;
1483+	  if (known_le (saved_prs_size, vector_save_size))
1484+	    offset = frame.bytes_below_saved_regs + vector_save_size;
1485+	  else if (known_le (saved_prs_size, vector_save_size * 2))
1486+	    offset = frame.bytes_below_saved_regs + vector_save_size * 2;
1487 	  else
1488 	    gcc_unreachable ();
1489 	}
1490@@ -8528,9 +8530,10 @@ aarch64_layout_frame (void)
1491
1492   /* OFFSET is now the offset of the hard frame pointer from the bottom
1493      of the callee save area.  */
1494-  bool saves_below_hard_fp_p = maybe_ne (offset, 0);
1495-  frame.below_hard_fp_saved_regs_size = offset;
1496-  frame.bytes_below_hard_fp = offset + frame.bytes_below_saved_regs;
1497+  frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
1498+  bool saves_below_hard_fp_p
1499+    = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
1500+  frame.bytes_below_hard_fp = offset;
1501   if (frame.emit_frame_chain)
1502     {
1503       /* FP and LR are placed in the linkage record.  */
1504@@ -8581,9 +8584,10 @@ aarch64_layout_frame (void)
1505
1506   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1507
1508-  frame.saved_regs_size = offset;
1509+  frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
1510
1511-  poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
1512+  poly_int64 varargs_and_saved_regs_size
1513+    = frame.saved_regs_size + frame.saved_varargs_size;
1514
1515   poly_int64 saved_regs_and_above
1516     = aligned_upper_bound (varargs_and_saved_regs_size
1517@@ -9105,9 +9109,7 @@ aarch64_save_callee_saves (poly_int64 bytes_below_sp,
1518
1519       machine_mode mode = aarch64_reg_save_mode (regno);
1520       reg = gen_rtx_REG (mode, regno);
1521-      offset = (frame.reg_offset[regno]
1522-		+ frame.bytes_below_saved_regs
1523-		- bytes_below_sp);
1524+      offset = frame.reg_offset[regno] - bytes_below_sp;
1525       rtx base_rtx = stack_pointer_rtx;
1526       poly_int64 sp_offset = offset;
1527
1528@@ -9214,9 +9216,7 @@ aarch64_restore_callee_saves (poly_int64 bytes_below_sp, unsigned start,
1529
1530       machine_mode mode = aarch64_reg_save_mode (regno);
1531       reg = gen_rtx_REG (mode, regno);
1532-      offset = (frame.reg_offset[regno]
1533-		+ frame.bytes_below_saved_regs
1534-		- bytes_below_sp);
1535+      offset = frame.reg_offset[regno] - bytes_below_sp;
1536       rtx base_rtx = stack_pointer_rtx;
1537       if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
1538 	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
1539@@ -9355,14 +9355,12 @@ aarch64_get_separate_components (void)
1540 	   it as a stack probe for -fstack-clash-protection.  */
1541 	if (flag_stack_clash_protection
1542 	    && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
1543-	    && known_eq (offset, 0))
1544+	    && known_eq (offset, frame.bytes_below_saved_regs))
1545 	  continue;
1546
1547 	/* Get the offset relative to the register we'll use.  */
1548 	if (frame_pointer_needed)
1549-	  offset -= frame.below_hard_fp_saved_regs_size;
1550-	else
1551-	  offset += frame.bytes_below_saved_regs;
1552+	  offset -= frame.bytes_below_hard_fp;
1553
1554 	/* Check that we can access the stack slot of the register with one
1555 	   direct load with no adjustments needed.  */
1556@@ -9509,9 +9507,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
1557       rtx reg = gen_rtx_REG (mode, regno);
1558       poly_int64 offset = frame.reg_offset[regno];
1559       if (frame_pointer_needed)
1560-	offset -= frame.below_hard_fp_saved_regs_size;
1561-      else
1562-	offset += frame.bytes_below_saved_regs;
1563+	offset -= frame.bytes_below_hard_fp;
1564
1565       rtx addr = plus_constant (Pmode, ptr_reg, offset);
1566       rtx mem = gen_frame_mem (mode, addr);
1567@@ -9563,9 +9559,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
1568       /* REGNO2 can be saved/restored in a pair with REGNO.  */
1569       rtx reg2 = gen_rtx_REG (mode, regno2);
1570       if (frame_pointer_needed)
1571-	offset2 -= frame.below_hard_fp_saved_regs_size;
1572-      else
1573-	offset2 += frame.bytes_below_saved_regs;
1574+	offset2 -= frame.bytes_below_hard_fp;
1575       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
1576       rtx mem2 = gen_frame_mem (mode, addr2);
1577       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
1578@@ -9681,7 +9675,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
1579   if (final_adjustment_p
1580       && known_eq (frame.below_hard_fp_saved_regs_size, 0))
1581     {
1582-      poly_int64 lr_offset = frame.reg_offset[LR_REGNUM];
1583+      poly_int64 lr_offset = (frame.reg_offset[LR_REGNUM]
1584+			      - frame.bytes_below_saved_regs);
1585       if (known_ge (lr_offset, 0))
1586 	min_probe_threshold -= lr_offset.to_constant ();
1587       else
1588diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
1589index 108a5731b0d..c8becb098c8 100644
1590--- a/gcc/config/aarch64/aarch64.h
1591+++ b/gcc/config/aarch64/aarch64.h
1592@@ -766,6 +766,9 @@ extern enum aarch64_processor aarch64_tune;
1593 #ifdef HAVE_POLY_INT_H
1594 struct GTY (()) aarch64_frame
1595 {
1596+  /* The offset from the bottom of the static frame (the bottom of the
1597+     outgoing arguments) of each register save slot, or -2 if no save is
1598+     needed.  */
1599   poly_int64 reg_offset[LAST_SAVED_REGNUM + 1];
1600
1601   /* The number of extra stack bytes taken up by register varargs.
1602--
16032.34.1
1604
1605
1606From 79faabda181d0d9fd29a3cf5726ba65bdee945b5 Mon Sep 17 00:00:00 2001
1607From: Richard Sandiford <richard.sandiford@arm.com>
1608Date: Tue, 12 Sep 2023 16:07:17 +0100
1609Subject: [PATCH 12/19] aarch64: Simplify top of frame allocation
1610
1611After previous patches, it no longer really makes sense to allocate
1612the top of the frame in terms of varargs_and_saved_regs_size and
1613saved_regs_and_above.
1614
1615gcc/
1616	* config/aarch64/aarch64.cc (aarch64_layout_frame): Simplify
1617	the allocation of the top of the frame.
1618---
1619 gcc/config/aarch64/aarch64.cc | 23 ++++++++---------------
1620 1 file changed, 8 insertions(+), 15 deletions(-)
1621
1622diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
1623index af99807ef8a..31b00094c2a 100644
1624--- a/gcc/config/aarch64/aarch64.cc
1625+++ b/gcc/config/aarch64/aarch64.cc
1626@@ -8586,23 +8586,16 @@ aarch64_layout_frame (void)
1627
1628   frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
1629
1630-  poly_int64 varargs_and_saved_regs_size
1631-    = frame.saved_regs_size + frame.saved_varargs_size;
1632-
1633-  poly_int64 saved_regs_and_above
1634-    = aligned_upper_bound (varargs_and_saved_regs_size
1635-			   + get_frame_size (),
1636-			   STACK_BOUNDARY / BITS_PER_UNIT);
1637-
1638-  frame.bytes_above_hard_fp
1639-    = saved_regs_and_above - frame.below_hard_fp_saved_regs_size;
1640+  offset += get_frame_size ();
1641+  offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1642+  auto top_of_locals = offset;
1643
1644-  /* Both these values are already aligned.  */
1645-  gcc_assert (multiple_p (frame.bytes_below_saved_regs,
1646-			  STACK_BOUNDARY / BITS_PER_UNIT));
1647-  frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs;
1648+  offset += frame.saved_varargs_size;
1649+  gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
1650+  frame.frame_size = offset;
1651
1652-  frame.bytes_above_locals = frame.saved_varargs_size;
1653+  frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp;
1654+  frame.bytes_above_locals = frame.frame_size - top_of_locals;
1655
1656   frame.initial_adjust = 0;
1657   frame.final_adjust = 0;
1658--
16592.34.1
1660
1661
1662From 4e62049e403b141e6f916176160dac8cbd65fe47 Mon Sep 17 00:00:00 2001
1663From: Richard Sandiford <richard.sandiford@arm.com>
1664Date: Tue, 12 Sep 2023 16:07:18 +0100
1665Subject: [PATCH 13/19] aarch64: Minor initial adjustment tweak
1666
1667This patch just changes a calculation of initial_adjust
1668to one that makes it slightly more obvious that the total
1669adjustment is frame.frame_size.
1670
1671gcc/
1672	* config/aarch64/aarch64.cc (aarch64_layout_frame): Tweak
1673	calculation of initial_adjust for frames in which all saves
1674	are SVE saves.
1675---
1676 gcc/config/aarch64/aarch64.cc | 5 ++---
1677 1 file changed, 2 insertions(+), 3 deletions(-)
1678
1679diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
1680index 31b00094c2a..1aa79da0673 100644
1681--- a/gcc/config/aarch64/aarch64.cc
1682+++ b/gcc/config/aarch64/aarch64.cc
1683@@ -8675,11 +8675,10 @@ aarch64_layout_frame (void)
1684     {
1685       /* Frame in which all saves are SVE saves:
1686
1687-	 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
1688+	 sub sp, sp, frame_size - bytes_below_saved_regs
1689 	 save SVE registers relative to SP
1690 	 sub sp, sp, bytes_below_saved_regs  */
1691-      frame.initial_adjust = (frame.bytes_above_hard_fp
1692-			      + frame.below_hard_fp_saved_regs_size);
1693+      frame.initial_adjust = frame.frame_size - frame.bytes_below_saved_regs;
1694       frame.final_adjust = frame.bytes_below_saved_regs;
1695     }
1696   else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp)
1697--
16982.34.1
1699
1700
1701From aaa1a0a5912d9e5d571e5f1c6f09ceac99544ab5 Mon Sep 17 00:00:00 2001
1702From: Richard Sandiford <richard.sandiford@arm.com>
1703Date: Tue, 12 Sep 2023 16:07:18 +0100
1704Subject: [PATCH 14/19] aarch64: Tweak stack clash boundary condition
1705
1706The AArch64 ABI says that, when stack clash protection is used,
1707there can be a maximum of 1KiB of unprobed space at sp on entry
1708to a function.  Therefore, we need to probe when allocating
1709>= guard_size - 1KiB of data (>= rather than >).  This is what
1710GCC does.
1711
1712If an allocation is exactly guard_size bytes, it is enough to allocate
1713those bytes and probe once at offset 1024.  It isn't possible to use a
1714single probe at any other offset: higher would complicate later code,
1715by leaving more unprobed space than usual, while lower would risk
1716leaving an entire page unprobed.  For simplicity, the code probes all
1717allocations at offset 1024.
1718
1719Some register saves also act as probes.  If we need to allocate
1720more space below the last such register save probe, we need to
1721probe the allocation if it is > 1KiB.  Again, this allocation is
1722then sometimes (but not always) probed at offset 1024.  This sort of
1723allocation is currently only used for outgoing arguments, which are
1724rarely this big.
1725
1726However, the code also probed if this final outgoing-arguments
1727allocation was == 1KiB, rather than just > 1KiB.  This isn't
1728necessary, since the register save then probes at offset 1024
1729as required.  Continuing to probe allocations of exactly 1KiB
1730would complicate later patches.
1731
1732gcc/
1733	* config/aarch64/aarch64.cc (aarch64_allocate_and_probe_stack_space):
1734	Don't probe final allocations that are exactly 1KiB in size (after
1735	unprobed space above the final allocation has been deducted).
1736
1737gcc/testsuite/
1738	* gcc.target/aarch64/stack-check-prologue-17.c: New test.
1739---
1740 gcc/config/aarch64/aarch64.cc                 |  4 +-
1741 .../aarch64/stack-check-prologue-17.c         | 55 +++++++++++++++++++
1742 2 files changed, 58 insertions(+), 1 deletion(-)
1743 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
1744
1745diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
1746index 1aa79da0673..5cad847977a 100644
1747--- a/gcc/config/aarch64/aarch64.cc
1748+++ b/gcc/config/aarch64/aarch64.cc
1749@@ -9648,9 +9648,11 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
1750   HOST_WIDE_INT guard_size
1751     = 1 << param_stack_clash_protection_guard_size;
1752   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
1753+  HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT;
1754+  gcc_assert (multiple_p (poly_size, byte_sp_alignment));
1755   HOST_WIDE_INT min_probe_threshold
1756     = (final_adjustment_p
1757-       ? guard_used_by_caller
1758+       ? guard_used_by_caller + byte_sp_alignment
1759        : guard_size - guard_used_by_caller);
1760   /* When doing the final adjustment for the outgoing arguments, take into
1761      account any unprobed space there is above the current SP.  There are
1762diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
1763new file mode 100644
1764index 00000000000..0d8a25d73a2
1765--- /dev/null
1766+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
1767@@ -0,0 +1,55 @@
1768+/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12" } */
1769+/* { dg-final { check-function-bodies "**" "" } } */
1770+
1771+void f(int, ...);
1772+void g();
1773+
1774+/*
1775+** test1:
1776+**	...
1777+**	str	x30, \[sp\]
1778+**	sub	sp, sp, #1024
1779+**	cbnz	w0, .*
1780+**	bl	g
1781+**	...
1782+*/
1783+int test1(int z) {
1784+  __uint128_t x = 0;
1785+  int y[0x400];
1786+  if (z)
1787+    {
1788+      f(0, 0, 0, 0, 0, 0, 0, &y,
1789+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1790+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1791+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1792+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
1793+    }
1794+  g();
1795+  return 1;
1796+}
1797+
1798+/*
1799+** test2:
1800+**	...
1801+**	str	x30, \[sp\]
1802+**	sub	sp, sp, #1040
1803+**	str	xzr, \[sp\]
1804+**	cbnz	w0, .*
1805+**	bl	g
1806+**	...
1807+*/
1808+int test2(int z) {
1809+  __uint128_t x = 0;
1810+  int y[0x400];
1811+  if (z)
1812+    {
1813+      f(0, 0, 0, 0, 0, 0, 0, &y,
1814+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1815+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1816+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1817+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1818+	x);
1819+    }
1820+  g();
1821+  return 1;
1822+}
1823--
18242.34.1
1825
1826
1827From 8433953434a7b58c0923140d39eb3c5988c1d097 Mon Sep 17 00:00:00 2001
1828From: Richard Sandiford <richard.sandiford@arm.com>
1829Date: Tue, 12 Sep 2023 16:07:19 +0100
1830Subject: [PATCH 15/19] aarch64: Put LR save probe in first 16 bytes
1831
1832-fstack-clash-protection uses the save of LR as a probe for the next
1833allocation.  The next allocation could be:
1834
1835* another part of the static frame, e.g. when allocating SVE save slots
1836  or outgoing arguments
1837
1838* an alloca in the same function
1839
1840* an allocation made by a callee function
1841
1842However, when -fomit-frame-pointer is used, the LR save slot is placed
1843above the other GPR save slots.  It could therefore be up to 80 bytes
1844above the base of the GPR save area (which is also the hard fp address).
1845
1846aarch64_allocate_and_probe_stack_space took this into account when
1847deciding how much subsequent space could be allocated without needing
1848a probe.  However, it interacted badly with:
1849
1850      /* If doing a small final adjustment, we always probe at offset 0.
1851	 This is done to avoid issues when LR is not at position 0 or when
1852	 the final adjustment is smaller than the probing offset.  */
1853      else if (final_adjustment_p && rounded_size == 0)
1854	residual_probe_offset = 0;
1855
1856which forces any allocation that is smaller than the guard page size
1857to be probed at offset 0 rather than the usual offset 1024.  It was
1858therefore possible to construct cases in which we had:
1859
1860* a probe using LR at SP + 80 bytes (or some other value >= 16)
1861* an allocation of the guard page size - 16 bytes
1862* a probe at SP + 0
1863
1864which allocates guard page size + 64 consecutive unprobed bytes.
1865
1866This patch requires the LR probe to be in the first 16 bytes of the
1867save area when stack clash protection is active.  Doing it
1868unconditionally would cause code-quality regressions.
1869
1870Putting LR before other registers prevents push/pop allocation
1871when shadow call stacks are enabled, since LR is restored
1872separately from the other callee-saved registers.
1873
1874The new comment doesn't say that the probe register is required
1875to be LR, since a later patch removes that restriction.
1876
1877gcc/
1878	* config/aarch64/aarch64.cc (aarch64_layout_frame): Ensure that
1879	the LR save slot is in the first 16 bytes of the register save area.
1880	Only form STP/LDP push/pop candidates if both registers are valid.
1881	(aarch64_allocate_and_probe_stack_space): Remove workaround for
1882	when LR was not in the first 16 bytes.
1883
1884gcc/testsuite/
1885	* gcc.target/aarch64/stack-check-prologue-18.c: New test.
1886	* gcc.target/aarch64/stack-check-prologue-19.c: Likewise.
1887	* gcc.target/aarch64/stack-check-prologue-20.c: Likewise.
1888---
1889 gcc/config/aarch64/aarch64.cc                 |  72 ++++++-------
1890 .../aarch64/stack-check-prologue-18.c         | 100 ++++++++++++++++++
1891 .../aarch64/stack-check-prologue-19.c         | 100 ++++++++++++++++++
1892 .../aarch64/stack-check-prologue-20.c         |   3 +
1893 4 files changed, 233 insertions(+), 42 deletions(-)
1894 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
1895 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
1896 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c
1897
1898diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
1899index 5cad847977a..a765f92329d 100644
1900--- a/gcc/config/aarch64/aarch64.cc
1901+++ b/gcc/config/aarch64/aarch64.cc
1902@@ -8534,26 +8534,34 @@ aarch64_layout_frame (void)
1903   bool saves_below_hard_fp_p
1904     = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
1905   frame.bytes_below_hard_fp = offset;
1906+
1907+  auto allocate_gpr_slot = [&](unsigned int regno)
1908+    {
1909+      frame.reg_offset[regno] = offset;
1910+      if (frame.wb_push_candidate1 == INVALID_REGNUM)
1911+	frame.wb_push_candidate1 = regno;
1912+      else if (frame.wb_push_candidate2 == INVALID_REGNUM)
1913+	frame.wb_push_candidate2 = regno;
1914+      offset += UNITS_PER_WORD;
1915+    };
1916+
1917   if (frame.emit_frame_chain)
1918     {
1919       /* FP and LR are placed in the linkage record.  */
1920-      frame.reg_offset[R29_REGNUM] = offset;
1921-      frame.wb_push_candidate1 = R29_REGNUM;
1922-      frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
1923-      frame.wb_push_candidate2 = R30_REGNUM;
1924-      offset += 2 * UNITS_PER_WORD;
1925+      allocate_gpr_slot (R29_REGNUM);
1926+      allocate_gpr_slot (R30_REGNUM);
1927     }
1928+  else if (flag_stack_clash_protection
1929+	   && known_eq (frame.reg_offset[R30_REGNUM], SLOT_REQUIRED))
1930+    /* Put the LR save slot first, since it makes a good choice of probe
1931+       for stack clash purposes.  The idea is that the link register usually
1932+       has to be saved before a call anyway, and so we lose little by
1933+       stopping it from being individually shrink-wrapped.  */
1934+    allocate_gpr_slot (R30_REGNUM);
1935
1936   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1937     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
1938-      {
1939-	frame.reg_offset[regno] = offset;
1940-	if (frame.wb_push_candidate1 == INVALID_REGNUM)
1941-	  frame.wb_push_candidate1 = regno;
1942-	else if (frame.wb_push_candidate2 == INVALID_REGNUM)
1943-	  frame.wb_push_candidate2 = regno;
1944-	offset += UNITS_PER_WORD;
1945-      }
1946+      allocate_gpr_slot (regno);
1947
1948   poly_int64 max_int_offset = offset;
1949   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1950@@ -8631,10 +8639,13 @@ aarch64_layout_frame (void)
1951      max_push_offset to 0, because no registers are popped at this time,
1952      so callee_adjust cannot be adjusted.  */
1953   HOST_WIDE_INT max_push_offset = 0;
1954-  if (frame.wb_pop_candidate2 != INVALID_REGNUM)
1955-    max_push_offset = 512;
1956-  else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
1957-    max_push_offset = 256;
1958+  if (frame.wb_pop_candidate1 != INVALID_REGNUM)
1959+    {
1960+      if (frame.wb_pop_candidate2 != INVALID_REGNUM)
1961+	max_push_offset = 512;
1962+      else
1963+	max_push_offset = 256;
1964+    }
1965
1966   HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
1967   HOST_WIDE_INT const_saved_regs_size;
1968@@ -9654,29 +9665,6 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
1969     = (final_adjustment_p
1970        ? guard_used_by_caller + byte_sp_alignment
1971        : guard_size - guard_used_by_caller);
1972-  /* When doing the final adjustment for the outgoing arguments, take into
1973-     account any unprobed space there is above the current SP.  There are
1974-     two cases:
1975-
1976-     - When saving SVE registers below the hard frame pointer, we force
1977-       the lowest save to take place in the prologue before doing the final
1978-       adjustment (i.e. we don't allow the save to be shrink-wrapped).
1979-       This acts as a probe at SP, so there is no unprobed space.
1980-
1981-     - When there are no SVE register saves, we use the store of the link
1982-       register as a probe.  We can't assume that LR was saved at position 0
1983-       though, so treat any space below it as unprobed.  */
1984-  if (final_adjustment_p
1985-      && known_eq (frame.below_hard_fp_saved_regs_size, 0))
1986-    {
1987-      poly_int64 lr_offset = (frame.reg_offset[LR_REGNUM]
1988-			      - frame.bytes_below_saved_regs);
1989-      if (known_ge (lr_offset, 0))
1990-	min_probe_threshold -= lr_offset.to_constant ();
1991-      else
1992-	gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
1993-    }
1994-
1995   poly_int64 frame_size = frame.frame_size;
1996
1997   /* We should always have a positive probe threshold.  */
1998@@ -9856,8 +9844,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
1999       if (final_adjustment_p && rounded_size != 0)
2000 	min_probe_threshold = 0;
2001       /* If doing a small final adjustment, we always probe at offset 0.
2002-	 This is done to avoid issues when LR is not at position 0 or when
2003-	 the final adjustment is smaller than the probing offset.  */
2004+	 This is done to avoid issues when the final adjustment is smaller
2005+	 than the probing offset.  */
2006       else if (final_adjustment_p && rounded_size == 0)
2007 	residual_probe_offset = 0;
2008
2009diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
2010new file mode 100644
2011index 00000000000..82447d20fff
2012--- /dev/null
2013+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
2014@@ -0,0 +1,100 @@
2015+/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12" } */
2016+/* { dg-final { check-function-bodies "**" "" } } */
2017+
2018+void f(int, ...);
2019+void g();
2020+
2021+/*
2022+** test1:
2023+**	...
2024+**	str	x30, \[sp\]
2025+**	sub	sp, sp, #4064
2026+**	str	xzr, \[sp\]
2027+**	cbnz	w0, .*
2028+**	bl	g
2029+**	...
2030+**	str	x26, \[sp, #?4128\]
2031+**	...
2032+*/
2033+int test1(int z) {
2034+  __uint128_t x = 0;
2035+  int y[0x400];
2036+  if (z)
2037+    {
2038+      asm volatile ("" :::
2039+		    "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
2040+      f(0, 0, 0, 0, 0, 0, 0, &y,
2041+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2042+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2043+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2044+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2045+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2046+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2047+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2048+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2049+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2050+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2051+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2052+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2053+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2054+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2055+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2056+	x, x, x, x, x, x, x, x, x, x, x, x, x, x);
2057+    }
2058+  g();
2059+  return 1;
2060+}
2061+
2062+/*
2063+** test2:
2064+**	...
2065+**	str	x30, \[sp\]
2066+**	sub	sp, sp, #1040
2067+**	str	xzr, \[sp\]
2068+**	cbnz	w0, .*
2069+**	bl	g
2070+**	...
2071+*/
2072+int test2(int z) {
2073+  __uint128_t x = 0;
2074+  int y[0x400];
2075+  if (z)
2076+    {
2077+      asm volatile ("" :::
2078+		    "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
2079+      f(0, 0, 0, 0, 0, 0, 0, &y,
2080+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2081+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2082+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2083+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2084+	x);
2085+    }
2086+  g();
2087+  return 1;
2088+}
2089+
2090+/*
2091+** test3:
2092+**	...
2093+**	str	x30, \[sp\]
2094+**	sub	sp, sp, #1024
2095+**	cbnz	w0, .*
2096+**	bl	g
2097+**	...
2098+*/
2099+int test3(int z) {
2100+  __uint128_t x = 0;
2101+  int y[0x400];
2102+  if (z)
2103+    {
2104+      asm volatile ("" :::
2105+		    "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
2106+      f(0, 0, 0, 0, 0, 0, 0, &y,
2107+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2108+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2109+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2110+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
2111+    }
2112+  g();
2113+  return 1;
2114+}
2115diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
2116new file mode 100644
2117index 00000000000..73ac3e4e4eb
2118--- /dev/null
2119+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
2120@@ -0,0 +1,100 @@
2121+/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12 -fsanitize=shadow-call-stack -ffixed-x18" } */
2122+/* { dg-final { check-function-bodies "**" "" } } */
2123+
2124+void f(int, ...);
2125+void g();
2126+
2127+/*
2128+** test1:
2129+**	...
2130+**	str	x30, \[sp\]
2131+**	sub	sp, sp, #4064
2132+**	str	xzr, \[sp\]
2133+**	cbnz	w0, .*
2134+**	bl	g
2135+**	...
2136+**	str	x26, \[sp, #?4128\]
2137+**	...
2138+*/
2139+int test1(int z) {
2140+  __uint128_t x = 0;
2141+  int y[0x400];
2142+  if (z)
2143+    {
2144+      asm volatile ("" :::
2145+		    "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
2146+      f(0, 0, 0, 0, 0, 0, 0, &y,
2147+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2148+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2149+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2150+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2151+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2152+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2153+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2154+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2155+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2156+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2157+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2158+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2159+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2160+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2161+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2162+	x, x, x, x, x, x, x, x, x, x, x, x, x, x);
2163+    }
2164+  g();
2165+  return 1;
2166+}
2167+
2168+/*
2169+** test2:
2170+**	...
2171+**	str	x30, \[sp\]
2172+**	sub	sp, sp, #1040
2173+**	str	xzr, \[sp\]
2174+**	cbnz	w0, .*
2175+**	bl	g
2176+**	...
2177+*/
2178+int test2(int z) {
2179+  __uint128_t x = 0;
2180+  int y[0x400];
2181+  if (z)
2182+    {
2183+      asm volatile ("" :::
2184+		    "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
2185+      f(0, 0, 0, 0, 0, 0, 0, &y,
2186+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2187+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2188+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2189+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2190+	x);
2191+    }
2192+  g();
2193+  return 1;
2194+}
2195+
2196+/*
2197+** test3:
2198+**	...
2199+**	str	x30, \[sp\]
2200+**	sub	sp, sp, #1024
2201+**	cbnz	w0, .*
2202+**	bl	g
2203+**	...
2204+*/
2205+int test3(int z) {
2206+  __uint128_t x = 0;
2207+  int y[0x400];
2208+  if (z)
2209+    {
2210+      asm volatile ("" :::
2211+		    "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
2212+      f(0, 0, 0, 0, 0, 0, 0, &y,
2213+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2214+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2215+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2216+	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
2217+    }
2218+  g();
2219+  return 1;
2220+}
2221diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c
2222new file mode 100644
2223index 00000000000..690aae8dfd5
2224--- /dev/null
2225+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c
2226@@ -0,0 +1,3 @@
2227+/* { dg-options "-O2 -fstack-protector-all -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12 -fsanitize=shadow-call-stack -ffixed-x18" } */
2228+
2229+#include "stack-check-prologue-19.c"
2230--
22312.34.1
2232
2233
2234From eea1759073e09dd1aefbc9a881601ab1eebfdd18 Mon Sep 17 00:00:00 2001
2235From: Richard Sandiford <richard.sandiford@arm.com>
2236Date: Tue, 12 Sep 2023 16:07:19 +0100
2237Subject: [PATCH 16/19] aarch64: Simplify probe of final frame allocation
2238
2239Previous patches ensured that the final frame allocation only needs
2240a probe when the size is strictly greater than 1KiB.  It's therefore
2241safe to use the normal 1024 probe offset in all cases.
2242
2243+The main motivation for doing this is to simplify the code and
2244+reduce the number of special cases.
2245
2246gcc/
2247	* config/aarch64/aarch64.cc (aarch64_allocate_and_probe_stack_space):
2248	Always probe the residual allocation at offset 1024, asserting
2249	that that is in range.
2250
2251gcc/testsuite/
2252	* gcc.target/aarch64/stack-check-prologue-17.c: Expect the probe
2253	to be at offset 1024 rather than offset 0.
2254	* gcc.target/aarch64/stack-check-prologue-18.c: Likewise.
2255	* gcc.target/aarch64/stack-check-prologue-19.c: Likewise.
2256---
2257 gcc/config/aarch64/aarch64.cc                        | 12 ++++--------
2258 .../gcc.target/aarch64/stack-check-prologue-17.c     |  2 +-
2259 .../gcc.target/aarch64/stack-check-prologue-18.c     |  4 ++--
2260 .../gcc.target/aarch64/stack-check-prologue-19.c     |  4 ++--
2261 4 files changed, 9 insertions(+), 13 deletions(-)
2262
2263diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
2264index a765f92329d..37809a306f7 100644
2265--- a/gcc/config/aarch64/aarch64.cc
2266+++ b/gcc/config/aarch64/aarch64.cc
2267@@ -9838,16 +9838,12 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
2268      are still safe.  */
2269   if (residual)
2270     {
2271-      HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
2272+      gcc_assert (guard_used_by_caller + byte_sp_alignment <= size);
2273+
2274       /* If we're doing final adjustments, and we've done any full page
2275 	 allocations then any residual needs to be probed.  */
2276       if (final_adjustment_p && rounded_size != 0)
2277 	min_probe_threshold = 0;
2278-      /* If doing a small final adjustment, we always probe at offset 0.
2279-	 This is done to avoid issues when the final adjustment is smaller
2280-	 than the probing offset.  */
2281-      else if (final_adjustment_p && rounded_size == 0)
2282-	residual_probe_offset = 0;
2283
2284       aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
2285       if (residual >= min_probe_threshold)
2286@@ -9858,8 +9854,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
2287 		     HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
2288 		     "\n", residual);
2289
2290-	    emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
2291-					     residual_probe_offset));
2292+	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
2293+					   guard_used_by_caller));
2294 	  emit_insn (gen_blockage ());
2295 	}
2296     }
2297diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
2298index 0d8a25d73a2..f0ec1389771 100644
2299--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
2300+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
2301@@ -33,7 +33,7 @@ int test1(int z) {
2302 **	...
2303 **	str	x30, \[sp\]
2304 **	sub	sp, sp, #1040
2305-**	str	xzr, \[sp\]
2306+**	str	xzr, \[sp, #?1024\]
2307 **	cbnz	w0, .*
2308 **	bl	g
2309 **	...
2310diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
2311index 82447d20fff..6383bec5ebc 100644
2312--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
2313+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
2314@@ -9,7 +9,7 @@ void g();
2315 **	...
2316 **	str	x30, \[sp\]
2317 **	sub	sp, sp, #4064
2318-**	str	xzr, \[sp\]
2319+**	str	xzr, \[sp, #?1024\]
2320 **	cbnz	w0, .*
2321 **	bl	g
2322 **	...
2323@@ -50,7 +50,7 @@ int test1(int z) {
2324 **	...
2325 **	str	x30, \[sp\]
2326 **	sub	sp, sp, #1040
2327-**	str	xzr, \[sp\]
2328+**	str	xzr, \[sp, #?1024\]
2329 **	cbnz	w0, .*
2330 **	bl	g
2331 **	...
2332diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
2333index 73ac3e4e4eb..562039b5e9b 100644
2334--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
2335+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
2336@@ -9,7 +9,7 @@ void g();
2337 **	...
2338 **	str	x30, \[sp\]
2339 **	sub	sp, sp, #4064
2340-**	str	xzr, \[sp\]
2341+**	str	xzr, \[sp, #?1024\]
2342 **	cbnz	w0, .*
2343 **	bl	g
2344 **	...
2345@@ -50,7 +50,7 @@ int test1(int z) {
2346 **	...
2347 **	str	x30, \[sp\]
2348 **	sub	sp, sp, #1040
2349-**	str	xzr, \[sp\]
2350+**	str	xzr, \[sp, #?1024\]
2351 **	cbnz	w0, .*
2352 **	bl	g
2353 **	...
2354--
23552.34.1
2356
2357
2358From 96d85187c3b9c9a7efc2fd698c3d452e80d8aa47 Mon Sep 17 00:00:00 2001
2359From: Richard Sandiford <richard.sandiford@arm.com>
2360Date: Tue, 12 Sep 2023 16:07:20 +0100
2361Subject: [PATCH 17/19] aarch64: Explicitly record probe registers in frame
2362 info
2363
2364The stack frame is currently divided into three areas:
2365
2366A: the area above the hard frame pointer
2367B: the SVE saves below the hard frame pointer
2368C: the outgoing arguments
2369
2370If the stack frame is allocated in one chunk, the allocation needs a
2371probe if the frame size is >= guard_size - 1KiB.  In addition, if the
2372function is not a leaf function, it must probe an address no more than
23731KiB above the outgoing SP.  We ensured the second condition by
2374
2375(1) using single-chunk allocations for non-leaf functions only if
2376    the link register save slot is within 512 bytes of the bottom
2377    of the frame; and
2378
2379(2) using the link register save as a probe (meaning, for instance,
2380    that it can't be individually shrink wrapped)
2381
2382If instead the stack is allocated in multiple chunks, then:
2383
2384* an allocation involving only the outgoing arguments (C above) requires
2385  a probe if the allocation size is > 1KiB
2386
2387* any other allocation requires a probe if the allocation size
2388  is >= guard_size - 1KiB
2389
2390* second and subsequent allocations require the previous allocation
2391  to probe at the bottom of the allocated area, regardless of the size
2392  of that previous allocation
2393
2394The final point means that, unlike for single allocations,
2395it can be necessary to have both a non-SVE register probe and
2396an SVE register probe.  For example:
2397
2398* allocate A, probe using a non-SVE register save
2399* allocate B, probe using an SVE register save
2400* allocate C
2401
2402The non-SVE register used in this case was again the link register.
2403It was previously used even if the link register save slot was some
2404bytes above the bottom of the non-SVE register saves, but an earlier
2405patch avoided that by putting the link register save slot first.
2406
2407As a belt-and-braces fix, this patch explicitly records which
2408probe registers we're using and allows the non-SVE probe to be
2409whichever register comes first (as for SVE).
2410
2411The patch also avoids unnecessary probes in sve/pcs/stack_clash_3.c.
2412
2413gcc/
2414	* config/aarch64/aarch64.h (aarch64_frame::sve_save_and_probe)
2415	(aarch64_frame::hard_fp_save_and_probe): New fields.
2416	* config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize them.
2417	Rather than asserting that a leaf function saves LR, instead assert
2418	that a leaf function saves something.
2419	(aarch64_get_separate_components): Prevent the chosen probe
2420	registers from being individually shrink-wrapped.
2421	(aarch64_allocate_and_probe_stack_space): Remove workaround for
2422	probe registers that aren't at the bottom of the previous allocation.
2423
2424gcc/testsuite/
2425	* gcc.target/aarch64/sve/pcs/stack_clash_3.c: Avoid redundant probes.
2426---
2427 gcc/config/aarch64/aarch64.cc                 | 68 +++++++++++++++----
2428 gcc/config/aarch64/aarch64.h                  |  8 +++
2429 .../aarch64/sve/pcs/stack_clash_3.c           |  6 +-
2430 3 files changed, 64 insertions(+), 18 deletions(-)
2431
2432diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
2433index 37809a306f7..6c59c39a639 100644
2434--- a/gcc/config/aarch64/aarch64.cc
2435+++ b/gcc/config/aarch64/aarch64.cc
2436@@ -8471,15 +8471,11 @@ aarch64_layout_frame (void)
2437 	&& !crtl->abi->clobbers_full_reg_p (regno))
2438       frame.reg_offset[regno] = SLOT_REQUIRED;
2439
2440-  /* With stack-clash, LR must be saved in non-leaf functions.  The saving of
2441-     LR counts as an implicit probe which allows us to maintain the invariant
2442-     described in the comment at expand_prologue.  */
2443-  gcc_assert (crtl->is_leaf
2444-	      || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
2445
2446   poly_int64 offset = crtl->outgoing_args_size;
2447   gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
2448   frame.bytes_below_saved_regs = offset;
2449+  frame.sve_save_and_probe = INVALID_REGNUM;
2450
2451   /* Now assign stack slots for the registers.  Start with the predicate
2452      registers, since predicate LDR and STR have a relatively small
2453@@ -8487,6 +8483,8 @@ aarch64_layout_frame (void)
2454   for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
2455     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
2456       {
2457+	if (frame.sve_save_and_probe == INVALID_REGNUM)
2458+	  frame.sve_save_and_probe = regno;
2459 	frame.reg_offset[regno] = offset;
2460 	offset += BYTES_PER_SVE_PRED;
2461       }
2462@@ -8524,6 +8522,8 @@ aarch64_layout_frame (void)
2463     for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2464       if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
2465 	{
2466+	  if (frame.sve_save_and_probe == INVALID_REGNUM)
2467+	    frame.sve_save_and_probe = regno;
2468 	  frame.reg_offset[regno] = offset;
2469 	  offset += vector_save_size;
2470 	}
2471@@ -8533,10 +8533,18 @@ aarch64_layout_frame (void)
2472   frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
2473   bool saves_below_hard_fp_p
2474     = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
2475+  gcc_assert (!saves_below_hard_fp_p
2476+	      || (frame.sve_save_and_probe != INVALID_REGNUM
2477+		  && known_eq (frame.reg_offset[frame.sve_save_and_probe],
2478+			       frame.bytes_below_saved_regs)));
2479+
2480   frame.bytes_below_hard_fp = offset;
2481+  frame.hard_fp_save_and_probe = INVALID_REGNUM;
2482
2483   auto allocate_gpr_slot = [&](unsigned int regno)
2484     {
2485+      if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
2486+	frame.hard_fp_save_and_probe = regno;
2487       frame.reg_offset[regno] = offset;
2488       if (frame.wb_push_candidate1 == INVALID_REGNUM)
2489 	frame.wb_push_candidate1 = regno;
2490@@ -8570,6 +8578,8 @@ aarch64_layout_frame (void)
2491   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2492     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
2493       {
2494+	if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
2495+	  frame.hard_fp_save_and_probe = regno;
2496 	/* If there is an alignment gap between integer and fp callee-saves,
2497 	   allocate the last fp register to it if possible.  */
2498 	if (regno == last_fp_reg
2499@@ -8593,6 +8603,17 @@ aarch64_layout_frame (void)
2500   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2501
2502   frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
2503+  gcc_assert (known_eq (frame.saved_regs_size,
2504+			frame.below_hard_fp_saved_regs_size)
2505+	      || (frame.hard_fp_save_and_probe != INVALID_REGNUM
2506+		  && known_eq (frame.reg_offset[frame.hard_fp_save_and_probe],
2507+			       frame.bytes_below_hard_fp)));
2508+
2509+  /* With stack-clash, a register must be saved in non-leaf functions.
2510+     The saving of the bottommost register counts as an implicit probe,
2511+     which allows us to maintain the invariant described in the comment
2512+     at expand_prologue.  */
2513+  gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0));
2514
2515   offset += get_frame_size ();
2516   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2517@@ -8723,6 +8744,25 @@ aarch64_layout_frame (void)
2518       frame.final_adjust = frame.bytes_below_saved_regs;
2519     }
2520
2521+  /* The frame is allocated in pieces, with each non-final piece
2522+     including a register save at offset 0 that acts as a probe for
2523+     the following piece.  In addition, the save of the bottommost register
2524+     acts as a probe for callees and allocas.  Roll back any probes that
2525+     aren't needed.
2526+
2527+     A probe isn't needed if it is associated with the final allocation
2528+     (including callees and allocas) that happens before the epilogue is
2529+     executed.  */
2530+  if (crtl->is_leaf
2531+      && !cfun->calls_alloca
2532+      && known_eq (frame.final_adjust, 0))
2533+    {
2534+      if (maybe_ne (frame.sve_callee_adjust, 0))
2535+	frame.sve_save_and_probe = INVALID_REGNUM;
2536+      else
2537+	frame.hard_fp_save_and_probe = INVALID_REGNUM;
2538+    }
2539+
2540   /* Make sure the individual adjustments add up to the full frame size.  */
2541   gcc_assert (known_eq (frame.initial_adjust
2542 			+ frame.callee_adjust
2543@@ -9354,13 +9394,6 @@ aarch64_get_separate_components (void)
2544
2545 	poly_int64 offset = frame.reg_offset[regno];
2546
2547-	/* If the register is saved in the first SVE save slot, we use
2548-	   it as a stack probe for -fstack-clash-protection.  */
2549-	if (flag_stack_clash_protection
2550-	    && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
2551-	    && known_eq (offset, frame.bytes_below_saved_regs))
2552-	  continue;
2553-
2554 	/* Get the offset relative to the register we'll use.  */
2555 	if (frame_pointer_needed)
2556 	  offset -= frame.bytes_below_hard_fp;
2557@@ -9395,6 +9428,13 @@ aarch64_get_separate_components (void)
2558
2559   bitmap_clear_bit (components, LR_REGNUM);
2560   bitmap_clear_bit (components, SP_REGNUM);
2561+  if (flag_stack_clash_protection)
2562+    {
2563+      if (frame.sve_save_and_probe != INVALID_REGNUM)
2564+	bitmap_clear_bit (components, frame.sve_save_and_probe);
2565+      if (frame.hard_fp_save_and_probe != INVALID_REGNUM)
2566+	bitmap_clear_bit (components, frame.hard_fp_save_and_probe);
2567+    }
2568
2569   return components;
2570 }
2571@@ -9931,8 +9971,8 @@ aarch64_epilogue_uses (int regno)
2572    When probing is needed, we emit a probe at the start of the prologue
2573    and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
2574
2575-   We have to track how much space has been allocated and the only stores
2576-   to the stack we track as implicit probes are the FP/LR stores.
2577+   We can also use register saves as probes.  These are stored in
2578+   sve_save_and_probe and hard_fp_save_and_probe.
2579
2580    For outgoing arguments we probe if the size is larger than 1KB, such that
2581    the ABI specified buffer is maintained for the next callee.
2582diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
2583index c8becb098c8..fbfb73545ba 100644
2584--- a/gcc/config/aarch64/aarch64.h
2585+++ b/gcc/config/aarch64/aarch64.h
2586@@ -863,6 +863,14 @@ struct GTY (()) aarch64_frame
2587      This is the register they should use.  */
2588   unsigned spare_pred_reg;
2589
2590+  /* An SVE register that is saved below the hard frame pointer and that acts
2591+     as a probe for later allocations, or INVALID_REGNUM if none.  */
2592+  unsigned sve_save_and_probe;
2593+
2594+  /* A register that is saved at the hard frame pointer and that acts
2595+     as a probe for later allocations, or INVALID_REGNUM if none.  */
2596+  unsigned hard_fp_save_and_probe;
2597+
2598   bool laid_out;
2599
2600   /* True if shadow call stack should be enabled for the current function.  */
2601diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
2602index 3e01ec36c3a..3530a0d504b 100644
2603--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
2604+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
2605@@ -11,11 +11,10 @@
2606 **	mov	x11, sp
2607 **	...
2608 **	sub	sp, sp, x13
2609-**	str	p4, \[sp\]
2610 **	cbz	w0, [^\n]*
2611+**	str	p4, \[sp\]
2612 **	...
2613 **	ptrue	p0\.b, all
2614-**	ldr	p4, \[sp\]
2615 **	addvl	sp, sp, #1
2616 **	ldr	x24, \[sp\], 32
2617 **	ret
2618@@ -39,13 +38,12 @@ test_1 (int n)
2619 **	mov	x11, sp
2620 **	...
2621 **	sub	sp, sp, x13
2622-**	str	p4, \[sp\]
2623 **	cbz	w0, [^\n]*
2624+**	str	p4, \[sp\]
2625 **	str	p5, \[sp, #1, mul vl\]
2626 **	str	p6, \[sp, #2, mul vl\]
2627 **	...
2628 **	ptrue	p0\.b, all
2629-**	ldr	p4, \[sp\]
2630 **	addvl	sp, sp, #1
2631 **	ldr	x24, \[sp\], 32
2632 **	ret
2633--
26342.34.1
2635
2636
2637From 56df065080950bb30dda9c260f71be54269bdda5 Mon Sep 17 00:00:00 2001
2638From: Richard Sandiford <richard.sandiford@arm.com>
2639Date: Tue, 12 Sep 2023 16:07:20 +0100
2640Subject: [PATCH 18/19] aarch64: Remove below_hard_fp_saved_regs_size
2641
2642After previous patches, it's no longer necessary to store
2643saved_regs_size and below_hard_fp_saved_regs_size in the frame info.
2644All measurements instead use the top or bottom of the frame as
2645reference points.
2646
2647gcc/
2648	* config/aarch64/aarch64.h (aarch64_frame::saved_regs_size)
2649	(aarch64_frame::below_hard_fp_saved_regs_size): Delete.
2650	* config/aarch64/aarch64.cc (aarch64_layout_frame): Update accordingly.
2651---
2652 gcc/config/aarch64/aarch64.cc | 45 ++++++++++++++++-------------------
2653 gcc/config/aarch64/aarch64.h  |  7 ------
2654 2 files changed, 21 insertions(+), 31 deletions(-)
2655
2656diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
2657index 6c59c39a639..b95e805a8cc 100644
2658--- a/gcc/config/aarch64/aarch64.cc
2659+++ b/gcc/config/aarch64/aarch64.cc
2660@@ -8530,9 +8530,8 @@ aarch64_layout_frame (void)
2661
2662   /* OFFSET is now the offset of the hard frame pointer from the bottom
2663      of the callee save area.  */
2664-  frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
2665-  bool saves_below_hard_fp_p
2666-    = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
2667+  auto below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
2668+  bool saves_below_hard_fp_p = maybe_ne (below_hard_fp_saved_regs_size, 0);
2669   gcc_assert (!saves_below_hard_fp_p
2670 	      || (frame.sve_save_and_probe != INVALID_REGNUM
2671 		  && known_eq (frame.reg_offset[frame.sve_save_and_probe],
2672@@ -8602,9 +8601,8 @@ aarch64_layout_frame (void)
2673
2674   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2675
2676-  frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
2677-  gcc_assert (known_eq (frame.saved_regs_size,
2678-			frame.below_hard_fp_saved_regs_size)
2679+  auto saved_regs_size = offset - frame.bytes_below_saved_regs;
2680+  gcc_assert (known_eq (saved_regs_size, below_hard_fp_saved_regs_size)
2681 	      || (frame.hard_fp_save_and_probe != INVALID_REGNUM
2682 		  && known_eq (frame.reg_offset[frame.hard_fp_save_and_probe],
2683 			       frame.bytes_below_hard_fp)));
2684@@ -8613,7 +8611,7 @@ aarch64_layout_frame (void)
2685      The saving of the bottommost register counts as an implicit probe,
2686      which allows us to maintain the invariant described in the comment
2687      at expand_prologue.  */
2688-  gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0));
2689+  gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
2690
2691   offset += get_frame_size ();
2692   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2693@@ -8670,7 +8668,7 @@ aarch64_layout_frame (void)
2694
2695   HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
2696   HOST_WIDE_INT const_saved_regs_size;
2697-  if (known_eq (frame.saved_regs_size, 0))
2698+  if (known_eq (saved_regs_size, 0))
2699     frame.initial_adjust = frame.frame_size;
2700   else if (frame.frame_size.is_constant (&const_size)
2701 	   && const_size < max_push_offset
2702@@ -8683,7 +8681,7 @@ aarch64_layout_frame (void)
2703       frame.callee_adjust = const_size;
2704     }
2705   else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs)
2706-	   && frame.saved_regs_size.is_constant (&const_saved_regs_size)
2707+	   && saved_regs_size.is_constant (&const_saved_regs_size)
2708 	   && const_below_saved_regs + const_saved_regs_size < 512
2709 	   /* We could handle this case even with data below the saved
2710 	      registers, provided that that data left us with valid offsets
2711@@ -8702,8 +8700,7 @@ aarch64_layout_frame (void)
2712       frame.initial_adjust = frame.frame_size;
2713     }
2714   else if (saves_below_hard_fp_p
2715-	   && known_eq (frame.saved_regs_size,
2716-			frame.below_hard_fp_saved_regs_size))
2717+	   && known_eq (saved_regs_size, below_hard_fp_saved_regs_size))
2718     {
2719       /* Frame in which all saves are SVE saves:
2720
2721@@ -8725,7 +8722,7 @@ aarch64_layout_frame (void)
2722 	 [save SVE registers relative to SP]
2723 	 sub sp, sp, bytes_below_saved_regs  */
2724       frame.callee_adjust = const_above_fp;
2725-      frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
2726+      frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
2727       frame.final_adjust = frame.bytes_below_saved_regs;
2728     }
2729   else
2730@@ -8740,7 +8737,7 @@ aarch64_layout_frame (void)
2731 	 [save SVE registers relative to SP]
2732 	 sub sp, sp, bytes_below_saved_regs  */
2733       frame.initial_adjust = frame.bytes_above_hard_fp;
2734-      frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
2735+      frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
2736       frame.final_adjust = frame.bytes_below_saved_regs;
2737     }
2738
2739@@ -9936,17 +9933,17 @@ aarch64_epilogue_uses (int regno)
2740 	|  local variables              | <-- frame_pointer_rtx
2741 	|                               |
2742 	+-------------------------------+
2743-	|  padding                      | \
2744-	+-------------------------------+  |
2745-	|  callee-saved registers       |  | frame.saved_regs_size
2746-	+-------------------------------+  |
2747-	|  LR'                          |  |
2748-	+-------------------------------+  |
2749-	|  FP'                          |  |
2750-	+-------------------------------+  |<- hard_frame_pointer_rtx (aligned)
2751-	|  SVE vector registers         |  | \
2752-	+-------------------------------+  |  | below_hard_fp_saved_regs_size
2753-	|  SVE predicate registers      | /  /
2754+	|  padding                      |
2755+	+-------------------------------+
2756+	|  callee-saved registers       |
2757+	+-------------------------------+
2758+	|  LR'                          |
2759+	+-------------------------------+
2760+	|  FP'                          |
2761+	+-------------------------------+ <-- hard_frame_pointer_rtx (aligned)
2762+	|  SVE vector registers         |
2763+	+-------------------------------+
2764+	|  SVE predicate registers      |
2765 	+-------------------------------+
2766 	|  dynamic allocation           |
2767 	+-------------------------------+
2768diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
2769index fbfb73545ba..cfeaf4657ab 100644
2770--- a/gcc/config/aarch64/aarch64.h
2771+++ b/gcc/config/aarch64/aarch64.h
2772@@ -777,18 +777,11 @@ struct GTY (()) aarch64_frame
2773      STACK_BOUNDARY.  */
2774   HOST_WIDE_INT saved_varargs_size;
2775
2776-  /* The size of the callee-save registers with a slot in REG_OFFSET.  */
2777-  poly_int64 saved_regs_size;
2778-
2779   /* The number of bytes between the bottom of the static frame (the bottom
2780      of the outgoing arguments) and the bottom of the register save area.
2781      This value is always a multiple of STACK_BOUNDARY.  */
2782   poly_int64 bytes_below_saved_regs;
2783
2784-  /* The size of the callee-save registers with a slot in REG_OFFSET that
2785-     are saved below the hard frame pointer.  */
2786-  poly_int64 below_hard_fp_saved_regs_size;
2787-
2788   /* The number of bytes between the bottom of the static frame (the bottom
2789      of the outgoing arguments) and the hard frame pointer.  This value is
2790      always a multiple of STACK_BOUNDARY.  */
2791--
27922.34.1
2793
2794
2795From b96e66fd4ef3e36983969fb8cdd1956f551a074b Mon Sep 17 00:00:00 2001
2796From: Richard Sandiford <richard.sandiford@arm.com>
2797Date: Tue, 12 Sep 2023 16:07:21 +0100
2798Subject: [PATCH 19/19] aarch64: Make stack smash canary protect saved
2799 registers
2800
2801AArch64 normally puts the saved registers near the bottom of the frame,
2802immediately above any dynamic allocations.  But this means that a
2803stack-smash attack on those dynamic allocations could overwrite the
2804saved registers without needing to reach as far as the stack smash
2805canary.
2806
2807The same thing could also happen for variable-sized arguments that are
2808passed by value, since those are allocated before a call and popped on
2809return.
2810
2811This patch avoids that by putting the locals (and thus the canary) below
2812the saved registers when stack smash protection is active.
2813
2814The patch fixes CVE-2023-4039.
2815
2816gcc/
2817	* config/aarch64/aarch64.cc (aarch64_save_regs_above_locals_p):
2818	New function.
2819	(aarch64_layout_frame): Use it to decide whether locals should
2820	go above or below the saved registers.
2821	(aarch64_expand_prologue): Update stack layout comment.
2822	Emit a stack tie after the final adjustment.
2823
2824gcc/testsuite/
2825	* gcc.target/aarch64/stack-protector-8.c: New test.
2826	* gcc.target/aarch64/stack-protector-9.c: Likewise.
2827---
2828 gcc/config/aarch64/aarch64.cc                 | 46 +++++++--
2829 .../gcc.target/aarch64/stack-protector-8.c    | 95 +++++++++++++++++++
2830 .../gcc.target/aarch64/stack-protector-9.c    | 33 +++++++
2831 3 files changed, 168 insertions(+), 6 deletions(-)
2832 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
2833 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
2834
2835diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
2836index b95e805a8cc..389c0e29353 100644
2837--- a/gcc/config/aarch64/aarch64.cc
2838+++ b/gcc/config/aarch64/aarch64.cc
2839@@ -8394,6 +8394,20 @@ aarch64_needs_frame_chain (void)
2840   return aarch64_use_frame_pointer;
2841 }
2842
2843+/* Return true if the current function should save registers above
2844+   the locals area, rather than below it.  */
2845+
2846+static bool
2847+aarch64_save_regs_above_locals_p ()
2848+{
2849+  /* When using stack smash protection, make sure that the canary slot
2850+     comes between the locals and the saved registers.  Otherwise,
2851+     it would be possible for a carefully sized smash attack to change
2852+     the saved registers (particularly LR and FP) without reaching the
2853+     canary.  */
2854+  return crtl->stack_protect_guard;
2855+}
2856+
2857 /* Mark the registers that need to be saved by the callee and calculate
2858    the size of the callee-saved registers area and frame record (both FP
2859    and LR may be omitted).  */
2860@@ -8405,6 +8419,7 @@ aarch64_layout_frame (void)
2861   poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
2862   bool frame_related_fp_reg_p = false;
2863   aarch64_frame &frame = cfun->machine->frame;
2864+  poly_int64 top_of_locals = -1;
2865
2866   frame.emit_frame_chain = aarch64_needs_frame_chain ();
2867
2868@@ -8471,9 +8486,16 @@ aarch64_layout_frame (void)
2869 	&& !crtl->abi->clobbers_full_reg_p (regno))
2870       frame.reg_offset[regno] = SLOT_REQUIRED;
2871
2872+  bool regs_at_top_p = aarch64_save_regs_above_locals_p ();
2873
2874   poly_int64 offset = crtl->outgoing_args_size;
2875   gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
2876+  if (regs_at_top_p)
2877+    {
2878+      offset += get_frame_size ();
2879+      offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2880+      top_of_locals = offset;
2881+    }
2882   frame.bytes_below_saved_regs = offset;
2883   frame.sve_save_and_probe = INVALID_REGNUM;
2884
2885@@ -8613,15 +8635,18 @@ aarch64_layout_frame (void)
2886      at expand_prologue.  */
2887   gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
2888
2889-  offset += get_frame_size ();
2890-  offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2891-  auto top_of_locals = offset;
2892-
2893+  if (!regs_at_top_p)
2894+    {
2895+      offset += get_frame_size ();
2896+      offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2897+      top_of_locals = offset;
2898+    }
2899   offset += frame.saved_varargs_size;
2900   gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
2901   frame.frame_size = offset;
2902
2903   frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp;
2904+  gcc_assert (known_ge (top_of_locals, 0));
2905   frame.bytes_above_locals = frame.frame_size - top_of_locals;
2906
2907   frame.initial_adjust = 0;
2908@@ -9930,10 +9955,10 @@ aarch64_epilogue_uses (int regno)
2909 	|  for register varargs         |
2910 	|                               |
2911 	+-------------------------------+
2912-	|  local variables              | <-- frame_pointer_rtx
2913+	|  local variables (1)          | <-- frame_pointer_rtx
2914 	|                               |
2915 	+-------------------------------+
2916-	|  padding                      |
2917+	|  padding (1)                  |
2918 	+-------------------------------+
2919 	|  callee-saved registers       |
2920 	+-------------------------------+
2921@@ -9945,6 +9970,10 @@ aarch64_epilogue_uses (int regno)
2922 	+-------------------------------+
2923 	|  SVE predicate registers      |
2924 	+-------------------------------+
2925+	|  local variables (2)          |
2926+	+-------------------------------+
2927+	|  padding (2)                  |
2928+	+-------------------------------+
2929 	|  dynamic allocation           |
2930 	+-------------------------------+
2931 	|  padding                      |
2932@@ -9954,6 +9983,9 @@ aarch64_epilogue_uses (int regno)
2933 	+-------------------------------+
2934 	|                               | <-- stack_pointer_rtx (aligned)
2935
2936+   The regions marked (1) and (2) are mutually exclusive.  (2) is used
2937+   when aarch64_save_regs_above_locals_p is true.
2938+
2939    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2940    but leave frame_pointer_rtx and hard_frame_pointer_rtx
2941    unchanged.
2942@@ -10149,6 +10181,8 @@ aarch64_expand_prologue (void)
2943   gcc_assert (known_eq (bytes_below_sp, final_adjust));
2944   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
2945 					  !frame_pointer_needed, true);
2946+  if (emit_frame_chain && maybe_ne (final_adjust, 0))
2947+    emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2948 }
2949
2950 /* Return TRUE if we can use a simple_return insn.
2951diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
2952new file mode 100644
2953index 00000000000..e71d820e365
2954--- /dev/null
2955+++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
2956@@ -0,0 +1,95 @@
2957+/* { dg-options " -O -fstack-protector-strong -mstack-protector-guard=sysreg -mstack-protector-guard-reg=tpidr2_el0 -mstack-protector-guard-offset=16" } */
2958+/* { dg-final { check-function-bodies "**" "" } } */
2959+
2960+void g(void *);
2961+__SVBool_t *h(void *);
2962+
2963+/*
2964+** test1:
2965+**	sub	sp, sp, #288
2966+**	stp	x29, x30, \[sp, #?272\]
2967+**	add	x29, sp, #?272
2968+**	mrs	(x[0-9]+), tpidr2_el0
2969+**	ldr	(x[0-9]+), \[\1, #?16\]
2970+**	str	\2, \[sp, #?264\]
2971+**	mov	\2, #?0
2972+**	add	x0, sp, #?8
2973+**	bl	g
2974+**	...
2975+**	mrs	.*
2976+**	...
2977+**	bne	.*
2978+**	...
2979+**	ldp	x29, x30, \[sp, #?272\]
2980+**	add	sp, sp, #?288
2981+**	ret
2982+**	bl	__stack_chk_fail
2983+*/
2984+int test1() {
2985+  int y[0x40];
2986+  g(y);
2987+  return 1;
2988+}
2989+
2990+/*
2991+** test2:
2992+**	stp	x29, x30, \[sp, #?-16\]!
2993+**	mov	x29, sp
2994+**	sub	sp, sp, #1040
2995+**	mrs	(x[0-9]+), tpidr2_el0
2996+**	ldr	(x[0-9]+), \[\1, #?16\]
2997+**	str	\2, \[sp, #?1032\]
2998+**	mov	\2, #?0
2999+**	add	x0, sp, #?8
3000+**	bl	g
3001+**	...
3002+**	mrs	.*
3003+**	...
3004+**	bne	.*
3005+**	...
3006+**	add	sp, sp, #?1040
3007+**	ldp	x29, x30, \[sp\], #?16
3008+**	ret
3009+**	bl	__stack_chk_fail
3010+*/
3011+int test2() {
3012+  int y[0x100];
3013+  g(y);
3014+  return 1;
3015+}
3016+
3017+#pragma GCC target "+sve"
3018+
3019+/*
3020+** test3:
3021+**	stp	x29, x30, \[sp, #?-16\]!
3022+**	mov	x29, sp
3023+**	addvl	sp, sp, #-18
3024+**	...
3025+**	str	p4, \[sp\]
3026+**	...
3027+**	sub	sp, sp, #272
3028+**	mrs	(x[0-9]+), tpidr2_el0
3029+**	ldr	(x[0-9]+), \[\1, #?16\]
3030+**	str	\2, \[sp, #?264\]
3031+**	mov	\2, #?0
3032+**	add	x0, sp, #?8
3033+**	bl	h
3034+**	...
3035+**	mrs	.*
3036+**	...
3037+**	bne	.*
3038+**	...
3039+**	add	sp, sp, #?272
3040+**	...
3041+**	ldr	p4, \[sp\]
3042+**	...
3043+**	addvl	sp, sp, #18
3044+**	ldp	x29, x30, \[sp\], #?16
3045+**	ret
3046+**	bl	__stack_chk_fail
3047+*/
3048+__SVBool_t test3() {
3049+  int y[0x40];
3050+  return *h(y);
3051+}
3052diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
3053new file mode 100644
3054index 00000000000..58f322aa480
3055--- /dev/null
3056+++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
3057@@ -0,0 +1,33 @@
3058+/* { dg-options "-O2 -mcpu=neoverse-v1 -fstack-protector-all" } */
3059+/* { dg-final { check-function-bodies "**" "" } } */
3060+
3061+/*
3062+** main:
3063+**	...
3064+**	stp	x29, x30, \[sp, #?-[0-9]+\]!
3065+**	...
3066+**	sub	sp, sp, #[0-9]+
3067+**	...
3068+**	str	x[0-9]+, \[x29, #?-8\]
3069+**	...
3070+*/
3071+int f(const char *);
3072+void g(void *);
3073+int main(int argc, char* argv[])
3074+{
3075+  int a;
3076+  int b;
3077+  char c[2+f(argv[1])];
3078+  int d[0x100];
3079+  char y;
3080+
3081+  y=42; a=4; b=10;
3082+  c[0] = 'h'; c[1] = '\0';
3083+
3084+  c[f(argv[2])] = '\0';
3085+
3086+  __builtin_printf("%d %d\n%s\n", a, b, c);
3087+  g(d);
3088+
3089+  return 0;
3090+}
3091--
30922.34.1
3093
3094