xref: /openbmc/qemu/target/arm/tcg/helper-a64.c (revision ed1e71dac96a3bf2236ece81916d4fc1ccbce029)
1 /*
2  *  AArch64 specific helpers
3  *
4  *  Copyright (c) 2013 Alexander Graf <agraf@suse.de>
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "qemu/units.h"
22 #include "cpu.h"
23 #include "gdbstub/helpers.h"
24 #include "exec/helper-proto.h"
25 #include "qemu/host-utils.h"
26 #include "qemu/log.h"
27 #include "qemu/main-loop.h"
28 #include "qemu/bitops.h"
29 #include "internals.h"
30 #include "qemu/crc32c.h"
31 #include "exec/exec-all.h"
32 #include "exec/cpu_ldst.h"
33 #include "qemu/int128.h"
34 #include "qemu/atomic128.h"
35 #include "fpu/softfloat.h"
36 #include <zlib.h> /* for crc32 */
37 
38 /* C2.4.7 Multiply and divide */
39 /* special cases for 0 and LLONG_MIN are mandated by the standard */
40 uint64_t HELPER(udiv64)(uint64_t num, uint64_t den)
41 {
42     if (den == 0) {
43         return 0;
44     }
45     return num / den;
46 }
47 
48 int64_t HELPER(sdiv64)(int64_t num, int64_t den)
49 {
50     if (den == 0) {
51         return 0;
52     }
53     if (num == LLONG_MIN && den == -1) {
54         return LLONG_MIN;
55     }
56     return num / den;
57 }
58 
59 uint64_t HELPER(rbit64)(uint64_t x)
60 {
61     return revbit64(x);
62 }
63 
64 void HELPER(msr_i_spsel)(CPUARMState *env, uint32_t imm)
65 {
66     update_spsel(env, imm);
67 }
68 
69 void HELPER(msr_set_allint_el1)(CPUARMState *env)
70 {
71     /* ALLINT update to PSTATE. */
72     if (arm_hcrx_el2_eff(env) & HCRX_TALLINT) {
73         raise_exception_ra(env, EXCP_UDEF,
74                            syn_aa64_sysregtrap(0, 1, 0, 4, 1, 0x1f, 0), 2,
75                            GETPC());
76     }
77 
78     env->pstate |= PSTATE_ALLINT;
79 }
80 
81 static void daif_check(CPUARMState *env, uint32_t op,
82                        uint32_t imm, uintptr_t ra)
83 {
84     /* DAIF update to PSTATE. This is OK from EL0 only if UMA is set.  */
85     if (arm_current_el(env) == 0 && !(arm_sctlr(env, 0) & SCTLR_UMA)) {
86         raise_exception_ra(env, EXCP_UDEF,
87                            syn_aa64_sysregtrap(0, extract32(op, 0, 3),
88                                                extract32(op, 3, 3), 4,
89                                                imm, 0x1f, 0),
90                            exception_target_el(env), ra);
91     }
92 }
93 
94 void HELPER(msr_i_daifset)(CPUARMState *env, uint32_t imm)
95 {
96     daif_check(env, 0x1e, imm, GETPC());
97     env->daif |= (imm << 6) & PSTATE_DAIF;
98     arm_rebuild_hflags(env);
99 }
100 
101 void HELPER(msr_i_daifclear)(CPUARMState *env, uint32_t imm)
102 {
103     daif_check(env, 0x1f, imm, GETPC());
104     env->daif &= ~((imm << 6) & PSTATE_DAIF);
105     arm_rebuild_hflags(env);
106 }
107 
108 /* Convert a softfloat float_relation_ (as returned by
109  * the float*_compare functions) to the correct ARM
110  * NZCV flag state.
111  */
112 static inline uint32_t float_rel_to_flags(int res)
113 {
114     uint64_t flags;
115     switch (res) {
116     case float_relation_equal:
117         flags = PSTATE_Z | PSTATE_C;
118         break;
119     case float_relation_less:
120         flags = PSTATE_N;
121         break;
122     case float_relation_greater:
123         flags = PSTATE_C;
124         break;
125     case float_relation_unordered:
126     default:
127         flags = PSTATE_C | PSTATE_V;
128         break;
129     }
130     return flags;
131 }
132 
133 uint64_t HELPER(vfp_cmph_a64)(uint32_t x, uint32_t y, void *fp_status)
134 {
135     return float_rel_to_flags(float16_compare_quiet(x, y, fp_status));
136 }
137 
138 uint64_t HELPER(vfp_cmpeh_a64)(uint32_t x, uint32_t y, void *fp_status)
139 {
140     return float_rel_to_flags(float16_compare(x, y, fp_status));
141 }
142 
143 uint64_t HELPER(vfp_cmps_a64)(float32 x, float32 y, void *fp_status)
144 {
145     return float_rel_to_flags(float32_compare_quiet(x, y, fp_status));
146 }
147 
148 uint64_t HELPER(vfp_cmpes_a64)(float32 x, float32 y, void *fp_status)
149 {
150     return float_rel_to_flags(float32_compare(x, y, fp_status));
151 }
152 
153 uint64_t HELPER(vfp_cmpd_a64)(float64 x, float64 y, void *fp_status)
154 {
155     return float_rel_to_flags(float64_compare_quiet(x, y, fp_status));
156 }
157 
158 uint64_t HELPER(vfp_cmped_a64)(float64 x, float64 y, void *fp_status)
159 {
160     return float_rel_to_flags(float64_compare(x, y, fp_status));
161 }
162 
163 float32 HELPER(vfp_mulxs)(float32 a, float32 b, void *fpstp)
164 {
165     float_status *fpst = fpstp;
166 
167     a = float32_squash_input_denormal(a, fpst);
168     b = float32_squash_input_denormal(b, fpst);
169 
170     if ((float32_is_zero(a) && float32_is_infinity(b)) ||
171         (float32_is_infinity(a) && float32_is_zero(b))) {
172         /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
173         return make_float32((1U << 30) |
174                             ((float32_val(a) ^ float32_val(b)) & (1U << 31)));
175     }
176     return float32_mul(a, b, fpst);
177 }
178 
179 float64 HELPER(vfp_mulxd)(float64 a, float64 b, void *fpstp)
180 {
181     float_status *fpst = fpstp;
182 
183     a = float64_squash_input_denormal(a, fpst);
184     b = float64_squash_input_denormal(b, fpst);
185 
186     if ((float64_is_zero(a) && float64_is_infinity(b)) ||
187         (float64_is_infinity(a) && float64_is_zero(b))) {
188         /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
189         return make_float64((1ULL << 62) |
190                             ((float64_val(a) ^ float64_val(b)) & (1ULL << 63)));
191     }
192     return float64_mul(a, b, fpst);
193 }
194 
195 /* 64bit/double versions of the neon float compare functions */
196 uint64_t HELPER(neon_ceq_f64)(float64 a, float64 b, void *fpstp)
197 {
198     float_status *fpst = fpstp;
199     return -float64_eq_quiet(a, b, fpst);
200 }
201 
202 uint64_t HELPER(neon_cge_f64)(float64 a, float64 b, void *fpstp)
203 {
204     float_status *fpst = fpstp;
205     return -float64_le(b, a, fpst);
206 }
207 
208 uint64_t HELPER(neon_cgt_f64)(float64 a, float64 b, void *fpstp)
209 {
210     float_status *fpst = fpstp;
211     return -float64_lt(b, a, fpst);
212 }
213 
214 /* Reciprocal step and sqrt step. Note that unlike the A32/T32
215  * versions, these do a fully fused multiply-add or
216  * multiply-add-and-halve.
217  */
218 
219 uint32_t HELPER(recpsf_f16)(uint32_t a, uint32_t b, void *fpstp)
220 {
221     float_status *fpst = fpstp;
222 
223     a = float16_squash_input_denormal(a, fpst);
224     b = float16_squash_input_denormal(b, fpst);
225 
226     a = float16_chs(a);
227     if ((float16_is_infinity(a) && float16_is_zero(b)) ||
228         (float16_is_infinity(b) && float16_is_zero(a))) {
229         return float16_two;
230     }
231     return float16_muladd(a, b, float16_two, 0, fpst);
232 }
233 
234 float32 HELPER(recpsf_f32)(float32 a, float32 b, void *fpstp)
235 {
236     float_status *fpst = fpstp;
237 
238     a = float32_squash_input_denormal(a, fpst);
239     b = float32_squash_input_denormal(b, fpst);
240 
241     a = float32_chs(a);
242     if ((float32_is_infinity(a) && float32_is_zero(b)) ||
243         (float32_is_infinity(b) && float32_is_zero(a))) {
244         return float32_two;
245     }
246     return float32_muladd(a, b, float32_two, 0, fpst);
247 }
248 
249 float64 HELPER(recpsf_f64)(float64 a, float64 b, void *fpstp)
250 {
251     float_status *fpst = fpstp;
252 
253     a = float64_squash_input_denormal(a, fpst);
254     b = float64_squash_input_denormal(b, fpst);
255 
256     a = float64_chs(a);
257     if ((float64_is_infinity(a) && float64_is_zero(b)) ||
258         (float64_is_infinity(b) && float64_is_zero(a))) {
259         return float64_two;
260     }
261     return float64_muladd(a, b, float64_two, 0, fpst);
262 }
263 
264 uint32_t HELPER(rsqrtsf_f16)(uint32_t a, uint32_t b, void *fpstp)
265 {
266     float_status *fpst = fpstp;
267 
268     a = float16_squash_input_denormal(a, fpst);
269     b = float16_squash_input_denormal(b, fpst);
270 
271     a = float16_chs(a);
272     if ((float16_is_infinity(a) && float16_is_zero(b)) ||
273         (float16_is_infinity(b) && float16_is_zero(a))) {
274         return float16_one_point_five;
275     }
276     return float16_muladd(a, b, float16_three, float_muladd_halve_result, fpst);
277 }
278 
279 float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, void *fpstp)
280 {
281     float_status *fpst = fpstp;
282 
283     a = float32_squash_input_denormal(a, fpst);
284     b = float32_squash_input_denormal(b, fpst);
285 
286     a = float32_chs(a);
287     if ((float32_is_infinity(a) && float32_is_zero(b)) ||
288         (float32_is_infinity(b) && float32_is_zero(a))) {
289         return float32_one_point_five;
290     }
291     return float32_muladd(a, b, float32_three, float_muladd_halve_result, fpst);
292 }
293 
294 float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, void *fpstp)
295 {
296     float_status *fpst = fpstp;
297 
298     a = float64_squash_input_denormal(a, fpst);
299     b = float64_squash_input_denormal(b, fpst);
300 
301     a = float64_chs(a);
302     if ((float64_is_infinity(a) && float64_is_zero(b)) ||
303         (float64_is_infinity(b) && float64_is_zero(a))) {
304         return float64_one_point_five;
305     }
306     return float64_muladd(a, b, float64_three, float_muladd_halve_result, fpst);
307 }
308 
309 /* Floating-point reciprocal exponent - see FPRecpX in ARM ARM */
310 uint32_t HELPER(frecpx_f16)(uint32_t a, void *fpstp)
311 {
312     float_status *fpst = fpstp;
313     uint16_t val16, sbit;
314     int16_t exp;
315 
316     if (float16_is_any_nan(a)) {
317         float16 nan = a;
318         if (float16_is_signaling_nan(a, fpst)) {
319             float_raise(float_flag_invalid, fpst);
320             if (!fpst->default_nan_mode) {
321                 nan = float16_silence_nan(a, fpst);
322             }
323         }
324         if (fpst->default_nan_mode) {
325             nan = float16_default_nan(fpst);
326         }
327         return nan;
328     }
329 
330     a = float16_squash_input_denormal(a, fpst);
331 
332     val16 = float16_val(a);
333     sbit = 0x8000 & val16;
334     exp = extract32(val16, 10, 5);
335 
336     if (exp == 0) {
337         return make_float16(deposit32(sbit, 10, 5, 0x1e));
338     } else {
339         return make_float16(deposit32(sbit, 10, 5, ~exp));
340     }
341 }
342 
343 float32 HELPER(frecpx_f32)(float32 a, void *fpstp)
344 {
345     float_status *fpst = fpstp;
346     uint32_t val32, sbit;
347     int32_t exp;
348 
349     if (float32_is_any_nan(a)) {
350         float32 nan = a;
351         if (float32_is_signaling_nan(a, fpst)) {
352             float_raise(float_flag_invalid, fpst);
353             if (!fpst->default_nan_mode) {
354                 nan = float32_silence_nan(a, fpst);
355             }
356         }
357         if (fpst->default_nan_mode) {
358             nan = float32_default_nan(fpst);
359         }
360         return nan;
361     }
362 
363     a = float32_squash_input_denormal(a, fpst);
364 
365     val32 = float32_val(a);
366     sbit = 0x80000000ULL & val32;
367     exp = extract32(val32, 23, 8);
368 
369     if (exp == 0) {
370         return make_float32(sbit | (0xfe << 23));
371     } else {
372         return make_float32(sbit | (~exp & 0xff) << 23);
373     }
374 }
375 
376 float64 HELPER(frecpx_f64)(float64 a, void *fpstp)
377 {
378     float_status *fpst = fpstp;
379     uint64_t val64, sbit;
380     int64_t exp;
381 
382     if (float64_is_any_nan(a)) {
383         float64 nan = a;
384         if (float64_is_signaling_nan(a, fpst)) {
385             float_raise(float_flag_invalid, fpst);
386             if (!fpst->default_nan_mode) {
387                 nan = float64_silence_nan(a, fpst);
388             }
389         }
390         if (fpst->default_nan_mode) {
391             nan = float64_default_nan(fpst);
392         }
393         return nan;
394     }
395 
396     a = float64_squash_input_denormal(a, fpst);
397 
398     val64 = float64_val(a);
399     sbit = 0x8000000000000000ULL & val64;
400     exp = extract64(float64_val(a), 52, 11);
401 
402     if (exp == 0) {
403         return make_float64(sbit | (0x7feULL << 52));
404     } else {
405         return make_float64(sbit | (~exp & 0x7ffULL) << 52);
406     }
407 }
408 
409 float32 HELPER(fcvtx_f64_to_f32)(float64 a, CPUARMState *env)
410 {
411     float32 r;
412     float_status *fpst = &env->vfp.fp_status;
413     int old = get_float_rounding_mode(fpst);
414 
415     set_float_rounding_mode(float_round_to_odd, fpst);
416     r = float64_to_float32(a, fpst);
417     set_float_rounding_mode(old, fpst);
418     return r;
419 }
420 
421 /* 64-bit versions of the CRC helpers. Note that although the operation
422  * (and the prototypes of crc32c() and crc32() mean that only the bottom
423  * 32 bits of the accumulator and result are used, we pass and return
424  * uint64_t for convenience of the generated code. Unlike the 32-bit
425  * instruction set versions, val may genuinely have 64 bits of data in it.
426  * The upper bytes of val (above the number specified by 'bytes') must have
427  * been zeroed out by the caller.
428  */
429 uint64_t HELPER(crc32_64)(uint64_t acc, uint64_t val, uint32_t bytes)
430 {
431     uint8_t buf[8];
432 
433     stq_le_p(buf, val);
434 
435     /* zlib crc32 converts the accumulator and output to one's complement.  */
436     return crc32(acc ^ 0xffffffff, buf, bytes) ^ 0xffffffff;
437 }
438 
439 uint64_t HELPER(crc32c_64)(uint64_t acc, uint64_t val, uint32_t bytes)
440 {
441     uint8_t buf[8];
442 
443     stq_le_p(buf, val);
444 
445     /* Linux crc32c converts the output to one's complement.  */
446     return crc32c(acc, buf, bytes) ^ 0xffffffff;
447 }
448 
449 /*
450  * AdvSIMD half-precision
451  */
452 
453 #define ADVSIMD_HELPER(name, suffix) HELPER(glue(glue(advsimd_, name), suffix))
454 
455 #define ADVSIMD_HALFOP(name) \
456 uint32_t ADVSIMD_HELPER(name, h)(uint32_t a, uint32_t b, void *fpstp) \
457 { \
458     float_status *fpst = fpstp; \
459     return float16_ ## name(a, b, fpst);    \
460 }
461 
462 ADVSIMD_HALFOP(add)
463 ADVSIMD_HALFOP(sub)
464 ADVSIMD_HALFOP(mul)
465 ADVSIMD_HALFOP(div)
466 ADVSIMD_HALFOP(min)
467 ADVSIMD_HALFOP(max)
468 ADVSIMD_HALFOP(minnum)
469 ADVSIMD_HALFOP(maxnum)
470 
471 #define ADVSIMD_TWOHALFOP(name)                                         \
472 uint32_t ADVSIMD_HELPER(name, 2h)(uint32_t two_a, uint32_t two_b, void *fpstp) \
473 { \
474     float16  a1, a2, b1, b2;                        \
475     uint32_t r1, r2;                                \
476     float_status *fpst = fpstp;                     \
477     a1 = extract32(two_a, 0, 16);                   \
478     a2 = extract32(two_a, 16, 16);                  \
479     b1 = extract32(two_b, 0, 16);                   \
480     b2 = extract32(two_b, 16, 16);                  \
481     r1 = float16_ ## name(a1, b1, fpst);            \
482     r2 = float16_ ## name(a2, b2, fpst);            \
483     return deposit32(r1, 16, 16, r2);               \
484 }
485 
486 ADVSIMD_TWOHALFOP(add)
487 ADVSIMD_TWOHALFOP(sub)
488 ADVSIMD_TWOHALFOP(mul)
489 ADVSIMD_TWOHALFOP(div)
490 ADVSIMD_TWOHALFOP(min)
491 ADVSIMD_TWOHALFOP(max)
492 ADVSIMD_TWOHALFOP(minnum)
493 ADVSIMD_TWOHALFOP(maxnum)
494 
495 /* Data processing - scalar floating-point and advanced SIMD */
496 static float16 float16_mulx(float16 a, float16 b, void *fpstp)
497 {
498     float_status *fpst = fpstp;
499 
500     a = float16_squash_input_denormal(a, fpst);
501     b = float16_squash_input_denormal(b, fpst);
502 
503     if ((float16_is_zero(a) && float16_is_infinity(b)) ||
504         (float16_is_infinity(a) && float16_is_zero(b))) {
505         /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
506         return make_float16((1U << 14) |
507                             ((float16_val(a) ^ float16_val(b)) & (1U << 15)));
508     }
509     return float16_mul(a, b, fpst);
510 }
511 
512 ADVSIMD_HALFOP(mulx)
513 ADVSIMD_TWOHALFOP(mulx)
514 
515 /* fused multiply-accumulate */
516 uint32_t HELPER(advsimd_muladdh)(uint32_t a, uint32_t b, uint32_t c,
517                                  void *fpstp)
518 {
519     float_status *fpst = fpstp;
520     return float16_muladd(a, b, c, 0, fpst);
521 }
522 
523 uint32_t HELPER(advsimd_muladd2h)(uint32_t two_a, uint32_t two_b,
524                                   uint32_t two_c, void *fpstp)
525 {
526     float_status *fpst = fpstp;
527     float16  a1, a2, b1, b2, c1, c2;
528     uint32_t r1, r2;
529     a1 = extract32(two_a, 0, 16);
530     a2 = extract32(two_a, 16, 16);
531     b1 = extract32(two_b, 0, 16);
532     b2 = extract32(two_b, 16, 16);
533     c1 = extract32(two_c, 0, 16);
534     c2 = extract32(two_c, 16, 16);
535     r1 = float16_muladd(a1, b1, c1, 0, fpst);
536     r2 = float16_muladd(a2, b2, c2, 0, fpst);
537     return deposit32(r1, 16, 16, r2);
538 }
539 
/*
 * Floating point comparisons produce an integer result. Softfloat
 * routines return float_relation types which we convert to the
 * all-zeroes / all-ones 16-bit value Neon requires.
 *
 * ADVSIMD_CMPRES(test) evaluates to 0xffff when test is true and 0
 * otherwise.  The whole expansion is parenthesized so that the macro
 * behaves as a single operand when it is embedded in a larger expression
 * (an unparenthesized ?: would otherwise absorb neighbouring operators).
 */

#define ADVSIMD_CMPRES(test) ((test) ? 0xffff : 0)
547 
548 uint32_t HELPER(advsimd_ceq_f16)(uint32_t a, uint32_t b, void *fpstp)
549 {
550     float_status *fpst = fpstp;
551     int compare = float16_compare_quiet(a, b, fpst);
552     return ADVSIMD_CMPRES(compare == float_relation_equal);
553 }
554 
555 uint32_t HELPER(advsimd_cge_f16)(uint32_t a, uint32_t b, void *fpstp)
556 {
557     float_status *fpst = fpstp;
558     int compare = float16_compare(a, b, fpst);
559     return ADVSIMD_CMPRES(compare == float_relation_greater ||
560                           compare == float_relation_equal);
561 }
562 
563 uint32_t HELPER(advsimd_cgt_f16)(uint32_t a, uint32_t b, void *fpstp)
564 {
565     float_status *fpst = fpstp;
566     int compare = float16_compare(a, b, fpst);
567     return ADVSIMD_CMPRES(compare == float_relation_greater);
568 }
569 
570 uint32_t HELPER(advsimd_acge_f16)(uint32_t a, uint32_t b, void *fpstp)
571 {
572     float_status *fpst = fpstp;
573     float16 f0 = float16_abs(a);
574     float16 f1 = float16_abs(b);
575     int compare = float16_compare(f0, f1, fpst);
576     return ADVSIMD_CMPRES(compare == float_relation_greater ||
577                           compare == float_relation_equal);
578 }
579 
580 uint32_t HELPER(advsimd_acgt_f16)(uint32_t a, uint32_t b, void *fpstp)
581 {
582     float_status *fpst = fpstp;
583     float16 f0 = float16_abs(a);
584     float16 f1 = float16_abs(b);
585     int compare = float16_compare(f0, f1, fpst);
586     return ADVSIMD_CMPRES(compare == float_relation_greater);
587 }
588 
589 /* round to integral */
590 uint32_t HELPER(advsimd_rinth_exact)(uint32_t x, void *fp_status)
591 {
592     return float16_round_to_int(x, fp_status);
593 }
594 
595 uint32_t HELPER(advsimd_rinth)(uint32_t x, void *fp_status)
596 {
597     int old_flags = get_float_exception_flags(fp_status), new_flags;
598     float16 ret;
599 
600     ret = float16_round_to_int(x, fp_status);
601 
602     /* Suppress any inexact exceptions the conversion produced */
603     if (!(old_flags & float_flag_inexact)) {
604         new_flags = get_float_exception_flags(fp_status);
605         set_float_exception_flags(new_flags & ~float_flag_inexact, fp_status);
606     }
607 
608     return ret;
609 }
610 
611 static int el_from_spsr(uint32_t spsr)
612 {
613     /* Return the exception level that this SPSR is requesting a return to,
614      * or -1 if it is invalid (an illegal return)
615      */
616     if (spsr & PSTATE_nRW) {
617         switch (spsr & CPSR_M) {
618         case ARM_CPU_MODE_USR:
619             return 0;
620         case ARM_CPU_MODE_HYP:
621             return 2;
622         case ARM_CPU_MODE_FIQ:
623         case ARM_CPU_MODE_IRQ:
624         case ARM_CPU_MODE_SVC:
625         case ARM_CPU_MODE_ABT:
626         case ARM_CPU_MODE_UND:
627         case ARM_CPU_MODE_SYS:
628             return 1;
629         case ARM_CPU_MODE_MON:
630             /* Returning to Mon from AArch64 is never possible,
631              * so this is an illegal return.
632              */
633         default:
634             return -1;
635         }
636     } else {
637         if (extract32(spsr, 1, 1)) {
638             /* Return with reserved M[1] bit set */
639             return -1;
640         }
641         if (extract32(spsr, 0, 4) == 1) {
642             /* return to EL0 with M[0] bit set */
643             return -1;
644         }
645         return extract32(spsr, 2, 2);
646     }
647 }
648 
649 static void cpsr_write_from_spsr_elx(CPUARMState *env,
650                                      uint32_t val)
651 {
652     uint32_t mask;
653 
654     /* Save SPSR_ELx.SS into PSTATE. */
655     env->pstate = (env->pstate & ~PSTATE_SS) | (val & PSTATE_SS);
656     val &= ~PSTATE_SS;
657 
658     /* Move DIT to the correct location for CPSR */
659     if (val & PSTATE_DIT) {
660         val &= ~PSTATE_DIT;
661         val |= CPSR_DIT;
662     }
663 
664     mask = aarch32_cpsr_valid_mask(env->features, \
665         &env_archcpu(env)->isar);
666     cpsr_write(env, val, mask, CPSRWriteRaw);
667 }
668 
669 void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc)
670 {
671     int cur_el = arm_current_el(env);
672     unsigned int spsr_idx = aarch64_banked_spsr_index(cur_el);
673     uint32_t spsr = env->banked_spsr[spsr_idx];
674     int new_el;
675     bool return_to_aa64 = (spsr & PSTATE_nRW) == 0;
676 
677     aarch64_save_sp(env, cur_el);
678 
679     arm_clear_exclusive(env);
680 
681     /* We must squash the PSTATE.SS bit to zero unless both of the
682      * following hold:
683      *  1. debug exceptions are currently disabled
684      *  2. singlestep will be active in the EL we return to
685      * We check 1 here and 2 after we've done the pstate/cpsr write() to
686      * transition to the EL we're going to.
687      */
688     if (arm_generate_debug_exceptions(env)) {
689         spsr &= ~PSTATE_SS;
690     }
691 
692     /*
693      * FEAT_RME forbids return from EL3 with an invalid security state.
694      * We don't need an explicit check for FEAT_RME here because we enforce
695      * in scr_write() that you can't set the NSE bit without it.
696      */
697     if (cur_el == 3 && (env->cp15.scr_el3 & (SCR_NS | SCR_NSE)) == SCR_NSE) {
698         goto illegal_return;
699     }
700 
701     new_el = el_from_spsr(spsr);
702     if (new_el == -1) {
703         goto illegal_return;
704     }
705     if (new_el > cur_el || (new_el == 2 && !arm_is_el2_enabled(env))) {
706         /* Disallow return to an EL which is unimplemented or higher
707          * than the current one.
708          */
709         goto illegal_return;
710     }
711 
712     if (new_el != 0 && arm_el_is_aa64(env, new_el) != return_to_aa64) {
713         /* Return to an EL which is configured for a different register width */
714         goto illegal_return;
715     }
716 
717     if (new_el == 1 && (arm_hcr_el2_eff(env) & HCR_TGE)) {
718         goto illegal_return;
719     }
720 
721     bql_lock();
722     arm_call_pre_el_change_hook(env_archcpu(env));
723     bql_unlock();
724 
725     if (!return_to_aa64) {
726         env->aarch64 = false;
727         /* We do a raw CPSR write because aarch64_sync_64_to_32()
728          * will sort the register banks out for us, and we've already
729          * caught all the bad-mode cases in el_from_spsr().
730          */
731         cpsr_write_from_spsr_elx(env, spsr);
732         if (!arm_singlestep_active(env)) {
733             env->pstate &= ~PSTATE_SS;
734         }
735         aarch64_sync_64_to_32(env);
736 
737         if (spsr & CPSR_T) {
738             env->regs[15] = new_pc & ~0x1;
739         } else {
740             env->regs[15] = new_pc & ~0x3;
741         }
742         helper_rebuild_hflags_a32(env, new_el);
743         qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to "
744                       "AArch32 EL%d PC 0x%" PRIx32 "\n",
745                       cur_el, new_el, env->regs[15]);
746     } else {
747         int tbii;
748 
749         env->aarch64 = true;
750         spsr &= aarch64_pstate_valid_mask(&env_archcpu(env)->isar);
751         pstate_write(env, spsr);
752         if (!arm_singlestep_active(env)) {
753             env->pstate &= ~PSTATE_SS;
754         }
755         aarch64_restore_sp(env, new_el);
756         helper_rebuild_hflags_a64(env, new_el);
757 
758         /*
759          * Apply TBI to the exception return address.  We had to delay this
760          * until after we selected the new EL, so that we could select the
761          * correct TBI+TBID bits.  This is made easier by waiting until after
762          * the hflags rebuild, since we can pull the composite TBII field
763          * from there.
764          */
765         tbii = EX_TBFLAG_A64(env->hflags, TBII);
766         if ((tbii >> extract64(new_pc, 55, 1)) & 1) {
767             /* TBI is enabled. */
768             int core_mmu_idx = arm_env_mmu_index(env);
769             if (regime_has_2_ranges(core_to_aa64_mmu_idx(core_mmu_idx))) {
770                 new_pc = sextract64(new_pc, 0, 56);
771             } else {
772                 new_pc = extract64(new_pc, 0, 56);
773             }
774         }
775         env->pc = new_pc;
776 
777         qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to "
778                       "AArch64 EL%d PC 0x%" PRIx64 "\n",
779                       cur_el, new_el, env->pc);
780     }
781 
782     /*
783      * Note that cur_el can never be 0.  If new_el is 0, then
784      * el0_a64 is return_to_aa64, else el0_a64 is ignored.
785      */
786     aarch64_sve_change_el(env, cur_el, new_el, return_to_aa64);
787 
788     bql_lock();
789     arm_call_el_change_hook(env_archcpu(env));
790     bql_unlock();
791 
792     return;
793 
794 illegal_return:
795     /* Illegal return events of various kinds have architecturally
796      * mandated behaviour:
797      * restore NZCV and DAIF from SPSR_ELx
798      * set PSTATE.IL
799      * restore PC from ELR_ELx
800      * no change to exception level, execution state or stack pointer
801      */
802     env->pstate |= PSTATE_IL;
803     env->pc = new_pc;
804     spsr &= PSTATE_NZCV | PSTATE_DAIF | PSTATE_ALLINT;
805     spsr |= pstate_read(env) & ~(PSTATE_NZCV | PSTATE_DAIF | PSTATE_ALLINT);
806     pstate_write(env, spsr);
807     if (!arm_singlestep_active(env)) {
808         env->pstate &= ~PSTATE_SS;
809     }
810     helper_rebuild_hflags_a64(env, cur_el);
811     qemu_log_mask(LOG_GUEST_ERROR, "Illegal exception return at EL%d: "
812                   "resuming execution at 0x%" PRIx64 "\n", cur_el, env->pc);
813 }
814 
815 void HELPER(dc_zva)(CPUARMState *env, uint64_t vaddr_in)
816 {
817     uintptr_t ra = GETPC();
818 
819     /*
820      * Implement DC ZVA, which zeroes a fixed-length block of memory.
821      * Note that we do not implement the (architecturally mandated)
822      * alignment fault for attempts to use this on Device memory
823      * (which matches the usual QEMU behaviour of not implementing either
824      * alignment faults or any memory attribute handling).
825      */
826     int blocklen = 4 << env_archcpu(env)->dcz_blocksize;
827     uint64_t vaddr = vaddr_in & ~(blocklen - 1);
828     int mmu_idx = arm_env_mmu_index(env);
829     void *mem;
830 
831     /*
832      * Trapless lookup.  In addition to actual invalid page, may
833      * return NULL for I/O, watchpoints, clean pages, etc.
834      */
835     mem = tlb_vaddr_to_host(env, vaddr, MMU_DATA_STORE, mmu_idx);
836 
837 #ifndef CONFIG_USER_ONLY
838     if (unlikely(!mem)) {
839         /*
840          * Trap if accessing an invalid page.  DC_ZVA requires that we supply
841          * the original pointer for an invalid page.  But watchpoints require
842          * that we probe the actual space.  So do both.
843          */
844         (void) probe_write(env, vaddr_in, 1, mmu_idx, ra);
845         mem = probe_write(env, vaddr, blocklen, mmu_idx, ra);
846 
847         if (unlikely(!mem)) {
848             /*
849              * The only remaining reason for mem == NULL is I/O.
850              * Just do a series of byte writes as the architecture demands.
851              */
852             for (int i = 0; i < blocklen; i++) {
853                 cpu_stb_mmuidx_ra(env, vaddr + i, 0, mmu_idx, ra);
854             }
855             return;
856         }
857     }
858 #endif
859 
860     set_helper_retaddr(ra);
861     memset(mem, 0, blocklen);
862     clear_helper_retaddr();
863 }
864 
865 void HELPER(unaligned_access)(CPUARMState *env, uint64_t addr,
866                               uint32_t access_type, uint32_t mmu_idx)
867 {
868     arm_cpu_do_unaligned_access(env_cpu(env), addr, access_type,
869                                 mmu_idx, GETPC());
870 }
871 
872 /* Memory operations (memset, memmove, memcpy) */
873 
874 /*
875  * Return true if the CPY* and SET* insns can execute; compare
876  * pseudocode CheckMOPSEnabled(), though we refactor it a little.
877  */
878 static bool mops_enabled(CPUARMState *env)
879 {
880     int el = arm_current_el(env);
881 
882     if (el < 2 &&
883         (arm_hcr_el2_eff(env) & (HCR_E2H | HCR_TGE)) != (HCR_E2H | HCR_TGE) &&
884         !(arm_hcrx_el2_eff(env) & HCRX_MSCEN)) {
885         return false;
886     }
887 
888     if (el == 0) {
889         if (!el_is_in_host(env, 0)) {
890             return env->cp15.sctlr_el[1] & SCTLR_MSCEN;
891         } else {
892             return env->cp15.sctlr_el[2] & SCTLR_MSCEN;
893         }
894     }
895     return true;
896 }
897 
898 static void check_mops_enabled(CPUARMState *env, uintptr_t ra)
899 {
900     if (!mops_enabled(env)) {
901         raise_exception_ra(env, EXCP_UDEF, syn_uncategorized(),
902                            exception_target_el(env), ra);
903     }
904 }
905 
906 /*
907  * Return the target exception level for an exception due
908  * to mismatched arguments in a FEAT_MOPS copy or set.
909  * Compare pseudocode MismatchedCpySetTargetEL()
910  */
911 static int mops_mismatch_exception_target_el(CPUARMState *env)
912 {
913     int el = arm_current_el(env);
914 
915     if (el > 1) {
916         return el;
917     }
918     if (el == 0 && (arm_hcr_el2_eff(env) & HCR_TGE)) {
919         return 2;
920     }
921     if (el == 1 && (arm_hcrx_el2_eff(env) & HCRX_MCE2)) {
922         return 2;
923     }
924     return 1;
925 }
926 
927 /*
928  * Check whether an M or E instruction was executed with a CF value
929  * indicating the wrong option for this implementation.
930  * Assumes we are always Option A.
931  */
932 static void check_mops_wrong_option(CPUARMState *env, uint32_t syndrome,
933                                     uintptr_t ra)
934 {
935     if (env->CF != 0) {
936         syndrome |= 1 << 17; /* Set the wrong-option bit */
937         raise_exception_ra(env, EXCP_UDEF, syndrome,
938                            mops_mismatch_exception_target_el(env), ra);
939     }
940 }
941 
/*
 * Return the maximum number of bytes we can transfer starting at addr
 * without crossing a page boundary: the distance from addr to the
 * page-aligned address at or above addr + 1.
 */
static uint64_t page_limit(uint64_t addr)
{
    uint64_t next_boundary = TARGET_PAGE_ALIGN(addr + 1);

    return next_boundary - addr;
}
950 
951 /*
952  * Return the number of bytes we can copy starting from addr and working
953  * backwards without crossing a page boundary.
954  */
955 static uint64_t page_limit_rev(uint64_t addr)
956 {
957     return (addr & ~TARGET_PAGE_MASK) + 1;
958 }
959 
/*
 * Perform part of a memory set on an area of guest memory starting at
 * toaddr (a dirty address) and extending for setsize bytes.
 *
 * @env: CPU
 * @toaddr: guest address of the first byte to set (dirty)
 * @setsize: maximum number of bytes to set in this step
 * @data: value to store; handed to memset() or cpu_stb_mmuidx_ra()
 * @memidx: MMU index used for the store
 * @mtedesc: in/out MTE descriptor. Zero means no MTE checks are done.
 *           It is cleared here after a tag check failure on the first
 *           byte has been passed to mte_check_fail().
 * @ra: host return address for any exception raised
 *
 * Returns the number of bytes actually set, which might be less than
 * setsize; the caller should loop until the whole set has been done.
 * The caller should ensure that the guest registers are correct
 * for the possibility that the first byte of the set encounters
 * an exception or watchpoint. We guarantee not to take any faults
 * for bytes other than the first.
 */
static uint64_t set_step(CPUARMState *env, uint64_t toaddr,
                         uint64_t setsize, uint32_t data, int memidx,
                         uint32_t *mtedesc, uintptr_t ra)
{
    void *mem;

    /* A single step never crosses a page boundary */
    setsize = MIN(setsize, page_limit(toaddr));
    if (*mtedesc) {
        /* mtesize == 0 is treated as "the first byte fails its check" */
        uint64_t mtesize = mte_mops_probe(env, toaddr, setsize, *mtedesc);
        if (mtesize == 0) {
            /* Trap, or not. All CPU state is up to date */
            mte_check_fail(env, *mtedesc, toaddr, ra);
            /* Continue, with no further MTE checks required */
            *mtedesc = 0;
        } else {
            /* Advance to the end, or to the tag mismatch */
            setsize = MIN(setsize, mtesize);
        }
    }

    /* From here on toaddr is the cleaned address used for the access */
    toaddr = useronly_clean_ptr(toaddr);
    /*
     * Trapless lookup: returns NULL for invalid page, I/O,
     * watchpoints, clean pages, etc.
     */
    mem = tlb_vaddr_to_host(env, toaddr, MMU_DATA_STORE, memidx);

#ifndef CONFIG_USER_ONLY
    if (unlikely(!mem)) {
        /*
         * Slow-path: just do one byte write. This will handle the
         * watchpoint, invalid page, etc handling correctly.
         * For clean code pages, the next iteration will see
         * the page dirty and will use the fast path.
         */
        cpu_stb_mmuidx_ra(env, toaddr, data, memidx, ra);
        return 1;
    }
#endif
    /* Easy case: just memset the host memory */
    set_helper_retaddr(ra);
    memset(mem, data, setsize);
    clear_helper_retaddr();
    return setsize;
}
1016 
/*
 * Similar to set_step(), but setting tags as well as data. The
 * architecture requires us to do this in 16-byte chunks. SETP accesses
 * are not tag checked; they set the tags.
 *
 * @env: CPU
 * @toaddr: guest address of the first byte to set (dirty)
 * @setsize: maximum number of bytes to set in this step
 * @data: byte value to store
 * @memidx: MMU index used for the store
 * @mtedesc: MTE descriptor; only read here, and passed on to
 *           mte_mops_set_tags()
 * @ra: host return address for any exception raised
 *
 * Returns the number of bytes set: 16 on the slow path, otherwise
 * setsize limited to the end of the page.
 */
static uint64_t set_step_tags(CPUARMState *env, uint64_t toaddr,
                              uint64_t setsize, uint32_t data, int memidx,
                              uint32_t *mtedesc, uintptr_t ra)
{
    void *mem;
    uint64_t cleanaddr;

    /* A single step never crosses a page boundary */
    setsize = MIN(setsize, page_limit(toaddr));

    /* Only the host lookup uses the cleaned address */
    cleanaddr = useronly_clean_ptr(toaddr);
    /*
     * Trapless lookup: returns NULL for invalid page, I/O,
     * watchpoints, clean pages, etc.
     */
    mem = tlb_vaddr_to_host(env, cleanaddr, MMU_DATA_STORE, memidx);

#ifndef CONFIG_USER_ONLY
    if (unlikely(!mem)) {
        /*
         * Slow-path: just do one write. This will handle the
         * watchpoint, invalid page, etc handling correctly.
         * The architecture requires that we do 16 bytes at a time,
         * and we know both ptr and size are 16 byte aligned.
         * For clean code pages, the next iteration will see
         * the page dirty and will use the fast path.
         */
        /* Replicate the data byte into all 8 bytes of a 64-bit word */
        uint64_t repldata = data * 0x0101010101010101ULL;
        MemOpIdx oi16 = make_memop_idx(MO_TE | MO_128, memidx);
        cpu_st16_mmu(env, toaddr, int128_make128(repldata, repldata), oi16, ra);
        mte_mops_set_tags(env, toaddr, 16, *mtedesc);
        return 16;
    }
#endif
    /* Easy case: just memset the host memory */
    set_helper_retaddr(ra);
    memset(mem, data, setsize);
    clear_helper_retaddr();
    /* Data is written; now set the tags for the same range */
    mte_mops_set_tags(env, toaddr, setsize, *mtedesc);
    return setsize;
}
1062 
/*
 * Signature of a function performing one step of a memory set:
 * set_step() and set_step_tags() both have this type, so that
 * do_setp(), do_setm() and do_sete() can be handed either of them.
 * The return value is the number of bytes the step actually set.
 */
typedef uint64_t StepFn(CPUARMState *env, uint64_t toaddr,
                        uint64_t setsize, uint32_t data,
                        int memidx, uint32_t *mtedesc, uintptr_t ra);
1066 
/*
 * Extract the destination register number from a MOPS exception
 * syndrome value: a 5-bit field starting at bit 10.
 */
static int mops_destreg(uint32_t syndrome)
{
    const int field_start = 10;
    const int field_len = 5;

    return extract32(syndrome, field_start, field_len);
}
1072 
/*
 * Extract the source register number from a MOPS exception
 * syndrome value: a 5-bit field starting at bit 5.
 */
static int mops_srcreg(uint32_t syndrome)
{
    const int field_start = 5;
    const int field_len = 5;

    return extract32(syndrome, field_start, field_len);
}
1077 
/*
 * Extract the size register number from a MOPS exception
 * syndrome value: a 5-bit field starting at bit 0.
 */
static int mops_sizereg(uint32_t syndrome)
{
    const int field_start = 0;
    const int field_len = 5;

    return extract32(syndrome, field_start, field_len);
}
1082 
/*
 * Decide from the TBI and TCMA bits whether MTE checks have to be
 * done for an access through ptr. This is needed only once per
 * MOPS insn, not for every page.
 *
 * Beware the opposite senses of the two helpers: tbi_check() returns
 * true for "access checked", whereas tcma_check() returns true for
 * "access unchecked". tcma_check() is only consulted when tbi_check()
 * says the access is checked.
 */
static bool mte_checks_needed(uint64_t ptr, uint32_t desc)
{
    int bit55 = extract64(ptr, 55, 1);

    return tbi_check(desc, bit55) &&
           !tcma_check(desc, bit55, allocation_tag_from_addr(ptr));
}
1100 
1101 /* Take an exception if the SETG addr/size are not granule aligned */
1102 static void check_setg_alignment(CPUARMState *env, uint64_t ptr, uint64_t size,
1103                                  uint32_t memidx, uintptr_t ra)
1104 {
1105     if ((size != 0 && !QEMU_IS_ALIGNED(ptr, TAG_GRANULE)) ||
1106         !QEMU_IS_ALIGNED(size, TAG_GRANULE)) {
1107         arm_cpu_do_unaligned_access(env_cpu(env), ptr, MMU_DATA_STORE,
1108                                     memidx, ra);
1109 
1110     }
1111 }
1112 
1113 static uint64_t arm_reg_or_xzr(CPUARMState *env, int reg)
1114 {
1115     /*
1116      * Runtime equivalent of cpu_reg() -- return the CPU register value,
1117      * for contexts when index 31 means XZR (not SP).
1118      */
1119     return reg == 31 ? 0 : env->xregs[reg];
1120 }
1121 
/*
 * For the Memory Set operation, our implementation chooses
 * always to use "option A", where we update Xd to the final
 * address in the SETP insn, and set Xn to be -(bytes remaining).
 * On SETM and SETE insns we only need update Xn.
 *
 * This is the prologue (SETP) stage: it sets bytes up to the next
 * page boundary and then converts Xd/Xn to the Option A format.
 *
 * @env: CPU
 * @syndrome: syndrome value for mismatch exceptions
 * (also contains the register numbers we need to use)
 * @mtedesc: MTE descriptor word
 * @stepfn: function which does a single part of the set operation
 * @is_setg: true if this is the tag-setting SETG variant
 * @ra: host return address for any exception raised
 */
static void do_setp(CPUARMState *env, uint32_t syndrome, uint32_t mtedesc,
                    StepFn *stepfn, bool is_setg, uintptr_t ra)
{
    /* Prologue: we choose to do up to the next page boundary */
    int rd = mops_destreg(syndrome);
    int rs = mops_srcreg(syndrome);
    int rn = mops_sizereg(syndrome);
    uint8_t data = arm_reg_or_xzr(env, rs);
    uint32_t memidx = FIELD_EX32(mtedesc, MTEDESC, MIDX);
    uint64_t toaddr = env->xregs[rd];
    uint64_t setsize = env->xregs[rn];
    uint64_t stagesetsize, step;

    check_mops_enabled(env, ra);

    /*
     * Saturate the size to INT64_MAX; for SETG additionally round
     * the saturated value down to a multiple of 16.
     */
    if (setsize > INT64_MAX) {
        setsize = INT64_MAX;
        if (is_setg) {
            setsize &= ~0xf;
        }
    }

    if (unlikely(is_setg)) {
        check_setg_alignment(env, toaddr, setsize, memidx, ra);
    } else if (!mte_checks_needed(toaddr, mtedesc)) {
        /* A zero descriptor tells the step function to skip MTE checks */
        mtedesc = 0;
    }

    stagesetsize = MIN(setsize, page_limit(toaddr));
    while (stagesetsize) {
        /*
         * Write back Xd and Xn before each step, so the guest registers
         * are correct if the first byte of the step takes an exception.
         */
        env->xregs[rd] = toaddr;
        env->xregs[rn] = setsize;
        step = stepfn(env, toaddr, stagesetsize, data, memidx, &mtedesc, ra);
        toaddr += step;
        setsize -= step;
        stagesetsize -= step;
    }
    /* Insn completed, so update registers to the Option A format */
    env->xregs[rd] = toaddr + setsize;
    env->xregs[rn] = -setsize;

    /* Set NZCV = 0000 to indicate we are an Option A implementation */
    env->NF = 0;
    env->ZF = 1; /* our env->ZF encoding is inverted */
    env->CF = 0;
    env->VF = 0;
    return;
}
1183 
1184 void HELPER(setp)(CPUARMState *env, uint32_t syndrome, uint32_t mtedesc)
1185 {
1186     do_setp(env, syndrome, mtedesc, set_step, false, GETPC());
1187 }
1188 
1189 void HELPER(setgp)(CPUARMState *env, uint32_t syndrome, uint32_t mtedesc)
1190 {
1191     do_setp(env, syndrome, mtedesc, set_step_tags, true, GETPC());
1192 }
1193 
/*
 * Main (SETM) stage of the Memory Set operation.
 *
 * On entry the registers are read in Option A format: Xn is taken as
 * -(bytes remaining) and Xd + Xn as the address of the next byte to
 * set. This stage sets the remaining size rounded down to a multiple
 * of the page size, updating Xn after every step, and leaves the
 * last partial page to SETE.
 *
 * @env: CPU
 * @syndrome: syndrome value for mismatch exceptions
 * (also contains the register numbers we need to use)
 * @mtedesc: MTE descriptor word
 * @stepfn: function which does a single part of the set operation
 * @is_setg: true if this is the tag-setting SETG variant
 * @ra: host return address for any exception raised
 */
static void do_setm(CPUARMState *env, uint32_t syndrome, uint32_t mtedesc,
                    StepFn *stepfn, bool is_setg, uintptr_t ra)
{
    /* Main: we choose to do all the full-page chunks */
    CPUState *cs = env_cpu(env);
    int rd = mops_destreg(syndrome);
    int rs = mops_srcreg(syndrome);
    int rn = mops_sizereg(syndrome);
    uint8_t data = arm_reg_or_xzr(env, rs);
    uint64_t toaddr = env->xregs[rd] + env->xregs[rn];
    uint64_t setsize = -env->xregs[rn];
    uint32_t memidx = FIELD_EX32(mtedesc, MTEDESC, MIDX);
    uint64_t step, stagesetsize;

    check_mops_enabled(env, ra);

    /*
     * We're allowed to NOP out "no data to copy" before the consistency
     * checks; we choose to do so.
     */
    if (env->xregs[rn] == 0) {
        return;
    }

    check_mops_wrong_option(env, syndrome, ra);

    /*
     * Our implementation will work fine even if we have an unaligned
     * destination address, and because we update Xn every time around
     * the loop below and the return value from stepfn() may be less
     * than requested, we might find toaddr is unaligned. So we don't
     * have an IMPDEF check for alignment here.
     */

    if (unlikely(is_setg)) {
        check_setg_alignment(env, toaddr, setsize, memidx, ra);
    } else if (!mte_checks_needed(toaddr, mtedesc)) {
        /* A zero descriptor tells the step function to skip MTE checks */
        mtedesc = 0;
    }

    /* Do the actual memset: we leave the last partial page to SETE */
    stagesetsize = setsize & TARGET_PAGE_MASK;
    while (stagesetsize > 0) {
        step = stepfn(env, toaddr, stagesetsize, data, memidx, &mtedesc, ra);
        toaddr += step;
        setsize -= step;
        stagesetsize -= step;
        /* Keep Xn in step with the progress made so far */
        env->xregs[rn] = -setsize;
        /*
         * Only look for a pending exit request while work remains.
         * NOTE(review): this presumably relies on the insn being
         * re-executed later and resuming from the updated Xn -- confirm.
         */
        if (stagesetsize > 0 && unlikely(cpu_loop_exit_requested(cs))) {
            cpu_loop_exit_restore(cs, ra);
        }
    }
}
1247 
1248 void HELPER(setm)(CPUARMState *env, uint32_t syndrome, uint32_t mtedesc)
1249 {
1250     do_setm(env, syndrome, mtedesc, set_step, false, GETPC());
1251 }
1252 
1253 void HELPER(setgm)(CPUARMState *env, uint32_t syndrome, uint32_t mtedesc)
1254 {
1255     do_setm(env, syndrome, mtedesc, set_step_tags, true, GETPC());
1256 }
1257 
/*
 * Epilogue (SETE) stage of the Memory Set operation.
 *
 * On entry the registers are read in Option A format: Xn is taken as
 * -(bytes remaining) and Xd + Xn as the address of the next byte to
 * set. The remaining size must be less than a page, otherwise a
 * mismatch exception is raised; all remaining bytes are then set,
 * with Xn updated after every step.
 *
 * @env: CPU
 * @syndrome: syndrome value for mismatch exceptions
 * (also contains the register numbers we need to use)
 * @mtedesc: MTE descriptor word
 * @stepfn: function which does a single part of the set operation
 * @is_setg: true if this is the tag-setting SETG variant
 * @ra: host return address for any exception raised
 */
static void do_sete(CPUARMState *env, uint32_t syndrome, uint32_t mtedesc,
                    StepFn *stepfn, bool is_setg, uintptr_t ra)
{
    /* Epilogue: do the last partial page */
    int rd = mops_destreg(syndrome);
    int rs = mops_srcreg(syndrome);
    int rn = mops_sizereg(syndrome);
    uint8_t data = arm_reg_or_xzr(env, rs);
    uint64_t toaddr = env->xregs[rd] + env->xregs[rn];
    uint64_t setsize = -env->xregs[rn];
    uint32_t memidx = FIELD_EX32(mtedesc, MTEDESC, MIDX);
    uint64_t step;

    check_mops_enabled(env, ra);

    /*
     * We're allowed to NOP out "no data to copy" before the consistency
     * checks; we choose to do so.
     */
    if (setsize == 0) {
        return;
    }

    check_mops_wrong_option(env, syndrome, ra);

    /*
     * Our implementation has no address alignment requirements, but
     * we do want to enforce the "less than a page" size requirement,
     * so we don't need to have the "check for interrupts" here.
     */
    if (setsize >= TARGET_PAGE_SIZE) {
        raise_exception_ra(env, EXCP_UDEF, syndrome,
                           mops_mismatch_exception_target_el(env), ra);
    }

    if (unlikely(is_setg)) {
        check_setg_alignment(env, toaddr, setsize, memidx, ra);
    } else if (!mte_checks_needed(toaddr, mtedesc)) {
        /* A zero descriptor tells the step function to skip MTE checks */
        mtedesc = 0;
    }

    /* Do the actual memset */
    while (setsize > 0) {
        step = stepfn(env, toaddr, setsize, data, memidx, &mtedesc, ra);
        toaddr += step;
        setsize -= step;
        /* Keep Xn in step with the progress made so far */
        env->xregs[rn] = -setsize;
    }
}
1307 
1308 void HELPER(sete)(CPUARMState *env, uint32_t syndrome, uint32_t mtedesc)
1309 {
1310     do_sete(env, syndrome, mtedesc, set_step, false, GETPC());
1311 }
1312 
1313 void HELPER(setge)(CPUARMState *env, uint32_t syndrome, uint32_t mtedesc)
1314 {
1315     do_sete(env, syndrome, mtedesc, set_step_tags, true, GETPC());
1316 }
1317 
/*
 * Perform part of a memory copy from the guest memory at fromaddr
 * and extending for copysize bytes, to the guest memory at
 * toaddr. Both addresses are dirty.
 *
 * @env: CPU
 * @toaddr: guest address of the first destination byte (dirty)
 * @fromaddr: guest address of the first source byte (dirty)
 * @copysize: maximum number of bytes to copy in this step
 * @wmemidx: MMU index used for the writes
 * @rmemidx: MMU index used for the reads
 * @wdesc: in/out MTE descriptor for the writes; zero means no checks.
 *         Cleared once a failure on the first byte has been reported.
 * @rdesc: in/out MTE descriptor for the reads; handled like @wdesc
 * @ra: host return address for any exception raised
 *
 * Returns the number of bytes actually copied, which might be less than
 * copysize; the caller should loop until the whole copy has been done.
 * The caller should ensure that the guest registers are correct
 * for the possibility that the first byte of the copy encounters
 * an exception or watchpoint. We guarantee not to take any faults
 * for bytes other than the first.
 */
static uint64_t copy_step(CPUARMState *env, uint64_t toaddr, uint64_t fromaddr,
                          uint64_t copysize, int wmemidx, int rmemidx,
                          uint32_t *wdesc, uint32_t *rdesc, uintptr_t ra)
{
    void *rmem;
    void *wmem;

    /* Don't cross a page boundary on either source or destination */
    copysize = MIN(copysize, page_limit(toaddr));
    copysize = MIN(copysize, page_limit(fromaddr));
    /*
     * Handle MTE tag checks: either handle the tag mismatch for byte 0,
     * or else copy up to but not including the byte with the mismatch.
     */
    if (*rdesc) {
        uint64_t mtesize = mte_mops_probe(env, fromaddr, copysize, *rdesc);
        if (mtesize == 0) {
            mte_check_fail(env, *rdesc, fromaddr, ra);
            /* No further read-side MTE checks after reporting the failure */
            *rdesc = 0;
        } else {
            copysize = MIN(copysize, mtesize);
        }
    }
    if (*wdesc) {
        uint64_t mtesize = mte_mops_probe(env, toaddr, copysize, *wdesc);
        if (mtesize == 0) {
            mte_check_fail(env, *wdesc, toaddr, ra);
            /* No further write-side MTE checks after reporting the failure */
            *wdesc = 0;
        } else {
            copysize = MIN(copysize, mtesize);
        }
    }

    /* From here on both addresses are the cleaned ones used for access */
    toaddr = useronly_clean_ptr(toaddr);
    fromaddr = useronly_clean_ptr(fromaddr);
    /* Trapless lookup of whether we can get a host memory pointer */
    wmem = tlb_vaddr_to_host(env, toaddr, MMU_DATA_STORE, wmemidx);
    rmem = tlb_vaddr_to_host(env, fromaddr, MMU_DATA_LOAD, rmemidx);

#ifndef CONFIG_USER_ONLY
    /*
     * If we don't have host memory for both source and dest then just
     * do a single byte copy. This will handle watchpoints, invalid pages,
     * etc correctly. For clean code pages, the next iteration will see
     * the page dirty and will use the fast path.
     */
    if (unlikely(!rmem || !wmem)) {
        uint8_t byte;
        /* Use the host pointer on whichever side has one */
        if (rmem) {
            byte = *(uint8_t *)rmem;
        } else {
            byte = cpu_ldub_mmuidx_ra(env, fromaddr, rmemidx, ra);
        }
        if (wmem) {
            *(uint8_t *)wmem = byte;
        } else {
            cpu_stb_mmuidx_ra(env, toaddr, byte, wmemidx, ra);
        }
        return 1;
    }
#endif
    /* Easy case: just memmove the host memory */
    set_helper_retaddr(ra);
    memmove(wmem, rmem, copysize);
    clear_helper_retaddr();
    return copysize;
}
1397 
/*
 * Do part of a backwards memory copy. Here toaddr and fromaddr point
 * to the *last* byte to be copied.
 *
 * @env: CPU
 * @toaddr: guest address of the last destination byte (dirty)
 * @fromaddr: guest address of the last source byte (dirty)
 * @copysize: maximum number of bytes to copy in this step
 * @wmemidx: MMU index used for the writes
 * @rmemidx: MMU index used for the reads
 * @wdesc: in/out MTE descriptor for the writes; zero means no checks.
 *         Cleared once a failure on the first byte has been reported.
 * @rdesc: in/out MTE descriptor for the reads; handled like @wdesc
 * @ra: host return address for any exception raised
 *
 * Returns the number of bytes actually copied, which might be less
 * than copysize.
 */
static uint64_t copy_step_rev(CPUARMState *env, uint64_t toaddr,
                              uint64_t fromaddr,
                              uint64_t copysize, int wmemidx, int rmemidx,
                              uint32_t *wdesc, uint32_t *rdesc, uintptr_t ra)
{
    void *rmem;
    void *wmem;

    /* Don't cross a page boundary on either source or destination */
    copysize = MIN(copysize, page_limit_rev(toaddr));
    copysize = MIN(copysize, page_limit_rev(fromaddr));

    /*
     * Handle MTE tag checks: either handle the tag mismatch for byte 0,
     * or else copy up to but not including the byte with the mismatch.
     */
    if (*rdesc) {
        uint64_t mtesize = mte_mops_probe_rev(env, fromaddr, copysize, *rdesc);
        if (mtesize == 0) {
            mte_check_fail(env, *rdesc, fromaddr, ra);
            /* No further read-side MTE checks after reporting the failure */
            *rdesc = 0;
        } else {
            copysize = MIN(copysize, mtesize);
        }
    }
    if (*wdesc) {
        uint64_t mtesize = mte_mops_probe_rev(env, toaddr, copysize, *wdesc);
        if (mtesize == 0) {
            mte_check_fail(env, *wdesc, toaddr, ra);
            /* No further write-side MTE checks after reporting the failure */
            *wdesc = 0;
        } else {
            copysize = MIN(copysize, mtesize);
        }
    }

    /* From here on both addresses are the cleaned ones used for access */
    toaddr = useronly_clean_ptr(toaddr);
    fromaddr = useronly_clean_ptr(fromaddr);
    /* Trapless lookup of whether we can get a host memory pointer */
    wmem = tlb_vaddr_to_host(env, toaddr, MMU_DATA_STORE, wmemidx);
    rmem = tlb_vaddr_to_host(env, fromaddr, MMU_DATA_LOAD, rmemidx);

#ifndef CONFIG_USER_ONLY
    /*
     * If we don't have host memory for both source and dest then just
     * do a single byte copy. This will handle watchpoints, invalid pages,
     * etc correctly. For clean code pages, the next iteration will see
     * the page dirty and will use the fast path.
     */
    if (unlikely(!rmem || !wmem)) {
        uint8_t byte;
        /* Use the host pointer on whichever side has one */
        if (rmem) {
            byte = *(uint8_t *)rmem;
        } else {
            byte = cpu_ldub_mmuidx_ra(env, fromaddr, rmemidx, ra);
        }
        if (wmem) {
            *(uint8_t *)wmem = byte;
        } else {
            cpu_stb_mmuidx_ra(env, toaddr, byte, wmemidx, ra);
        }
        return 1;
    }
#endif
    /*
     * Easy case: just memmove the host memory. Note that wmem and
     * rmem here point to the *last* byte to copy.
     */
    set_helper_retaddr(ra);
    /* Step both pointers back to the lowest byte of the range */
    memmove(wmem - (copysize - 1), rmem - (copysize - 1), copysize);
    clear_helper_retaddr();
    return copysize;
}
1474 
1475 /*
1476  * for the Memory Copy operation, our implementation chooses always
1477  * to use "option A", where we update Xd and Xs to the final addresses
1478  * in the CPYP insn, and then in CPYM and CPYE only need to update Xn.
1479  *
1480  * @env: CPU
1481  * @syndrome: syndrome value for mismatch exceptions
1482  * (also contains the register numbers we need to use)
1483  * @wdesc: MTE descriptor for the writes (destination)
1484  * @rdesc: MTE descriptor for the reads (source)
1485  * @move: true if this is CPY (memmove), false for CPYF (memcpy forwards)
1486  */
1487 static void do_cpyp(CPUARMState *env, uint32_t syndrome, uint32_t wdesc,
1488                     uint32_t rdesc, uint32_t move, uintptr_t ra)
1489 {
1490     int rd = mops_destreg(syndrome);
1491     int rs = mops_srcreg(syndrome);
1492     int rn = mops_sizereg(syndrome);
1493     uint32_t rmemidx = FIELD_EX32(rdesc, MTEDESC, MIDX);
1494     uint32_t wmemidx = FIELD_EX32(wdesc, MTEDESC, MIDX);
1495     bool forwards = true;
1496     uint64_t toaddr = env->xregs[rd];
1497     uint64_t fromaddr = env->xregs[rs];
1498     uint64_t copysize = env->xregs[rn];
1499     uint64_t stagecopysize, step;
1500 
1501     check_mops_enabled(env, ra);
1502 
1503 
1504     if (move) {
1505         /*
1506          * Copy backwards if necessary. The direction for a non-overlapping
1507          * copy is IMPDEF; we choose forwards.
1508          */
1509         if (copysize > 0x007FFFFFFFFFFFFFULL) {
1510             copysize = 0x007FFFFFFFFFFFFFULL;
1511         }
1512         uint64_t fs = extract64(fromaddr, 0, 56);
1513         uint64_t ts = extract64(toaddr, 0, 56);
1514         uint64_t fe = extract64(fromaddr + copysize, 0, 56);
1515 
1516         if (fs < ts && fe > ts) {
1517             forwards = false;
1518         }
1519     } else {
1520         if (copysize > INT64_MAX) {
1521             copysize = INT64_MAX;
1522         }
1523     }
1524 
1525     if (!mte_checks_needed(fromaddr, rdesc)) {
1526         rdesc = 0;
1527     }
1528     if (!mte_checks_needed(toaddr, wdesc)) {
1529         wdesc = 0;
1530     }
1531 
1532     if (forwards) {
1533         stagecopysize = MIN(copysize, page_limit(toaddr));
1534         stagecopysize = MIN(stagecopysize, page_limit(fromaddr));
1535         while (stagecopysize) {
1536             env->xregs[rd] = toaddr;
1537             env->xregs[rs] = fromaddr;
1538             env->xregs[rn] = copysize;
1539             step = copy_step(env, toaddr, fromaddr, stagecopysize,
1540                              wmemidx, rmemidx, &wdesc, &rdesc, ra);
1541             toaddr += step;
1542             fromaddr += step;
1543             copysize -= step;
1544             stagecopysize -= step;
1545         }
1546         /* Insn completed, so update registers to the Option A format */
1547         env->xregs[rd] = toaddr + copysize;
1548         env->xregs[rs] = fromaddr + copysize;
1549         env->xregs[rn] = -copysize;
1550     } else {
1551         /*
1552          * In a reverse copy the to and from addrs in Xs and Xd are the start
1553          * of the range, but it's more convenient for us to work with pointers
1554          * to the last byte being copied.
1555          */
1556         toaddr += copysize - 1;
1557         fromaddr += copysize - 1;
1558         stagecopysize = MIN(copysize, page_limit_rev(toaddr));
1559         stagecopysize = MIN(stagecopysize, page_limit_rev(fromaddr));
1560         while (stagecopysize) {
1561             env->xregs[rn] = copysize;
1562             step = copy_step_rev(env, toaddr, fromaddr, stagecopysize,
1563                                  wmemidx, rmemidx, &wdesc, &rdesc, ra);
1564             copysize -= step;
1565             stagecopysize -= step;
1566             toaddr -= step;
1567             fromaddr -= step;
1568         }
1569         /*
1570          * Insn completed, so update registers to the Option A format.
1571          * For a reverse copy this is no different to the CPYP input format.
1572          */
1573         env->xregs[rn] = copysize;
1574     }
1575 
1576     /* Set NZCV = 0000 to indicate we are an Option A implementation */
1577     env->NF = 0;
1578     env->ZF = 1; /* our env->ZF encoding is inverted */
1579     env->CF = 0;
1580     env->VF = 0;
1581     return;
1582 }
1583 
1584 void HELPER(cpyp)(CPUARMState *env, uint32_t syndrome, uint32_t wdesc,
1585                   uint32_t rdesc)
1586 {
1587     do_cpyp(env, syndrome, wdesc, rdesc, true, GETPC());
1588 }
1589 
1590 void HELPER(cpyfp)(CPUARMState *env, uint32_t syndrome, uint32_t wdesc,
1591                    uint32_t rdesc)
1592 {
1593     do_cpyp(env, syndrome, wdesc, rdesc, false, GETPC());
1594 }
1595 
/*
 * Main (CPYM) stage of the Memory Copy operation.
 *
 * The registers are read in the format do_cpyp() leaves behind. For a
 * forwards copy Xn is -(bytes remaining) and Xd + Xn / Xs + Xn are the
 * next bytes to copy. For a backwards copy (only possible when @move
 * is set, and selected by Xn being non-negative) Xn is the number of
 * bytes remaining and Xd / Xs are the start of the range. Copying
 * continues, updating Xn after each step, until less than a page
 * remains.
 *
 * @env: CPU
 * @syndrome: syndrome value for mismatch exceptions
 * (also contains the register numbers we need to use)
 * @wdesc: MTE descriptor for the writes (destination)
 * @rdesc: MTE descriptor for the reads (source)
 * @move: true if this is CPY (memmove), false for CPYF (memcpy forwards)
 * @ra: host return address for any exception raised
 */
static void do_cpym(CPUARMState *env, uint32_t syndrome, uint32_t wdesc,
                    uint32_t rdesc, uint32_t move, uintptr_t ra)
{
    /* Main: we choose to copy until less than a page remaining */
    CPUState *cs = env_cpu(env);
    int rd = mops_destreg(syndrome);
    int rs = mops_srcreg(syndrome);
    int rn = mops_sizereg(syndrome);
    uint32_t rmemidx = FIELD_EX32(rdesc, MTEDESC, MIDX);
    uint32_t wmemidx = FIELD_EX32(wdesc, MTEDESC, MIDX);
    bool forwards = true;
    uint64_t toaddr, fromaddr, copysize, step;

    check_mops_enabled(env, ra);

    /* We choose to NOP out "no data to copy" before consistency checks */
    if (env->xregs[rn] == 0) {
        return;
    }

    check_mops_wrong_option(env, syndrome, ra);

    /* For the memmove form the sign of Xn encodes the direction */
    if (move) {
        forwards = (int64_t)env->xregs[rn] < 0;
    }

    if (forwards) {
        toaddr = env->xregs[rd] + env->xregs[rn];
        fromaddr = env->xregs[rs] + env->xregs[rn];
        copysize = -env->xregs[rn];
    } else {
        copysize = env->xregs[rn];
        /* This toaddr and fromaddr point to the *last* byte to copy */
        toaddr = env->xregs[rd] + copysize - 1;
        fromaddr = env->xregs[rs] + copysize - 1;
    }

    /* A zero descriptor tells the step function to skip MTE checks */
    if (!mte_checks_needed(fromaddr, rdesc)) {
        rdesc = 0;
    }
    if (!mte_checks_needed(toaddr, wdesc)) {
        wdesc = 0;
    }

    /* Our implementation has no particular parameter requirements for CPYM */

    /* Do the actual memmove */
    if (forwards) {
        while (copysize >= TARGET_PAGE_SIZE) {
            step = copy_step(env, toaddr, fromaddr, copysize,
                             wmemidx, rmemidx, &wdesc, &rdesc, ra);
            toaddr += step;
            fromaddr += step;
            copysize -= step;
            /* Keep Xn in step with the progress made so far */
            env->xregs[rn] = -copysize;
            /*
             * Only look for a pending exit request while work remains.
             * NOTE(review): this presumably relies on the insn being
             * re-executed later and resuming from the updated Xn -- confirm.
             */
            if (copysize >= TARGET_PAGE_SIZE &&
                unlikely(cpu_loop_exit_requested(cs))) {
                cpu_loop_exit_restore(cs, ra);
            }
        }
    } else {
        while (copysize >= TARGET_PAGE_SIZE) {
            step = copy_step_rev(env, toaddr, fromaddr, copysize,
                                 wmemidx, rmemidx, &wdesc, &rdesc, ra);
            toaddr -= step;
            fromaddr -= step;
            copysize -= step;
            /* Keep Xn in step with the progress made so far */
            env->xregs[rn] = copysize;
            if (copysize >= TARGET_PAGE_SIZE &&
                unlikely(cpu_loop_exit_requested(cs))) {
                cpu_loop_exit_restore(cs, ra);
            }
        }
    }
}
1671 
1672 void HELPER(cpym)(CPUARMState *env, uint32_t syndrome, uint32_t wdesc,
1673                   uint32_t rdesc)
1674 {
1675     do_cpym(env, syndrome, wdesc, rdesc, true, GETPC());
1676 }
1677 
1678 void HELPER(cpyfm)(CPUARMState *env, uint32_t syndrome, uint32_t wdesc,
1679                    uint32_t rdesc)
1680 {
1681     do_cpym(env, syndrome, wdesc, rdesc, false, GETPC());
1682 }
1683 
/*
 * Epilogue (CPYE) stage of the Memory Copy operation.
 *
 * The registers are read in the same format as in do_cpym(): for a
 * forwards copy Xn is -(bytes remaining); for a backwards copy (only
 * possible when @move is set, and selected by Xn being non-negative)
 * Xn is the number of bytes remaining. The remaining size must be
 * less than a page, otherwise a mismatch exception is raised; all
 * remaining bytes are then copied, with Xn updated after every step.
 *
 * @env: CPU
 * @syndrome: syndrome value for mismatch exceptions
 * (also contains the register numbers we need to use)
 * @wdesc: MTE descriptor for the writes (destination)
 * @rdesc: MTE descriptor for the reads (source)
 * @move: true if this is CPY (memmove), false for CPYF (memcpy forwards)
 * @ra: host return address for any exception raised
 */
static void do_cpye(CPUARMState *env, uint32_t syndrome, uint32_t wdesc,
                    uint32_t rdesc, uint32_t move, uintptr_t ra)
{
    /* Epilogue: do the last partial page */
    int rd = mops_destreg(syndrome);
    int rs = mops_srcreg(syndrome);
    int rn = mops_sizereg(syndrome);
    uint32_t rmemidx = FIELD_EX32(rdesc, MTEDESC, MIDX);
    uint32_t wmemidx = FIELD_EX32(wdesc, MTEDESC, MIDX);
    bool forwards = true;
    uint64_t toaddr, fromaddr, copysize, step;

    check_mops_enabled(env, ra);

    /* We choose to NOP out "no data to copy" before consistency checks */
    if (env->xregs[rn] == 0) {
        return;
    }

    check_mops_wrong_option(env, syndrome, ra);

    /* For the memmove form the sign of Xn encodes the direction */
    if (move) {
        forwards = (int64_t)env->xregs[rn] < 0;
    }

    if (forwards) {
        toaddr = env->xregs[rd] + env->xregs[rn];
        fromaddr = env->xregs[rs] + env->xregs[rn];
        copysize = -env->xregs[rn];
    } else {
        copysize = env->xregs[rn];
        /* This toaddr and fromaddr point to the *last* byte to copy */
        toaddr = env->xregs[rd] + copysize - 1;
        fromaddr = env->xregs[rs] + copysize - 1;
    }

    /* A zero descriptor tells the step function to skip MTE checks */
    if (!mte_checks_needed(fromaddr, rdesc)) {
        rdesc = 0;
    }
    if (!mte_checks_needed(toaddr, wdesc)) {
        wdesc = 0;
    }

    /* Check the size; we don't want to have to do a check-for-interrupts */
    if (copysize >= TARGET_PAGE_SIZE) {
        raise_exception_ra(env, EXCP_UDEF, syndrome,
                           mops_mismatch_exception_target_el(env), ra);
    }

    /* Do the actual memmove */
    if (forwards) {
        while (copysize > 0) {
            step = copy_step(env, toaddr, fromaddr, copysize,
                             wmemidx, rmemidx, &wdesc, &rdesc, ra);
            toaddr += step;
            fromaddr += step;
            copysize -= step;
            /* Keep Xn in step with the progress made so far */
            env->xregs[rn] = -copysize;
        }
    } else {
        while (copysize > 0) {
            step = copy_step_rev(env, toaddr, fromaddr, copysize,
                                 wmemidx, rmemidx, &wdesc, &rdesc, ra);
            toaddr -= step;
            fromaddr -= step;
            copysize -= step;
            /* Keep Xn in step with the progress made so far */
            env->xregs[rn] = copysize;
        }
    }
}
1754 
1755 void HELPER(cpye)(CPUARMState *env, uint32_t syndrome, uint32_t wdesc,
1756                   uint32_t rdesc)
1757 {
1758     do_cpye(env, syndrome, wdesc, rdesc, true, GETPC());
1759 }
1760 
1761 void HELPER(cpyfe)(CPUARMState *env, uint32_t syndrome, uint32_t wdesc,
1762                    uint32_t rdesc)
1763 {
1764     do_cpye(env, syndrome, wdesc, rdesc, false, GETPC());
1765 }
1766 
1767 static bool is_guarded_page(CPUARMState *env, target_ulong addr, uintptr_t ra)
1768 {
1769 #ifdef CONFIG_USER_ONLY
1770     return page_get_flags(addr) & PAGE_BTI;
1771 #else
1772     CPUTLBEntryFull *full;
1773     void *host;
1774     int mmu_idx = cpu_mmu_index(env_cpu(env), true);
1775     int flags = probe_access_full(env, addr, 0, MMU_INST_FETCH, mmu_idx,
1776                                   false, &host, &full, ra);
1777 
1778     assert(!(flags & TLB_INVALID_MASK));
1779     return full->extra.arm.guarded;
1780 #endif
1781 }
1782 
1783 void HELPER(guarded_page_check)(CPUARMState *env)
1784 {
1785     /*
1786      * We have already verified that bti is enabled, and that the
1787      * instruction at PC is not ok for BTYPE.  This is always at
1788      * the beginning of a block, so PC is always up-to-date and
1789      * no unwind is required.
1790      */
1791     if (is_guarded_page(env, env->pc, 0)) {
1792         raise_exception(env, EXCP_UDEF, syn_btitrap(env->btype),
1793                         exception_target_el(env));
1794     }
1795 }
1796 
1797 void HELPER(guarded_page_br)(CPUARMState *env, target_ulong pc)
1798 {
1799     /*
1800      * We have already checked for branch via x16 and x17.
1801      * What remains for choosing BTYPE is checking for a guarded page.
1802      */
1803     env->btype = is_guarded_page(env, pc, GETPC()) ? 3 : 1;
1804 }
1805