xref: /openbmc/qemu/accel/tcg/ldst_atomicity.c.inc (revision 6c1e3906)
1/*
2 * Routines common to user and system emulation of load/store.
3 *
4 *  Copyright (c) 2022 Linaro, Ltd.
5 *
6 * SPDX-License-Identifier: GPL-2.0-or-later
7 *
8 * This work is licensed under the terms of the GNU GPL, version 2 or later.
9 * See the COPYING file in the top-level directory.
10 */
11
/* HAVE_al8: the host can perform an aligned 8-byte atomic access at all. */
#if defined(CONFIG_ATOMIC64)
# define HAVE_al8          true
#else
# define HAVE_al8          false
#endif
/* HAVE_al8_fast: an aligned 8-byte access fits in one atomic register. */
#define HAVE_al8_fast      (ATOMIC_REG_SIZE >= 8)

/*
 * When __alignof(unsigned __int128) < 16, GCC may decline to inline
 * atomics that the host does support (s390x is one example).  The pointer
 * can be given our known alignment with __builtin_assume_aligned, but
 * before GCC 13 that only worked reliably with optimization enabled.  See
 *   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107389
 */
#ifdef CONFIG_ATOMIC128_OPT
# ifndef __OPTIMIZE__
#  define ATTRIBUTE_ATOMIC128_OPT  __attribute__((optimize("O1")))
# endif
# define CONFIG_ATOMIC128
#endif
#if !defined(ATTRIBUTE_ATOMIC128_OPT)
# define ATTRIBUTE_ATOMIC128_OPT
#endif

/*
 * HAVE_al16_fast: native 16-byte atomic load/store is available.
 * HAVE_al16: 16-byte atomicity is available, natively or via cmpxchg.
 */
#ifdef CONFIG_ATOMIC128
# define HAVE_al16_fast    true
# define HAVE_al16         true
#elif defined(CONFIG_CMPXCHG128)
# define HAVE_al16_fast    false
# define HAVE_al16         true
#else
# define HAVE_al16_fast    false
# define HAVE_al16         false
#endif
46
47
48/**
49 * required_atomicity:
50 *
51 * Return the lg2 bytes of atomicity required by @memop for @p.
52 * If the operation must be split into two operations to be
53 * examined separately for atomicity, return -lg2.
54 */
55static int required_atomicity(CPUArchState *env, uintptr_t p, MemOp memop)
56{
57    MemOp atom = memop & MO_ATOM_MASK;
58    MemOp size = memop & MO_SIZE;
59    MemOp half = size ? size - 1 : 0;
60    unsigned tmp;
61    int atmax;
62
63    switch (atom) {
64    case MO_ATOM_NONE:
65        atmax = MO_8;
66        break;
67
68    case MO_ATOM_IFALIGN_PAIR:
69        size = half;
70        /* fall through */
71
72    case MO_ATOM_IFALIGN:
73        tmp = (1 << size) - 1;
74        atmax = p & tmp ? MO_8 : size;
75        break;
76
77    case MO_ATOM_WITHIN16:
78        tmp = p & 15;
79        atmax = (tmp + (1 << size) <= 16 ? size : MO_8);
80        break;
81
82    case MO_ATOM_WITHIN16_PAIR:
83        tmp = p & 15;
84        if (tmp + (1 << size) <= 16) {
85            atmax = size;
86        } else if (tmp + (1 << half) == 16) {
87            /*
88             * The pair exactly straddles the boundary.
89             * Both halves are naturally aligned and atomic.
90             */
91            atmax = half;
92        } else {
93            /*
94             * One of the pair crosses the boundary, and is non-atomic.
95             * The other of the pair does not cross, and is atomic.
96             */
97            atmax = -half;
98        }
99        break;
100
101    case MO_ATOM_SUBALIGN:
102        /*
103         * Examine the alignment of p to determine if there are subobjects
104         * that must be aligned.  Note that we only really need ctz4() --
105         * any more sigificant bits are discarded by the immediately
106         * following comparison.
107         */
108        tmp = ctz32(p);
109        atmax = MIN(size, tmp);
110        break;
111
112    default:
113        g_assert_not_reached();
114    }
115
116    /*
117     * Here we have the architectural atomicity of the operation.
118     * However, when executing in a serial context, we need no extra
119     * host atomicity in order to avoid racing.  This reduction
120     * avoids looping with cpu_loop_exit_atomic.
121     */
122    if (cpu_in_serial_context(env_cpu(env))) {
123        return MO_8;
124    }
125    return atmax;
126}
127
/**
 * load_atomic2:
 * @pv: host address, 2-byte aligned
 *
 * Return the 2 bytes at @pv, read with a single atomic load.
 */
static inline uint16_t load_atomic2(void *pv)
{
    uint16_t *aligned = __builtin_assume_aligned(pv, 2);
    uint16_t val = qatomic_read(aligned);

    return val;
}
139
/**
 * load_atomic4:
 * @pv: host address, 4-byte aligned
 *
 * Return the 4 bytes at @pv, read with a single atomic load.
 */
static inline uint32_t load_atomic4(void *pv)
{
    uint32_t *aligned = __builtin_assume_aligned(pv, 4);
    uint32_t val = qatomic_read(aligned);

    return val;
}
151
152/**
153 * load_atomic8:
154 * @pv: host address
155 *
156 * Atomically load 8 aligned bytes from @pv.
157 */
158static inline uint64_t load_atomic8(void *pv)
159{
160    uint64_t *p = __builtin_assume_aligned(pv, 8);
161
162    qemu_build_assert(HAVE_al8);
163    return qatomic_read__nocheck(p);
164}
165
166/**
167 * load_atomic16:
168 * @pv: host address
169 *
170 * Atomically load 16 aligned bytes from @pv.
171 */
172static inline Int128 ATTRIBUTE_ATOMIC128_OPT
173load_atomic16(void *pv)
174{
175#ifdef CONFIG_ATOMIC128
176    __uint128_t *p = __builtin_assume_aligned(pv, 16);
177    Int128Alias r;
178
179    r.u = qatomic_read__nocheck(p);
180    return r.s;
181#else
182    qemu_build_not_reached();
183#endif
184}
185
186/**
187 * load_atomic8_or_exit:
188 * @env: cpu context
189 * @ra: host unwind address
190 * @pv: host address
191 *
192 * Atomically load 8 aligned bytes from @pv.
193 * If this is not possible, longjmp out to restart serially.
194 */
195static uint64_t load_atomic8_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
196{
197    if (HAVE_al8) {
198        return load_atomic8(pv);
199    }
200
201#ifdef CONFIG_USER_ONLY
202    /*
203     * If the page is not writable, then assume the value is immutable
204     * and requires no locking.  This ignores the case of MAP_SHARED with
205     * another process, because the fallback start_exclusive solution
206     * provides no protection across processes.
207     */
208    if (!page_check_range(h2g(pv), 8, PAGE_WRITE)) {
209        uint64_t *p = __builtin_assume_aligned(pv, 8);
210        return *p;
211    }
212#endif
213
214    /* Ultimate fallback: re-execute in serial context. */
215    cpu_loop_exit_atomic(env_cpu(env), ra);
216}
217
218/**
219 * load_atomic16_or_exit:
220 * @env: cpu context
221 * @ra: host unwind address
222 * @pv: host address
223 *
224 * Atomically load 16 aligned bytes from @pv.
225 * If this is not possible, longjmp out to restart serially.
226 */
227static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
228{
229    Int128 *p = __builtin_assume_aligned(pv, 16);
230
231    if (HAVE_al16_fast) {
232        return load_atomic16(p);
233    }
234
235#ifdef CONFIG_USER_ONLY
236    /*
237     * We can only use cmpxchg to emulate a load if the page is writable.
238     * If the page is not writable, then assume the value is immutable
239     * and requires no locking.  This ignores the case of MAP_SHARED with
240     * another process, because the fallback start_exclusive solution
241     * provides no protection across processes.
242     */
243    if (!page_check_range(h2g(p), 16, PAGE_WRITE)) {
244        return *p;
245    }
246#endif
247
248    /*
249     * In system mode all guest pages are writable, and for user-only
250     * we have just checked writability.  Try cmpxchg.
251     */
252#if defined(CONFIG_CMPXCHG128)
253    /* Swap 0 with 0, with the side-effect of returning the old value. */
254    {
255        Int128Alias r;
256        r.u = __sync_val_compare_and_swap_16((__uint128_t *)p, 0, 0);
257        return r.s;
258    }
259#endif
260
261    /* Ultimate fallback: re-execute in serial context. */
262    cpu_loop_exit_atomic(env_cpu(env), ra);
263}
264
265/**
266 * load_atom_extract_al4x2:
267 * @pv: host address
268 *
269 * Load 4 bytes from @p, from two sequential atomic 4-byte loads.
270 */
271static uint32_t load_atom_extract_al4x2(void *pv)
272{
273    uintptr_t pi = (uintptr_t)pv;
274    int sh = (pi & 3) * 8;
275    uint32_t a, b;
276
277    pv = (void *)(pi & ~3);
278    a = load_atomic4(pv);
279    b = load_atomic4(pv + 4);
280
281    if (HOST_BIG_ENDIAN) {
282        return (a << sh) | (b >> (-sh & 31));
283    } else {
284        return (a >> sh) | (b << (-sh & 31));
285    }
286}
287
288/**
289 * load_atom_extract_al8x2:
290 * @pv: host address
291 *
292 * Load 8 bytes from @p, from two sequential atomic 8-byte loads.
293 */
294static uint64_t load_atom_extract_al8x2(void *pv)
295{
296    uintptr_t pi = (uintptr_t)pv;
297    int sh = (pi & 7) * 8;
298    uint64_t a, b;
299
300    pv = (void *)(pi & ~7);
301    a = load_atomic8(pv);
302    b = load_atomic8(pv + 8);
303
304    if (HOST_BIG_ENDIAN) {
305        return (a << sh) | (b >> (-sh & 63));
306    } else {
307        return (a >> sh) | (b << (-sh & 63));
308    }
309}
310
311/**
312 * load_atom_extract_al8_or_exit:
313 * @env: cpu context
314 * @ra: host unwind address
315 * @pv: host address
316 * @s: object size in bytes, @s <= 4.
317 *
318 * Atomically load @s bytes from @p, when p % s != 0, and [p, p+s-1] does
319 * not cross an 8-byte boundary.  This means that we can perform an atomic
320 * 8-byte load and extract.
321 * The value is returned in the low bits of a uint32_t.
322 */
323static uint32_t load_atom_extract_al8_or_exit(CPUArchState *env, uintptr_t ra,
324                                              void *pv, int s)
325{
326    uintptr_t pi = (uintptr_t)pv;
327    int o = pi & 7;
328    int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8;
329
330    pv = (void *)(pi & ~7);
331    return load_atomic8_or_exit(env, ra, pv) >> shr;
332}
333
334/**
335 * load_atom_extract_al16_or_exit:
336 * @env: cpu context
337 * @ra: host unwind address
338 * @p: host address
339 * @s: object size in bytes, @s <= 8.
340 *
341 * Atomically load @s bytes from @p, when p % 16 < 8
342 * and p % 16 + s > 8.  I.e. does not cross a 16-byte
343 * boundary, but *does* cross an 8-byte boundary.
344 * This is the slow version, so we must have eliminated
345 * any faster load_atom_extract_al8_or_exit case.
346 *
347 * If this is not possible, longjmp out to restart serially.
348 */
349static uint64_t load_atom_extract_al16_or_exit(CPUArchState *env, uintptr_t ra,
350                                               void *pv, int s)
351{
352    uintptr_t pi = (uintptr_t)pv;
353    int o = pi & 7;
354    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
355    Int128 r;
356
357    /*
358     * Note constraints above: p & 8 must be clear.
359     * Provoke SIGBUS if possible otherwise.
360     */
361    pv = (void *)(pi & ~7);
362    r = load_atomic16_or_exit(env, ra, pv);
363
364    r = int128_urshift(r, shr);
365    return int128_getlo(r);
366}
367
368/**
369 * load_atom_extract_al16_or_al8:
370 * @p: host address
371 * @s: object size in bytes, @s <= 8.
372 *
373 * Load @s bytes from @p, when p % s != 0.  If [p, p+s-1] does not
374 * cross an 16-byte boundary then the access must be 16-byte atomic,
375 * otherwise the access must be 8-byte atomic.
376 */
377static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
378load_atom_extract_al16_or_al8(void *pv, int s)
379{
380#if defined(CONFIG_ATOMIC128)
381    uintptr_t pi = (uintptr_t)pv;
382    int o = pi & 7;
383    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
384    __uint128_t r;
385
386    pv = (void *)(pi & ~7);
387    if (pi & 8) {
388        uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8);
389        uint64_t a = qatomic_read__nocheck(p8);
390        uint64_t b = qatomic_read__nocheck(p8 + 1);
391
392        if (HOST_BIG_ENDIAN) {
393            r = ((__uint128_t)a << 64) | b;
394        } else {
395            r = ((__uint128_t)b << 64) | a;
396        }
397    } else {
398        __uint128_t *p16 = __builtin_assume_aligned(pv, 16, 0);
399        r = qatomic_read__nocheck(p16);
400    }
401    return r >> shr;
402#else
403    qemu_build_not_reached();
404#endif
405}
406
407/**
408 * load_atom_4_by_2:
409 * @pv: host address
410 *
411 * Load 4 bytes from @pv, with two 2-byte atomic loads.
412 */
413static inline uint32_t load_atom_4_by_2(void *pv)
414{
415    uint32_t a = load_atomic2(pv);
416    uint32_t b = load_atomic2(pv + 2);
417
418    if (HOST_BIG_ENDIAN) {
419        return (a << 16) | b;
420    } else {
421        return (b << 16) | a;
422    }
423}
424
425/**
426 * load_atom_8_by_2:
427 * @pv: host address
428 *
429 * Load 8 bytes from @pv, with four 2-byte atomic loads.
430 */
431static inline uint64_t load_atom_8_by_2(void *pv)
432{
433    uint32_t a = load_atom_4_by_2(pv);
434    uint32_t b = load_atom_4_by_2(pv + 4);
435
436    if (HOST_BIG_ENDIAN) {
437        return ((uint64_t)a << 32) | b;
438    } else {
439        return ((uint64_t)b << 32) | a;
440    }
441}
442
443/**
444 * load_atom_8_by_4:
445 * @pv: host address
446 *
447 * Load 8 bytes from @pv, with two 4-byte atomic loads.
448 */
449static inline uint64_t load_atom_8_by_4(void *pv)
450{
451    uint32_t a = load_atomic4(pv);
452    uint32_t b = load_atomic4(pv + 4);
453
454    if (HOST_BIG_ENDIAN) {
455        return ((uint64_t)a << 32) | b;
456    } else {
457        return ((uint64_t)b << 32) | a;
458    }
459}
460
461/**
462 * load_atom_8_by_8_or_4:
463 * @pv: host address
464 *
465 * Load 8 bytes from aligned @pv, with at least 4-byte atomicity.
466 */
467static inline uint64_t load_atom_8_by_8_or_4(void *pv)
468{
469    if (HAVE_al8_fast) {
470        return load_atomic8(pv);
471    } else {
472        return load_atom_8_by_4(pv);
473    }
474}
475
476/**
477 * load_atom_2:
478 * @p: host address
479 * @memop: the full memory op
480 *
481 * Load 2 bytes from @p, honoring the atomicity of @memop.
482 */
483static uint16_t load_atom_2(CPUArchState *env, uintptr_t ra,
484                            void *pv, MemOp memop)
485{
486    uintptr_t pi = (uintptr_t)pv;
487    int atmax;
488
489    if (likely((pi & 1) == 0)) {
490        return load_atomic2(pv);
491    }
492    if (HAVE_al16_fast) {
493        return load_atom_extract_al16_or_al8(pv, 2);
494    }
495
496    atmax = required_atomicity(env, pi, memop);
497    switch (atmax) {
498    case MO_8:
499        return lduw_he_p(pv);
500    case MO_16:
501        /* The only case remaining is MO_ATOM_WITHIN16. */
502        if (!HAVE_al8_fast && (pi & 3) == 1) {
503            /* Big or little endian, we want the middle two bytes. */
504            return load_atomic4(pv - 1) >> 8;
505        }
506        if ((pi & 15) != 7) {
507            return load_atom_extract_al8_or_exit(env, ra, pv, 2);
508        }
509        return load_atom_extract_al16_or_exit(env, ra, pv, 2);
510    default:
511        g_assert_not_reached();
512    }
513}
514
515/**
516 * load_atom_4:
517 * @p: host address
518 * @memop: the full memory op
519 *
520 * Load 4 bytes from @p, honoring the atomicity of @memop.
521 */
522static uint32_t load_atom_4(CPUArchState *env, uintptr_t ra,
523                            void *pv, MemOp memop)
524{
525    uintptr_t pi = (uintptr_t)pv;
526    int atmax;
527
528    if (likely((pi & 3) == 0)) {
529        return load_atomic4(pv);
530    }
531    if (HAVE_al16_fast) {
532        return load_atom_extract_al16_or_al8(pv, 4);
533    }
534
535    atmax = required_atomicity(env, pi, memop);
536    switch (atmax) {
537    case MO_8:
538    case MO_16:
539    case -MO_16:
540        /*
541         * For MO_ATOM_IFALIGN, this is more atomicity than required,
542         * but it's trivially supported on all hosts, better than 4
543         * individual byte loads (when the host requires alignment),
544         * and overlaps with the MO_ATOM_SUBALIGN case of p % 2 == 0.
545         */
546        return load_atom_extract_al4x2(pv);
547    case MO_32:
548        if (!(pi & 4)) {
549            return load_atom_extract_al8_or_exit(env, ra, pv, 4);
550        }
551        return load_atom_extract_al16_or_exit(env, ra, pv, 4);
552    default:
553        g_assert_not_reached();
554    }
555}
556
557/**
558 * load_atom_8:
559 * @p: host address
560 * @memop: the full memory op
561 *
562 * Load 8 bytes from @p, honoring the atomicity of @memop.
563 */
564static uint64_t load_atom_8(CPUArchState *env, uintptr_t ra,
565                            void *pv, MemOp memop)
566{
567    uintptr_t pi = (uintptr_t)pv;
568    int atmax;
569
570    /*
571     * If the host does not support 8-byte atomics, wait until we have
572     * examined the atomicity parameters below.
573     */
574    if (HAVE_al8 && likely((pi & 7) == 0)) {
575        return load_atomic8(pv);
576    }
577    if (HAVE_al16_fast) {
578        return load_atom_extract_al16_or_al8(pv, 8);
579    }
580
581    atmax = required_atomicity(env, pi, memop);
582    if (atmax == MO_64) {
583        if (!HAVE_al8 && (pi & 7) == 0) {
584            load_atomic8_or_exit(env, ra, pv);
585        }
586        return load_atom_extract_al16_or_exit(env, ra, pv, 8);
587    }
588    if (HAVE_al8_fast) {
589        return load_atom_extract_al8x2(pv);
590    }
591    switch (atmax) {
592    case MO_8:
593        return ldq_he_p(pv);
594    case MO_16:
595        return load_atom_8_by_2(pv);
596    case MO_32:
597        return load_atom_8_by_4(pv);
598    case -MO_32:
599        if (HAVE_al8) {
600            return load_atom_extract_al8x2(pv);
601        }
602        cpu_loop_exit_atomic(env_cpu(env), ra);
603    default:
604        g_assert_not_reached();
605    }
606}
607
608/**
609 * load_atom_16:
610 * @p: host address
611 * @memop: the full memory op
612 *
613 * Load 16 bytes from @p, honoring the atomicity of @memop.
614 */
615static Int128 load_atom_16(CPUArchState *env, uintptr_t ra,
616                           void *pv, MemOp memop)
617{
618    uintptr_t pi = (uintptr_t)pv;
619    int atmax;
620    Int128 r;
621    uint64_t a, b;
622
623    /*
624     * If the host does not support 16-byte atomics, wait until we have
625     * examined the atomicity parameters below.
626     */
627    if (HAVE_al16_fast && likely((pi & 15) == 0)) {
628        return load_atomic16(pv);
629    }
630
631    atmax = required_atomicity(env, pi, memop);
632    switch (atmax) {
633    case MO_8:
634        memcpy(&r, pv, 16);
635        return r;
636    case MO_16:
637        a = load_atom_8_by_2(pv);
638        b = load_atom_8_by_2(pv + 8);
639        break;
640    case MO_32:
641        a = load_atom_8_by_4(pv);
642        b = load_atom_8_by_4(pv + 8);
643        break;
644    case MO_64:
645        if (!HAVE_al8) {
646            cpu_loop_exit_atomic(env_cpu(env), ra);
647        }
648        a = load_atomic8(pv);
649        b = load_atomic8(pv + 8);
650        break;
651    case -MO_64:
652        if (!HAVE_al8) {
653            cpu_loop_exit_atomic(env_cpu(env), ra);
654        }
655        a = load_atom_extract_al8x2(pv);
656        b = load_atom_extract_al8x2(pv + 8);
657        break;
658    case MO_128:
659        return load_atomic16_or_exit(env, ra, pv);
660    default:
661        g_assert_not_reached();
662    }
663    return int128_make128(HOST_BIG_ENDIAN ? b : a, HOST_BIG_ENDIAN ? a : b);
664}
665
/**
 * store_atomic2:
 * @pv: host address, 2-byte aligned
 * @val: value to store
 *
 * Write the 2 bytes of @val to @pv with a single atomic store.
 */
static inline void store_atomic2(void *pv, uint16_t val)
{
    uint16_t *aligned = __builtin_assume_aligned(pv, 2);

    qatomic_set(aligned, val);
}
678
/**
 * store_atomic4:
 * @pv: host address, 4-byte aligned
 * @val: value to store
 *
 * Write the 4 bytes of @val to @pv with a single atomic store.
 */
static inline void store_atomic4(void *pv, uint32_t val)
{
    uint32_t *aligned = __builtin_assume_aligned(pv, 4);

    qatomic_set(aligned, val);
}
691
692/**
693 * store_atomic8:
694 * @pv: host address
695 * @val: value to store
696 *
697 * Atomically store 8 aligned bytes to @pv.
698 */
699static inline void store_atomic8(void *pv, uint64_t val)
700{
701    uint64_t *p = __builtin_assume_aligned(pv, 8);
702
703    qemu_build_assert(HAVE_al8);
704    qatomic_set__nocheck(p, val);
705}
706
707/**
708 * store_atomic16:
709 * @pv: host address
710 * @val: value to store
711 *
712 * Atomically store 16 aligned bytes to @pv.
713 */
714static inline void ATTRIBUTE_ATOMIC128_OPT
715store_atomic16(void *pv, Int128Alias val)
716{
717#if defined(CONFIG_ATOMIC128)
718    __uint128_t *pu = __builtin_assume_aligned(pv, 16);
719    qatomic_set__nocheck(pu, val.u);
720#elif defined(CONFIG_CMPXCHG128)
721    __uint128_t *pu = __builtin_assume_aligned(pv, 16);
722    __uint128_t o;
723
724    /*
725     * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always
726     * defer to libatomic, so we must use __sync_*_compare_and_swap_16
727     * and accept the sequential consistency that comes with it.
728     */
729    do {
730        o = *pu;
731    } while (!__sync_bool_compare_and_swap_16(pu, o, val.u));
732#else
733    qemu_build_not_reached();
734#endif
735}
736
737/**
738 * store_atom_4x2
739 */
740static inline void store_atom_4_by_2(void *pv, uint32_t val)
741{
742    store_atomic2(pv, val >> (HOST_BIG_ENDIAN ? 16 : 0));
743    store_atomic2(pv + 2, val >> (HOST_BIG_ENDIAN ? 0 : 16));
744}
745
746/**
747 * store_atom_8_by_2
748 */
749static inline void store_atom_8_by_2(void *pv, uint64_t val)
750{
751    store_atom_4_by_2(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
752    store_atom_4_by_2(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
753}
754
755/**
756 * store_atom_8_by_4
757 */
758static inline void store_atom_8_by_4(void *pv, uint64_t val)
759{
760    store_atomic4(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
761    store_atomic4(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
762}
763
764/**
765 * store_atom_insert_al4:
766 * @p: host address
767 * @val: shifted value to store
768 * @msk: mask for value to store
769 *
770 * Atomically store @val to @p, masked by @msk.
771 */
772static void store_atom_insert_al4(uint32_t *p, uint32_t val, uint32_t msk)
773{
774    uint32_t old, new;
775
776    p = __builtin_assume_aligned(p, 4);
777    old = qatomic_read(p);
778    do {
779        new = (old & ~msk) | val;
780    } while (!__atomic_compare_exchange_n(p, &old, new, true,
781                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
782}
783
784/**
785 * store_atom_insert_al8:
786 * @p: host address
787 * @val: shifted value to store
788 * @msk: mask for value to store
789 *
790 * Atomically store @val to @p masked by @msk.
791 */
792static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
793{
794    uint64_t old, new;
795
796    qemu_build_assert(HAVE_al8);
797    p = __builtin_assume_aligned(p, 8);
798    old = qatomic_read__nocheck(p);
799    do {
800        new = (old & ~msk) | val;
801    } while (!__atomic_compare_exchange_n(p, &old, new, true,
802                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
803}
804
805/**
806 * store_atom_insert_al16:
807 * @p: host address
808 * @val: shifted value to store
809 * @msk: mask for value to store
810 *
811 * Atomically store @val to @p masked by @msk.
812 */
813static void ATTRIBUTE_ATOMIC128_OPT
814store_atom_insert_al16(Int128 *ps, Int128Alias val, Int128Alias msk)
815{
816#if defined(CONFIG_ATOMIC128)
817    __uint128_t *pu, old, new;
818
819    /* With CONFIG_ATOMIC128, we can avoid the memory barriers. */
820    pu = __builtin_assume_aligned(ps, 16);
821    old = *pu;
822    do {
823        new = (old & ~msk.u) | val.u;
824    } while (!__atomic_compare_exchange_n(pu, &old, new, true,
825                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
826#elif defined(CONFIG_CMPXCHG128)
827    __uint128_t *pu, old, new;
828
829    /*
830     * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always
831     * defer to libatomic, so we must use __sync_*_compare_and_swap_16
832     * and accept the sequential consistency that comes with it.
833     */
834    pu = __builtin_assume_aligned(ps, 16);
835    do {
836        old = *pu;
837        new = (old & ~msk.u) | val.u;
838    } while (!__sync_bool_compare_and_swap_16(pu, old, new));
839#else
840    qemu_build_not_reached();
841#endif
842}
843
/**
 * store_bytes_leN:
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * Store @size bytes at @pv, one byte at a time, taking them from
 * @val_le in little-endian order (least significant byte first).
 * Returns what is left of @val_le after the stored bytes have been
 * shifted out.
 */
static uint64_t store_bytes_leN(void *pv, int size, uint64_t val_le)
{
    uint8_t *dst = pv;
    int left = size;

    while (left-- > 0) {
        *dst++ = (uint8_t)val_le;
        val_le >>= 8;
    }
    return val_le;
}
861
862/**
863 * store_parts_leN
864 * @pv: host address
865 * @size: number of bytes to store
866 * @val_le: data to store
867 *
868 * As store_bytes_leN, but atomically on each aligned part.
869 */
870G_GNUC_UNUSED
871static uint64_t store_parts_leN(void *pv, int size, uint64_t val_le)
872{
873    do {
874        int n;
875
876        /* Find minimum of alignment and size */
877        switch (((uintptr_t)pv | size) & 7) {
878        case 4:
879            store_atomic4(pv, le32_to_cpu(val_le));
880            val_le >>= 32;
881            n = 4;
882            break;
883        case 2:
884        case 6:
885            store_atomic2(pv, le16_to_cpu(val_le));
886            val_le >>= 16;
887            n = 2;
888            break;
889        default:
890            *(uint8_t *)pv = val_le;
891            val_le >>= 8;
892            n = 1;
893            break;
894        case 0:
895            g_assert_not_reached();
896        }
897        pv += n;
898        size -= n;
899    } while (size != 0);
900
901    return val_le;
902}
903
904/**
905 * store_whole_le4
906 * @pv: host address
907 * @size: number of bytes to store
908 * @val_le: data to store
909 *
910 * As store_bytes_leN, but atomically as a whole.
911 * Four aligned bytes are guaranteed to cover the store.
912 */
913static uint64_t store_whole_le4(void *pv, int size, uint64_t val_le)
914{
915    int sz = size * 8;
916    int o = (uintptr_t)pv & 3;
917    int sh = o * 8;
918    uint32_t m = MAKE_64BIT_MASK(0, sz);
919    uint32_t v;
920
921    if (HOST_BIG_ENDIAN) {
922        v = bswap32(val_le) >> sh;
923        m = bswap32(m) >> sh;
924    } else {
925        v = val_le << sh;
926        m <<= sh;
927    }
928    store_atom_insert_al4(pv - o, v, m);
929    return val_le >> sz;
930}
931
932/**
933 * store_whole_le8
934 * @pv: host address
935 * @size: number of bytes to store
936 * @val_le: data to store
937 *
938 * As store_bytes_leN, but atomically as a whole.
939 * Eight aligned bytes are guaranteed to cover the store.
940 */
941static uint64_t store_whole_le8(void *pv, int size, uint64_t val_le)
942{
943    int sz = size * 8;
944    int o = (uintptr_t)pv & 7;
945    int sh = o * 8;
946    uint64_t m = MAKE_64BIT_MASK(0, sz);
947    uint64_t v;
948
949    qemu_build_assert(HAVE_al8);
950    if (HOST_BIG_ENDIAN) {
951        v = bswap64(val_le) >> sh;
952        m = bswap64(m) >> sh;
953    } else {
954        v = val_le << sh;
955        m <<= sh;
956    }
957    store_atom_insert_al8(pv - o, v, m);
958    return val_le >> sz;
959}
960
961/**
962 * store_whole_le16
963 * @pv: host address
964 * @size: number of bytes to store
965 * @val_le: data to store
966 *
967 * As store_bytes_leN, but atomically as a whole.
968 * 16 aligned bytes are guaranteed to cover the store.
969 */
970static uint64_t store_whole_le16(void *pv, int size, Int128 val_le)
971{
972    int sz = size * 8;
973    int o = (uintptr_t)pv & 15;
974    int sh = o * 8;
975    Int128 m, v;
976
977    qemu_build_assert(HAVE_al16);
978
979    /* Like MAKE_64BIT_MASK(0, sz), but larger. */
980    if (sz <= 64) {
981        m = int128_make64(MAKE_64BIT_MASK(0, sz));
982    } else {
983        m = int128_make128(-1, MAKE_64BIT_MASK(0, sz - 64));
984    }
985
986    if (HOST_BIG_ENDIAN) {
987        v = int128_urshift(bswap128(val_le), sh);
988        m = int128_urshift(bswap128(m), sh);
989    } else {
990        v = int128_lshift(val_le, sh);
991        m = int128_lshift(m, sh);
992    }
993    store_atom_insert_al16(pv - o, v, m);
994
995    /* Unused if sz <= 64. */
996    return int128_gethi(val_le) >> (sz - 64);
997}
998
999/**
1000 * store_atom_2:
1001 * @p: host address
1002 * @val: the value to store
1003 * @memop: the full memory op
1004 *
1005 * Store 2 bytes to @p, honoring the atomicity of @memop.
1006 */
1007static void store_atom_2(CPUArchState *env, uintptr_t ra,
1008                         void *pv, MemOp memop, uint16_t val)
1009{
1010    uintptr_t pi = (uintptr_t)pv;
1011    int atmax;
1012
1013    if (likely((pi & 1) == 0)) {
1014        store_atomic2(pv, val);
1015        return;
1016    }
1017
1018    atmax = required_atomicity(env, pi, memop);
1019    if (atmax == MO_8) {
1020        stw_he_p(pv, val);
1021        return;
1022    }
1023
1024    /*
1025     * The only case remaining is MO_ATOM_WITHIN16.
1026     * Big or little endian, we want the middle two bytes in each test.
1027     */
1028    if ((pi & 3) == 1) {
1029        store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16));
1030        return;
1031    } else if ((pi & 7) == 3) {
1032        if (HAVE_al8) {
1033            store_atom_insert_al8(pv - 3, (uint64_t)val << 24, MAKE_64BIT_MASK(24, 16));
1034            return;
1035        }
1036    } else if ((pi & 15) == 7) {
1037        if (HAVE_al16) {
1038            Int128 v = int128_lshift(int128_make64(val), 56);
1039            Int128 m = int128_lshift(int128_make64(0xffff), 56);
1040            store_atom_insert_al16(pv - 7, v, m);
1041            return;
1042        }
1043    } else {
1044        g_assert_not_reached();
1045    }
1046
1047    cpu_loop_exit_atomic(env_cpu(env), ra);
1048}
1049
1050/**
1051 * store_atom_4:
1052 * @p: host address
1053 * @val: the value to store
1054 * @memop: the full memory op
1055 *
1056 * Store 4 bytes to @p, honoring the atomicity of @memop.
1057 */
1058static void store_atom_4(CPUArchState *env, uintptr_t ra,
1059                         void *pv, MemOp memop, uint32_t val)
1060{
1061    uintptr_t pi = (uintptr_t)pv;
1062    int atmax;
1063
1064    if (likely((pi & 3) == 0)) {
1065        store_atomic4(pv, val);
1066        return;
1067    }
1068
1069    atmax = required_atomicity(env, pi, memop);
1070    switch (atmax) {
1071    case MO_8:
1072        stl_he_p(pv, val);
1073        return;
1074    case MO_16:
1075        store_atom_4_by_2(pv, val);
1076        return;
1077    case -MO_16:
1078        {
1079            uint32_t val_le = cpu_to_le32(val);
1080            int s2 = pi & 3;
1081            int s1 = 4 - s2;
1082
1083            switch (s2) {
1084            case 1:
1085                val_le = store_whole_le4(pv, s1, val_le);
1086                *(uint8_t *)(pv + 3) = val_le;
1087                break;
1088            case 3:
1089                *(uint8_t *)pv = val_le;
1090                store_whole_le4(pv + 1, s2, val_le >> 8);
1091                break;
1092            case 0: /* aligned */
1093            case 2: /* atmax MO_16 */
1094            default:
1095                g_assert_not_reached();
1096            }
1097        }
1098        return;
1099    case MO_32:
1100        if ((pi & 7) < 4) {
1101            if (HAVE_al8) {
1102                store_whole_le8(pv, 4, cpu_to_le32(val));
1103                return;
1104            }
1105        } else {
1106            if (HAVE_al16) {
1107                store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val)));
1108                return;
1109            }
1110        }
1111        cpu_loop_exit_atomic(env_cpu(env), ra);
1112    default:
1113        g_assert_not_reached();
1114    }
1115}
1116
1117/**
1118 * store_atom_8:
1119 * @p: host address
1120 * @val: the value to store
1121 * @memop: the full memory op
1122 *
1123 * Store 8 bytes to @p, honoring the atomicity of @memop.
1124 */
1125static void store_atom_8(CPUArchState *env, uintptr_t ra,
1126                         void *pv, MemOp memop, uint64_t val)
1127{
1128    uintptr_t pi = (uintptr_t)pv;
1129    int atmax;
1130
1131    if (HAVE_al8 && likely((pi & 7) == 0)) {
1132        store_atomic8(pv, val);
1133        return;
1134    }
1135
1136    atmax = required_atomicity(env, pi, memop);
1137    switch (atmax) {
1138    case MO_8:
1139        stq_he_p(pv, val);
1140        return;
1141    case MO_16:
1142        store_atom_8_by_2(pv, val);
1143        return;
1144    case MO_32:
1145        store_atom_8_by_4(pv, val);
1146        return;
1147    case -MO_32:
1148        if (HAVE_al8) {
1149            uint64_t val_le = cpu_to_le64(val);
1150            int s2 = pi & 7;
1151            int s1 = 8 - s2;
1152
1153            switch (s2) {
1154            case 1 ... 3:
1155                val_le = store_whole_le8(pv, s1, val_le);
1156                store_bytes_leN(pv + s1, s2, val_le);
1157                break;
1158            case 5 ... 7:
1159                val_le = store_bytes_leN(pv, s1, val_le);
1160                store_whole_le8(pv + s1, s2, val_le);
1161                break;
1162            case 0: /* aligned */
1163            case 4: /* atmax MO_32 */
1164            default:
1165                g_assert_not_reached();
1166            }
1167            return;
1168        }
1169        break;
1170    case MO_64:
1171        if (HAVE_al16) {
1172            store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val)));
1173            return;
1174        }
1175        break;
1176    default:
1177        g_assert_not_reached();
1178    }
1179    cpu_loop_exit_atomic(env_cpu(env), ra);
1180}
1181
1182/**
1183 * store_atom_16:
1184 * @p: host address
1185 * @val: the value to store
1186 * @memop: the full memory op
1187 *
1188 * Store 16 bytes to @p, honoring the atomicity of @memop.
1189 */
1190static void store_atom_16(CPUArchState *env, uintptr_t ra,
1191                          void *pv, MemOp memop, Int128 val)
1192{
1193    uintptr_t pi = (uintptr_t)pv;
1194    uint64_t a, b;
1195    int atmax;
1196
1197    if (HAVE_al16_fast && likely((pi & 15) == 0)) {
1198        store_atomic16(pv, val);
1199        return;
1200    }
1201
1202    atmax = required_atomicity(env, pi, memop);
1203
1204    a = HOST_BIG_ENDIAN ? int128_gethi(val) : int128_getlo(val);
1205    b = HOST_BIG_ENDIAN ? int128_getlo(val) : int128_gethi(val);
1206    switch (atmax) {
1207    case MO_8:
1208        memcpy(pv, &val, 16);
1209        return;
1210    case MO_16:
1211        store_atom_8_by_2(pv, a);
1212        store_atom_8_by_2(pv + 8, b);
1213        return;
1214    case MO_32:
1215        store_atom_8_by_4(pv, a);
1216        store_atom_8_by_4(pv + 8, b);
1217        return;
1218    case MO_64:
1219        if (HAVE_al8) {
1220            store_atomic8(pv, a);
1221            store_atomic8(pv + 8, b);
1222            return;
1223        }
1224        break;
1225    case -MO_64:
1226        if (HAVE_al16) {
1227            uint64_t val_le;
1228            int s2 = pi & 15;
1229            int s1 = 16 - s2;
1230
1231            if (HOST_BIG_ENDIAN) {
1232                val = bswap128(val);
1233            }
1234            switch (s2) {
1235            case 1 ... 7:
1236                val_le = store_whole_le16(pv, s1, val);
1237                store_bytes_leN(pv + s1, s2, val_le);
1238                break;
1239            case 9 ... 15:
1240                store_bytes_leN(pv, s1, int128_getlo(val));
1241                val = int128_urshift(val, s1 * 8);
1242                store_whole_le16(pv + s1, s2, val);
1243                break;
1244            case 0: /* aligned */
1245            case 8: /* atmax MO_64 */
1246            default:
1247                g_assert_not_reached();
1248            }
1249            return;
1250        }
1251        break;
1252    case MO_128:
1253        if (HAVE_al16) {
1254            store_atomic16(pv, val);
1255            return;
1256        }
1257        break;
1258    default:
1259        g_assert_not_reached();
1260    }
1261    cpu_loop_exit_atomic(env_cpu(env), ra);
1262}
1263