xref: /openbmc/qemu/accel/tcg/ldst_atomicity.c.inc (revision bc9da794)
1/*
2 * Routines common to user and system emulation of load/store.
3 *
4 *  Copyright (c) 2022 Linaro, Ltd.
5 *
6 * SPDX-License-Identifier: GPL-2.0-or-later
7 *
8 * This work is licensed under the terms of the GNU GPL, version 2 or later.
9 * See the COPYING file in the top-level directory.
10 */
11
12#include "host/load-extract-al16-al8.h.inc"
13#include "host/store-insert-al16.h.inc"
14
15#ifdef CONFIG_ATOMIC64
16# define HAVE_al8          true
17#else
18# define HAVE_al8          false
19#endif
20#define HAVE_al8_fast      (ATOMIC_REG_SIZE >= 8)
21
22/**
23 * required_atomicity:
24 *
25 * Return the lg2 bytes of atomicity required by @memop for @p.
26 * If the operation must be split into two operations to be
27 * examined separately for atomicity, return -lg2.
28 */
29static int required_atomicity(CPUState *cpu, uintptr_t p, MemOp memop)
30{
31    MemOp atom = memop & MO_ATOM_MASK;
32    MemOp size = memop & MO_SIZE;
33    MemOp half = size ? size - 1 : 0;
34    unsigned tmp;
35    int atmax;
36
37    switch (atom) {
38    case MO_ATOM_NONE:
39        atmax = MO_8;
40        break;
41
42    case MO_ATOM_IFALIGN_PAIR:
43        size = half;
44        /* fall through */
45
46    case MO_ATOM_IFALIGN:
47        tmp = (1 << size) - 1;
48        atmax = p & tmp ? MO_8 : size;
49        break;
50
51    case MO_ATOM_WITHIN16:
52        tmp = p & 15;
53        atmax = (tmp + (1 << size) <= 16 ? size : MO_8);
54        break;
55
56    case MO_ATOM_WITHIN16_PAIR:
57        tmp = p & 15;
58        if (tmp + (1 << size) <= 16) {
59            atmax = size;
60        } else if (tmp + (1 << half) == 16) {
61            /*
62             * The pair exactly straddles the boundary.
63             * Both halves are naturally aligned and atomic.
64             */
65            atmax = half;
66        } else {
67            /*
68             * One of the pair crosses the boundary, and is non-atomic.
69             * The other of the pair does not cross, and is atomic.
70             */
71            atmax = -half;
72        }
73        break;
74
75    case MO_ATOM_SUBALIGN:
76        /*
77         * Examine the alignment of p to determine if there are subobjects
78         * that must be aligned.  Note that we only really need ctz4() --
79         * any more significant bits are discarded by the immediately
80         * following comparison.
81         */
82        tmp = ctz32(p);
83        atmax = MIN(size, tmp);
84        break;
85
86    default:
87        g_assert_not_reached();
88    }
89
90    /*
91     * Here we have the architectural atomicity of the operation.
92     * However, when executing in a serial context, we need no extra
93     * host atomicity in order to avoid racing.  This reduction
94     * avoids looping with cpu_loop_exit_atomic.
95     */
96    if (cpu_in_serial_context(cpu)) {
97        return MO_8;
98    }
99    return atmax;
100}
101
102/**
103 * load_atomic2:
104 * @pv: host address
105 *
106 * Atomically load 2 aligned bytes from @pv.
107 */
108static inline uint16_t load_atomic2(void *pv)
109{
110    uint16_t *p = __builtin_assume_aligned(pv, 2);
111    return qatomic_read(p);
112}
113
114/**
115 * load_atomic4:
116 * @pv: host address
117 *
118 * Atomically load 4 aligned bytes from @pv.
119 */
120static inline uint32_t load_atomic4(void *pv)
121{
122    uint32_t *p = __builtin_assume_aligned(pv, 4);
123    return qatomic_read(p);
124}
125
126/**
127 * load_atomic8:
128 * @pv: host address
129 *
130 * Atomically load 8 aligned bytes from @pv.
131 */
132static inline uint64_t load_atomic8(void *pv)
133{
134    uint64_t *p = __builtin_assume_aligned(pv, 8);
135
136    qemu_build_assert(HAVE_al8);
137    return qatomic_read__nocheck(p);
138}
139
140/**
141 * load_atomic8_or_exit:
142 * @cpu: generic cpu state
143 * @ra: host unwind address
144 * @pv: host address
145 *
146 * Atomically load 8 aligned bytes from @pv.
147 * If this is not possible, longjmp out to restart serially.
148 */
149static uint64_t load_atomic8_or_exit(CPUState *cpu, uintptr_t ra, void *pv)
150{
151    if (HAVE_al8) {
152        return load_atomic8(pv);
153    }
154
155#ifdef CONFIG_USER_ONLY
156    /*
157     * If the page is not writable, then assume the value is immutable
158     * and requires no locking.  This ignores the case of MAP_SHARED with
159     * another process, because the fallback start_exclusive solution
160     * provides no protection across processes.
161     */
162    WITH_MMAP_LOCK_GUARD() {
163        if (!page_check_range(h2g(pv), 8, PAGE_WRITE_ORG)) {
164            uint64_t *p = __builtin_assume_aligned(pv, 8);
165            return *p;
166        }
167    }
168#endif
169
170    /* Ultimate fallback: re-execute in serial context. */
171    trace_load_atom8_or_exit_fallback(ra);
172    cpu_loop_exit_atomic(cpu, ra);
173}
174
175/**
176 * load_atomic16_or_exit:
177 * @cpu: generic cpu state
178 * @ra: host unwind address
179 * @pv: host address
180 *
181 * Atomically load 16 aligned bytes from @pv.
182 * If this is not possible, longjmp out to restart serially.
183 */
184static Int128 load_atomic16_or_exit(CPUState *cpu, uintptr_t ra, void *pv)
185{
186    Int128 *p = __builtin_assume_aligned(pv, 16);
187
188    if (HAVE_ATOMIC128_RO) {
189        return atomic16_read_ro(p);
190    }
191
192    /*
193     * We can only use cmpxchg to emulate a load if the page is writable.
194     * If the page is not writable, then assume the value is immutable
195     * and requires no locking.  This ignores the case of MAP_SHARED with
196     * another process, because the fallback start_exclusive solution
197     * provides no protection across processes.
198     *
199     * In system mode all guest pages are writable.  For user mode,
200     * we must take mmap_lock so that the query remains valid until
201     * the write is complete -- tests/tcg/multiarch/munmap-pthread.c
202     * is an example that can race.
203     */
204    WITH_MMAP_LOCK_GUARD() {
205#ifdef CONFIG_USER_ONLY
206        if (!page_check_range(h2g(p), 16, PAGE_WRITE_ORG)) {
207            return *p;
208        }
209#endif
210        if (HAVE_ATOMIC128_RW) {
211            return atomic16_read_rw(p);
212        }
213    }
214
215    /* Ultimate fallback: re-execute in serial context. */
216    trace_load_atom16_or_exit_fallback(ra);
217    cpu_loop_exit_atomic(cpu, ra);
218}
219
220/**
221 * load_atom_extract_al4x2:
222 * @pv: host address
223 *
224 * Load 4 bytes from @p, from two sequential atomic 4-byte loads.
225 */
226static uint32_t load_atom_extract_al4x2(void *pv)
227{
228    uintptr_t pi = (uintptr_t)pv;
229    int sh = (pi & 3) * 8;
230    uint32_t a, b;
231
232    pv = (void *)(pi & ~3);
233    a = load_atomic4(pv);
234    b = load_atomic4(pv + 4);
235
236    if (HOST_BIG_ENDIAN) {
237        return (a << sh) | (b >> (-sh & 31));
238    } else {
239        return (a >> sh) | (b << (-sh & 31));
240    }
241}
242
243/**
244 * load_atom_extract_al8x2:
245 * @pv: host address
246 *
247 * Load 8 bytes from @p, from two sequential atomic 8-byte loads.
248 */
249static uint64_t load_atom_extract_al8x2(void *pv)
250{
251    uintptr_t pi = (uintptr_t)pv;
252    int sh = (pi & 7) * 8;
253    uint64_t a, b;
254
255    pv = (void *)(pi & ~7);
256    a = load_atomic8(pv);
257    b = load_atomic8(pv + 8);
258
259    if (HOST_BIG_ENDIAN) {
260        return (a << sh) | (b >> (-sh & 63));
261    } else {
262        return (a >> sh) | (b << (-sh & 63));
263    }
264}
265
266/**
267 * load_atom_extract_al8_or_exit:
268 * @cpu: generic cpu state
269 * @ra: host unwind address
270 * @pv: host address
271 * @s: object size in bytes, @s <= 4.
272 *
273 * Atomically load @s bytes from @p, when p % s != 0, and [p, p+s-1] does
274 * not cross an 8-byte boundary.  This means that we can perform an atomic
275 * 8-byte load and extract.
276 * The value is returned in the low bits of a uint32_t.
277 */
278static uint32_t load_atom_extract_al8_or_exit(CPUState *cpu, uintptr_t ra,
279                                              void *pv, int s)
280{
281    uintptr_t pi = (uintptr_t)pv;
282    int o = pi & 7;
283    int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8;
284
285    pv = (void *)(pi & ~7);
286    return load_atomic8_or_exit(cpu, ra, pv) >> shr;
287}
288
289/**
290 * load_atom_extract_al16_or_exit:
291 * @cpu: generic cpu state
292 * @ra: host unwind address
293 * @p: host address
294 * @s: object size in bytes, @s <= 8.
295 *
296 * Atomically load @s bytes from @p, when p % 16 < 8
297 * and p % 16 + s > 8.  I.e. does not cross a 16-byte
298 * boundary, but *does* cross an 8-byte boundary.
299 * This is the slow version, so we must have eliminated
300 * any faster load_atom_extract_al8_or_exit case.
301 *
302 * If this is not possible, longjmp out to restart serially.
303 */
304static uint64_t load_atom_extract_al16_or_exit(CPUState *cpu, uintptr_t ra,
305                                               void *pv, int s)
306{
307    uintptr_t pi = (uintptr_t)pv;
308    int o = pi & 7;
309    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
310    Int128 r;
311
312    /*
313     * Note constraints above: p & 8 must be clear.
314     * Provoke SIGBUS if possible otherwise.
315     */
316    pv = (void *)(pi & ~7);
317    r = load_atomic16_or_exit(cpu, ra, pv);
318
319    r = int128_urshift(r, shr);
320    return int128_getlo(r);
321}
322
323/**
324 * load_atom_4_by_2:
325 * @pv: host address
326 *
327 * Load 4 bytes from @pv, with two 2-byte atomic loads.
328 */
329static inline uint32_t load_atom_4_by_2(void *pv)
330{
331    uint32_t a = load_atomic2(pv);
332    uint32_t b = load_atomic2(pv + 2);
333
334    if (HOST_BIG_ENDIAN) {
335        return (a << 16) | b;
336    } else {
337        return (b << 16) | a;
338    }
339}
340
341/**
342 * load_atom_8_by_2:
343 * @pv: host address
344 *
345 * Load 8 bytes from @pv, with four 2-byte atomic loads.
346 */
347static inline uint64_t load_atom_8_by_2(void *pv)
348{
349    uint32_t a = load_atom_4_by_2(pv);
350    uint32_t b = load_atom_4_by_2(pv + 4);
351
352    if (HOST_BIG_ENDIAN) {
353        return ((uint64_t)a << 32) | b;
354    } else {
355        return ((uint64_t)b << 32) | a;
356    }
357}
358
359/**
360 * load_atom_8_by_4:
361 * @pv: host address
362 *
363 * Load 8 bytes from @pv, with two 4-byte atomic loads.
364 */
365static inline uint64_t load_atom_8_by_4(void *pv)
366{
367    uint32_t a = load_atomic4(pv);
368    uint32_t b = load_atomic4(pv + 4);
369
370    if (HOST_BIG_ENDIAN) {
371        return ((uint64_t)a << 32) | b;
372    } else {
373        return ((uint64_t)b << 32) | a;
374    }
375}
376
377/**
378 * load_atom_8_by_8_or_4:
379 * @pv: host address
380 *
381 * Load 8 bytes from aligned @pv, with at least 4-byte atomicity.
382 */
383static inline uint64_t load_atom_8_by_8_or_4(void *pv)
384{
385    if (HAVE_al8_fast) {
386        return load_atomic8(pv);
387    } else {
388        return load_atom_8_by_4(pv);
389    }
390}
391
392/**
393 * load_atom_2:
394 * @p: host address
395 * @memop: the full memory op
396 *
397 * Load 2 bytes from @p, honoring the atomicity of @memop.
398 */
399static uint16_t load_atom_2(CPUState *cpu, uintptr_t ra,
400                            void *pv, MemOp memop)
401{
402    uintptr_t pi = (uintptr_t)pv;
403    int atmax;
404
405    if (likely((pi & 1) == 0)) {
406        return load_atomic2(pv);
407    }
408    if (HAVE_ATOMIC128_RO) {
409        intptr_t left_in_page = -(pi | TARGET_PAGE_MASK);
410        if (likely(left_in_page > 8)) {
411            return load_atom_extract_al16_or_al8(pv, 2);
412        }
413    }
414
415    atmax = required_atomicity(cpu, pi, memop);
416    switch (atmax) {
417    case MO_8:
418        return lduw_he_p(pv);
419    case MO_16:
420        /* The only case remaining is MO_ATOM_WITHIN16. */
421        if (!HAVE_al8_fast && (pi & 3) == 1) {
422            /* Big or little endian, we want the middle two bytes. */
423            return load_atomic4(pv - 1) >> 8;
424        }
425        if ((pi & 15) != 7) {
426            return load_atom_extract_al8_or_exit(cpu, ra, pv, 2);
427        }
428        return load_atom_extract_al16_or_exit(cpu, ra, pv, 2);
429    default:
430        g_assert_not_reached();
431    }
432}
433
434/**
435 * load_atom_4:
436 * @p: host address
437 * @memop: the full memory op
438 *
439 * Load 4 bytes from @p, honoring the atomicity of @memop.
440 */
441static uint32_t load_atom_4(CPUState *cpu, uintptr_t ra,
442                            void *pv, MemOp memop)
443{
444    uintptr_t pi = (uintptr_t)pv;
445    int atmax;
446
447    if (likely((pi & 3) == 0)) {
448        return load_atomic4(pv);
449    }
450    if (HAVE_ATOMIC128_RO) {
451        intptr_t left_in_page = -(pi | TARGET_PAGE_MASK);
452        if (likely(left_in_page > 8)) {
453            return load_atom_extract_al16_or_al8(pv, 4);
454        }
455    }
456
457    atmax = required_atomicity(cpu, pi, memop);
458    switch (atmax) {
459    case MO_8:
460    case MO_16:
461    case -MO_16:
462        /*
463         * For MO_ATOM_IFALIGN, this is more atomicity than required,
464         * but it's trivially supported on all hosts, better than 4
465         * individual byte loads (when the host requires alignment),
466         * and overlaps with the MO_ATOM_SUBALIGN case of p % 2 == 0.
467         */
468        return load_atom_extract_al4x2(pv);
469    case MO_32:
470        if (!(pi & 4)) {
471            return load_atom_extract_al8_or_exit(cpu, ra, pv, 4);
472        }
473        return load_atom_extract_al16_or_exit(cpu, ra, pv, 4);
474    default:
475        g_assert_not_reached();
476    }
477}
478
479/**
480 * load_atom_8:
481 * @p: host address
482 * @memop: the full memory op
483 *
484 * Load 8 bytes from @p, honoring the atomicity of @memop.
485 */
486static uint64_t load_atom_8(CPUState *cpu, uintptr_t ra,
487                            void *pv, MemOp memop)
488{
489    uintptr_t pi = (uintptr_t)pv;
490    int atmax;
491
492    /*
493     * If the host does not support 8-byte atomics, wait until we have
494     * examined the atomicity parameters below.
495     */
496    if (HAVE_al8 && likely((pi & 7) == 0)) {
497        return load_atomic8(pv);
498    }
499    if (HAVE_ATOMIC128_RO) {
500        return load_atom_extract_al16_or_al8(pv, 8);
501    }
502
503    atmax = required_atomicity(cpu, pi, memop);
504    if (atmax == MO_64) {
505        if (!HAVE_al8 && (pi & 7) == 0) {
506            load_atomic8_or_exit(cpu, ra, pv);
507        }
508        return load_atom_extract_al16_or_exit(cpu, ra, pv, 8);
509    }
510    if (HAVE_al8_fast) {
511        return load_atom_extract_al8x2(pv);
512    }
513    switch (atmax) {
514    case MO_8:
515        return ldq_he_p(pv);
516    case MO_16:
517        return load_atom_8_by_2(pv);
518    case MO_32:
519        return load_atom_8_by_4(pv);
520    case -MO_32:
521        if (HAVE_al8) {
522            return load_atom_extract_al8x2(pv);
523        }
524        trace_load_atom8_fallback(memop, ra);
525        cpu_loop_exit_atomic(cpu, ra);
526    default:
527        g_assert_not_reached();
528    }
529}
530
531/**
532 * load_atom_16:
533 * @p: host address
534 * @memop: the full memory op
535 *
536 * Load 16 bytes from @p, honoring the atomicity of @memop.
537 */
538static Int128 load_atom_16(CPUState *cpu, uintptr_t ra,
539                           void *pv, MemOp memop)
540{
541    uintptr_t pi = (uintptr_t)pv;
542    int atmax;
543    Int128 r;
544    uint64_t a, b;
545
546    /*
547     * If the host does not support 16-byte atomics, wait until we have
548     * examined the atomicity parameters below.
549     */
550    if (HAVE_ATOMIC128_RO && likely((pi & 15) == 0)) {
551        return atomic16_read_ro(pv);
552    }
553
554    atmax = required_atomicity(cpu, pi, memop);
555    switch (atmax) {
556    case MO_8:
557        memcpy(&r, pv, 16);
558        return r;
559    case MO_16:
560        a = load_atom_8_by_2(pv);
561        b = load_atom_8_by_2(pv + 8);
562        break;
563    case MO_32:
564        a = load_atom_8_by_4(pv);
565        b = load_atom_8_by_4(pv + 8);
566        break;
567    case MO_64:
568        if (!HAVE_al8) {
569            trace_load_atom16_fallback(memop, ra);
570            cpu_loop_exit_atomic(cpu, ra);
571        }
572        a = load_atomic8(pv);
573        b = load_atomic8(pv + 8);
574        break;
575    case -MO_64:
576        if (!HAVE_al8) {
577            trace_load_atom16_fallback(memop, ra);
578            cpu_loop_exit_atomic(cpu, ra);
579        }
580        a = load_atom_extract_al8x2(pv);
581        b = load_atom_extract_al8x2(pv + 8);
582        break;
583    case MO_128:
584        return load_atomic16_or_exit(cpu, ra, pv);
585    default:
586        g_assert_not_reached();
587    }
588    return int128_make128(HOST_BIG_ENDIAN ? b : a, HOST_BIG_ENDIAN ? a : b);
589}
590
591/**
592 * store_atomic2:
593 * @pv: host address
594 * @val: value to store
595 *
596 * Atomically store 2 aligned bytes to @pv.
597 */
598static inline void store_atomic2(void *pv, uint16_t val)
599{
600    uint16_t *p = __builtin_assume_aligned(pv, 2);
601    qatomic_set(p, val);
602}
603
604/**
605 * store_atomic4:
606 * @pv: host address
607 * @val: value to store
608 *
609 * Atomically store 4 aligned bytes to @pv.
610 */
611static inline void store_atomic4(void *pv, uint32_t val)
612{
613    uint32_t *p = __builtin_assume_aligned(pv, 4);
614    qatomic_set(p, val);
615}
616
617/**
618 * store_atomic8:
619 * @pv: host address
620 * @val: value to store
621 *
622 * Atomically store 8 aligned bytes to @pv.
623 */
624static inline void store_atomic8(void *pv, uint64_t val)
625{
626    uint64_t *p = __builtin_assume_aligned(pv, 8);
627
628    qemu_build_assert(HAVE_al8);
629    qatomic_set__nocheck(p, val);
630}
631
632/**
633 * store_atom_4x2
634 */
635static inline void store_atom_4_by_2(void *pv, uint32_t val)
636{
637    store_atomic2(pv, val >> (HOST_BIG_ENDIAN ? 16 : 0));
638    store_atomic2(pv + 2, val >> (HOST_BIG_ENDIAN ? 0 : 16));
639}
640
641/**
642 * store_atom_8_by_2
643 */
644static inline void store_atom_8_by_2(void *pv, uint64_t val)
645{
646    store_atom_4_by_2(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
647    store_atom_4_by_2(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
648}
649
650/**
651 * store_atom_8_by_4
652 */
653static inline void store_atom_8_by_4(void *pv, uint64_t val)
654{
655    store_atomic4(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
656    store_atomic4(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
657}
658
659/**
660 * store_atom_insert_al4:
661 * @p: host address
662 * @val: shifted value to store
663 * @msk: mask for value to store
664 *
665 * Atomically store @val to @p, masked by @msk.
666 */
667static void store_atom_insert_al4(uint32_t *p, uint32_t val, uint32_t msk)
668{
669    uint32_t old, new;
670
671    p = __builtin_assume_aligned(p, 4);
672    old = qatomic_read(p);
673    do {
674        new = (old & ~msk) | val;
675    } while (!__atomic_compare_exchange_n(p, &old, new, true,
676                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
677}
678
679/**
680 * store_atom_insert_al8:
681 * @p: host address
682 * @val: shifted value to store
683 * @msk: mask for value to store
684 *
685 * Atomically store @val to @p masked by @msk.
686 */
687static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
688{
689    uint64_t old, new;
690
691    qemu_build_assert(HAVE_al8);
692    p = __builtin_assume_aligned(p, 8);
693    old = qatomic_read__nocheck(p);
694    do {
695        new = (old & ~msk) | val;
696    } while (!__atomic_compare_exchange_n(p, &old, new, true,
697                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
698}
699
700/**
701 * store_bytes_leN:
702 * @pv: host address
703 * @size: number of bytes to store
704 * @val_le: data to store
705 *
706 * Store @size bytes at @p.  The bytes to store are extracted in little-endian order
707 * from @val_le; return the bytes of @val_le beyond @size that have not been stored.
708 */
709static uint64_t store_bytes_leN(void *pv, int size, uint64_t val_le)
710{
711    uint8_t *p = pv;
712    for (int i = 0; i < size; i++, val_le >>= 8) {
713        p[i] = val_le;
714    }
715    return val_le;
716}
717
718/**
719 * store_parts_leN
720 * @pv: host address
721 * @size: number of bytes to store
722 * @val_le: data to store
723 *
724 * As store_bytes_leN, but atomically on each aligned part.
725 */
726G_GNUC_UNUSED
727static uint64_t store_parts_leN(void *pv, int size, uint64_t val_le)
728{
729    do {
730        int n;
731
732        /* Find minimum of alignment and size */
733        switch (((uintptr_t)pv | size) & 7) {
734        case 4:
735            store_atomic4(pv, le32_to_cpu(val_le));
736            val_le >>= 32;
737            n = 4;
738            break;
739        case 2:
740        case 6:
741            store_atomic2(pv, le16_to_cpu(val_le));
742            val_le >>= 16;
743            n = 2;
744            break;
745        default:
746            *(uint8_t *)pv = val_le;
747            val_le >>= 8;
748            n = 1;
749            break;
750        case 0:
751            g_assert_not_reached();
752        }
753        pv += n;
754        size -= n;
755    } while (size != 0);
756
757    return val_le;
758}
759
760/**
761 * store_whole_le4
762 * @pv: host address
763 * @size: number of bytes to store
764 * @val_le: data to store
765 *
766 * As store_bytes_leN, but atomically as a whole.
767 * Four aligned bytes are guaranteed to cover the store.
768 */
769static uint64_t store_whole_le4(void *pv, int size, uint64_t val_le)
770{
771    int sz = size * 8;
772    int o = (uintptr_t)pv & 3;
773    int sh = o * 8;
774    uint32_t m = MAKE_64BIT_MASK(0, sz);
775    uint32_t v;
776
777    if (HOST_BIG_ENDIAN) {
778        v = bswap32(val_le) >> sh;
779        m = bswap32(m) >> sh;
780    } else {
781        v = val_le << sh;
782        m <<= sh;
783    }
784    store_atom_insert_al4(pv - o, v, m);
785    return val_le >> sz;
786}
787
788/**
789 * store_whole_le8
790 * @pv: host address
791 * @size: number of bytes to store
792 * @val_le: data to store
793 *
794 * As store_bytes_leN, but atomically as a whole.
795 * Eight aligned bytes are guaranteed to cover the store.
796 */
797static uint64_t store_whole_le8(void *pv, int size, uint64_t val_le)
798{
799    int sz = size * 8;
800    int o = (uintptr_t)pv & 7;
801    int sh = o * 8;
802    uint64_t m = MAKE_64BIT_MASK(0, sz);
803    uint64_t v;
804
805    qemu_build_assert(HAVE_al8);
806    if (HOST_BIG_ENDIAN) {
807        v = bswap64(val_le) >> sh;
808        m = bswap64(m) >> sh;
809    } else {
810        v = val_le << sh;
811        m <<= sh;
812    }
813    store_atom_insert_al8(pv - o, v, m);
814    return val_le >> sz;
815}
816
817/**
818 * store_whole_le16
819 * @pv: host address
820 * @size: number of bytes to store
821 * @val_le: data to store
822 *
823 * As store_bytes_leN, but atomically as a whole.
824 * 16 aligned bytes are guaranteed to cover the store.
825 */
826static uint64_t store_whole_le16(void *pv, int size, Int128 val_le)
827{
828    int sz = size * 8;
829    int o = (uintptr_t)pv & 15;
830    int sh = o * 8;
831    Int128 m, v;
832
833    qemu_build_assert(HAVE_CMPXCHG128);
834
835    /* Like MAKE_64BIT_MASK(0, sz), but larger. */
836    if (sz <= 64) {
837        m = int128_make64(MAKE_64BIT_MASK(0, sz));
838    } else {
839        m = int128_make128(-1, MAKE_64BIT_MASK(0, sz - 64));
840    }
841
842    if (HOST_BIG_ENDIAN) {
843        v = int128_urshift(bswap128(val_le), sh);
844        m = int128_urshift(bswap128(m), sh);
845    } else {
846        v = int128_lshift(val_le, sh);
847        m = int128_lshift(m, sh);
848    }
849    store_atom_insert_al16(pv - o, v, m);
850
851    if (sz <= 64) {
852        return 0;
853    }
854    return int128_gethi(val_le) >> (sz - 64);
855}
856
857/**
858 * store_atom_2:
859 * @p: host address
860 * @val: the value to store
861 * @memop: the full memory op
862 *
863 * Store 2 bytes to @p, honoring the atomicity of @memop.
864 */
865static void store_atom_2(CPUState *cpu, uintptr_t ra,
866                         void *pv, MemOp memop, uint16_t val)
867{
868    uintptr_t pi = (uintptr_t)pv;
869    int atmax;
870
871    if (likely((pi & 1) == 0)) {
872        store_atomic2(pv, val);
873        return;
874    }
875
876    atmax = required_atomicity(cpu, pi, memop);
877    if (atmax == MO_8) {
878        stw_he_p(pv, val);
879        return;
880    }
881
882    /*
883     * The only case remaining is MO_ATOM_WITHIN16.
884     * Big or little endian, we want the middle two bytes in each test.
885     */
886    if ((pi & 3) == 1) {
887        store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16));
888        return;
889    } else if ((pi & 7) == 3) {
890        if (HAVE_al8) {
891            store_atom_insert_al8(pv - 3, (uint64_t)val << 24, MAKE_64BIT_MASK(24, 16));
892            return;
893        }
894    } else if ((pi & 15) == 7) {
895        if (HAVE_CMPXCHG128) {
896            Int128 v = int128_lshift(int128_make64(val), 56);
897            Int128 m = int128_lshift(int128_make64(0xffff), 56);
898            store_atom_insert_al16(pv - 7, v, m);
899            return;
900        }
901    } else {
902        g_assert_not_reached();
903    }
904
905    trace_store_atom2_fallback(memop, ra);
906    cpu_loop_exit_atomic(cpu, ra);
907}
908
909/**
910 * store_atom_4:
911 * @p: host address
912 * @val: the value to store
913 * @memop: the full memory op
914 *
915 * Store 4 bytes to @p, honoring the atomicity of @memop.
916 */
917static void store_atom_4(CPUState *cpu, uintptr_t ra,
918                         void *pv, MemOp memop, uint32_t val)
919{
920    uintptr_t pi = (uintptr_t)pv;
921    int atmax;
922
923    if (likely((pi & 3) == 0)) {
924        store_atomic4(pv, val);
925        return;
926    }
927
928    atmax = required_atomicity(cpu, pi, memop);
929    switch (atmax) {
930    case MO_8:
931        stl_he_p(pv, val);
932        return;
933    case MO_16:
934        store_atom_4_by_2(pv, val);
935        return;
936    case -MO_16:
937        {
938            uint32_t val_le = cpu_to_le32(val);
939            int s2 = pi & 3;
940            int s1 = 4 - s2;
941
942            switch (s2) {
943            case 1:
944                val_le = store_whole_le4(pv, s1, val_le);
945                *(uint8_t *)(pv + 3) = val_le;
946                break;
947            case 3:
948                *(uint8_t *)pv = val_le;
949                store_whole_le4(pv + 1, s2, val_le >> 8);
950                break;
951            case 0: /* aligned */
952            case 2: /* atmax MO_16 */
953            default:
954                g_assert_not_reached();
955            }
956        }
957        return;
958    case MO_32:
959        if ((pi & 7) < 4) {
960            if (HAVE_al8) {
961                store_whole_le8(pv, 4, cpu_to_le32(val));
962                return;
963            }
964        } else {
965            if (HAVE_CMPXCHG128) {
966                store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val)));
967                return;
968            }
969        }
970        trace_store_atom4_fallback(memop, ra);
971        cpu_loop_exit_atomic(cpu, ra);
972    default:
973        g_assert_not_reached();
974    }
975}
976
977/**
978 * store_atom_8:
979 * @p: host address
980 * @val: the value to store
981 * @memop: the full memory op
982 *
983 * Store 8 bytes to @p, honoring the atomicity of @memop.
984 */
985static void store_atom_8(CPUState *cpu, uintptr_t ra,
986                         void *pv, MemOp memop, uint64_t val)
987{
988    uintptr_t pi = (uintptr_t)pv;
989    int atmax;
990
991    if (HAVE_al8 && likely((pi & 7) == 0)) {
992        store_atomic8(pv, val);
993        return;
994    }
995
996    atmax = required_atomicity(cpu, pi, memop);
997    switch (atmax) {
998    case MO_8:
999        stq_he_p(pv, val);
1000        return;
1001    case MO_16:
1002        store_atom_8_by_2(pv, val);
1003        return;
1004    case MO_32:
1005        store_atom_8_by_4(pv, val);
1006        return;
1007    case -MO_32:
1008        if (HAVE_al8) {
1009            uint64_t val_le = cpu_to_le64(val);
1010            int s2 = pi & 7;
1011            int s1 = 8 - s2;
1012
1013            switch (s2) {
1014            case 1 ... 3:
1015                val_le = store_whole_le8(pv, s1, val_le);
1016                store_bytes_leN(pv + s1, s2, val_le);
1017                break;
1018            case 5 ... 7:
1019                val_le = store_bytes_leN(pv, s1, val_le);
1020                store_whole_le8(pv + s1, s2, val_le);
1021                break;
1022            case 0: /* aligned */
1023            case 4: /* atmax MO_32 */
1024            default:
1025                g_assert_not_reached();
1026            }
1027            return;
1028        }
1029        break;
1030    case MO_64:
1031        if (HAVE_CMPXCHG128) {
1032            store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val)));
1033            return;
1034        }
1035        break;
1036    default:
1037        g_assert_not_reached();
1038    }
1039    trace_store_atom8_fallback(memop, ra);
1040    cpu_loop_exit_atomic(cpu, ra);
1041}
1042
1043/**
1044 * store_atom_16:
1045 * @p: host address
1046 * @val: the value to store
1047 * @memop: the full memory op
1048 *
1049 * Store 16 bytes to @p, honoring the atomicity of @memop.
1050 */
1051static void store_atom_16(CPUState *cpu, uintptr_t ra,
1052                          void *pv, MemOp memop, Int128 val)
1053{
1054    uintptr_t pi = (uintptr_t)pv;
1055    uint64_t a, b;
1056    int atmax;
1057
1058    if (HAVE_ATOMIC128_RW && likely((pi & 15) == 0)) {
1059        atomic16_set(pv, val);
1060        return;
1061    }
1062
1063    atmax = required_atomicity(cpu, pi, memop);
1064
1065    a = HOST_BIG_ENDIAN ? int128_gethi(val) : int128_getlo(val);
1066    b = HOST_BIG_ENDIAN ? int128_getlo(val) : int128_gethi(val);
1067    switch (atmax) {
1068    case MO_8:
1069        memcpy(pv, &val, 16);
1070        return;
1071    case MO_16:
1072        store_atom_8_by_2(pv, a);
1073        store_atom_8_by_2(pv + 8, b);
1074        return;
1075    case MO_32:
1076        store_atom_8_by_4(pv, a);
1077        store_atom_8_by_4(pv + 8, b);
1078        return;
1079    case MO_64:
1080        if (HAVE_al8) {
1081            store_atomic8(pv, a);
1082            store_atomic8(pv + 8, b);
1083            return;
1084        }
1085        break;
1086    case -MO_64:
1087        if (HAVE_CMPXCHG128) {
1088            uint64_t val_le;
1089            int s2 = pi & 15;
1090            int s1 = 16 - s2;
1091
1092            if (HOST_BIG_ENDIAN) {
1093                val = bswap128(val);
1094            }
1095            switch (s2) {
1096            case 1 ... 7:
1097                val_le = store_whole_le16(pv, s1, val);
1098                store_bytes_leN(pv + s1, s2, val_le);
1099                break;
1100            case 9 ... 15:
1101                store_bytes_leN(pv, s1, int128_getlo(val));
1102                val = int128_urshift(val, s1 * 8);
1103                store_whole_le16(pv + s1, s2, val);
1104                break;
1105            case 0: /* aligned */
1106            case 8: /* atmax MO_64 */
1107            default:
1108                g_assert_not_reached();
1109            }
1110            return;
1111        }
1112        break;
1113    case MO_128:
1114        break;
1115    default:
1116        g_assert_not_reached();
1117    }
1118    trace_store_atom16_fallback(memop, ra);
1119    cpu_loop_exit_atomic(cpu, ra);
1120}
1121