xref: /openbmc/qemu/accel/tcg/ldst_atomicity.c.inc (revision 623d7e3551a6fc5693c06ea938c60fe281b52e27)
1/*
2 * Routines common to user and system emulation of load/store.
3 *
4 *  Copyright (c) 2022 Linaro, Ltd.
5 *
6 * SPDX-License-Identifier: GPL-2.0-or-later
7 *
8 * This work is licensed under the terms of the GNU GPL, version 2 or later.
9 * See the COPYING file in the top-level directory.
10 */
11
12#include "host/load-extract-al16-al8.h"
13#include "host/store-insert-al16.h"
14
/*
 * HAVE_al8 mirrors CONFIG_ATOMIC64; the 8-byte atomic helpers in this
 * file assert it at build time.  HAVE_al8_fast is true when
 * ATOMIC_REG_SIZE is at least 8.
 */
#if defined(CONFIG_ATOMIC64)
# define HAVE_al8          true
#else
# define HAVE_al8          false
#endif
#define HAVE_al8_fast      (ATOMIC_REG_SIZE >= 8)
21
22/**
23 * required_atomicity:
24 *
25 * Return the lg2 bytes of atomicity required by @memop for @p.
26 * If the operation must be split into two operations to be
27 * examined separately for atomicity, return -lg2.
28 */
29static int required_atomicity(CPUArchState *env, uintptr_t p, MemOp memop)
30{
31    MemOp atom = memop & MO_ATOM_MASK;
32    MemOp size = memop & MO_SIZE;
33    MemOp half = size ? size - 1 : 0;
34    unsigned tmp;
35    int atmax;
36
37    switch (atom) {
38    case MO_ATOM_NONE:
39        atmax = MO_8;
40        break;
41
42    case MO_ATOM_IFALIGN_PAIR:
43        size = half;
44        /* fall through */
45
46    case MO_ATOM_IFALIGN:
47        tmp = (1 << size) - 1;
48        atmax = p & tmp ? MO_8 : size;
49        break;
50
51    case MO_ATOM_WITHIN16:
52        tmp = p & 15;
53        atmax = (tmp + (1 << size) <= 16 ? size : MO_8);
54        break;
55
56    case MO_ATOM_WITHIN16_PAIR:
57        tmp = p & 15;
58        if (tmp + (1 << size) <= 16) {
59            atmax = size;
60        } else if (tmp + (1 << half) == 16) {
61            /*
62             * The pair exactly straddles the boundary.
63             * Both halves are naturally aligned and atomic.
64             */
65            atmax = half;
66        } else {
67            /*
68             * One of the pair crosses the boundary, and is non-atomic.
69             * The other of the pair does not cross, and is atomic.
70             */
71            atmax = -half;
72        }
73        break;
74
75    case MO_ATOM_SUBALIGN:
76        /*
77         * Examine the alignment of p to determine if there are subobjects
78         * that must be aligned.  Note that we only really need ctz4() --
79         * any more sigificant bits are discarded by the immediately
80         * following comparison.
81         */
82        tmp = ctz32(p);
83        atmax = MIN(size, tmp);
84        break;
85
86    default:
87        g_assert_not_reached();
88    }
89
90    /*
91     * Here we have the architectural atomicity of the operation.
92     * However, when executing in a serial context, we need no extra
93     * host atomicity in order to avoid racing.  This reduction
94     * avoids looping with cpu_loop_exit_atomic.
95     */
96    if (cpu_in_serial_context(env_cpu(env))) {
97        return MO_8;
98    }
99    return atmax;
100}
101
102/**
103 * load_atomic2:
104 * @pv: host address
105 *
106 * Atomically load 2 aligned bytes from @pv.
107 */
108static inline uint16_t load_atomic2(void *pv)
109{
110    uint16_t *p = __builtin_assume_aligned(pv, 2);
111    return qatomic_read(p);
112}
113
114/**
115 * load_atomic4:
116 * @pv: host address
117 *
118 * Atomically load 4 aligned bytes from @pv.
119 */
120static inline uint32_t load_atomic4(void *pv)
121{
122    uint32_t *p = __builtin_assume_aligned(pv, 4);
123    return qatomic_read(p);
124}
125
126/**
127 * load_atomic8:
128 * @pv: host address
129 *
130 * Atomically load 8 aligned bytes from @pv.
131 */
132static inline uint64_t load_atomic8(void *pv)
133{
134    uint64_t *p = __builtin_assume_aligned(pv, 8);
135
136    qemu_build_assert(HAVE_al8);
137    return qatomic_read__nocheck(p);
138}
139
140/**
141 * load_atomic8_or_exit:
142 * @env: cpu context
143 * @ra: host unwind address
144 * @pv: host address
145 *
146 * Atomically load 8 aligned bytes from @pv.
147 * If this is not possible, longjmp out to restart serially.
148 */
149static uint64_t load_atomic8_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
150{
151    if (HAVE_al8) {
152        return load_atomic8(pv);
153    }
154
155#ifdef CONFIG_USER_ONLY
156    /*
157     * If the page is not writable, then assume the value is immutable
158     * and requires no locking.  This ignores the case of MAP_SHARED with
159     * another process, because the fallback start_exclusive solution
160     * provides no protection across processes.
161     */
162    if (!page_check_range(h2g(pv), 8, PAGE_WRITE_ORG)) {
163        uint64_t *p = __builtin_assume_aligned(pv, 8);
164        return *p;
165    }
166#endif
167
168    /* Ultimate fallback: re-execute in serial context. */
169    cpu_loop_exit_atomic(env_cpu(env), ra);
170}
171
172/**
173 * load_atomic16_or_exit:
174 * @env: cpu context
175 * @ra: host unwind address
176 * @pv: host address
177 *
178 * Atomically load 16 aligned bytes from @pv.
179 * If this is not possible, longjmp out to restart serially.
180 */
181static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
182{
183    Int128 *p = __builtin_assume_aligned(pv, 16);
184
185    if (HAVE_ATOMIC128_RO) {
186        return atomic16_read_ro(p);
187    }
188
189#ifdef CONFIG_USER_ONLY
190    /*
191     * We can only use cmpxchg to emulate a load if the page is writable.
192     * If the page is not writable, then assume the value is immutable
193     * and requires no locking.  This ignores the case of MAP_SHARED with
194     * another process, because the fallback start_exclusive solution
195     * provides no protection across processes.
196     */
197    if (!page_check_range(h2g(p), 16, PAGE_WRITE_ORG)) {
198        return *p;
199    }
200#endif
201
202    /*
203     * In system mode all guest pages are writable, and for user-only
204     * we have just checked writability.  Try cmpxchg.
205     */
206    if (HAVE_ATOMIC128_RW) {
207        return atomic16_read_rw(p);
208    }
209
210    /* Ultimate fallback: re-execute in serial context. */
211    cpu_loop_exit_atomic(env_cpu(env), ra);
212}
213
214/**
215 * load_atom_extract_al4x2:
216 * @pv: host address
217 *
218 * Load 4 bytes from @p, from two sequential atomic 4-byte loads.
219 */
220static uint32_t load_atom_extract_al4x2(void *pv)
221{
222    uintptr_t pi = (uintptr_t)pv;
223    int sh = (pi & 3) * 8;
224    uint32_t a, b;
225
226    pv = (void *)(pi & ~3);
227    a = load_atomic4(pv);
228    b = load_atomic4(pv + 4);
229
230    if (HOST_BIG_ENDIAN) {
231        return (a << sh) | (b >> (-sh & 31));
232    } else {
233        return (a >> sh) | (b << (-sh & 31));
234    }
235}
236
237/**
238 * load_atom_extract_al8x2:
239 * @pv: host address
240 *
241 * Load 8 bytes from @p, from two sequential atomic 8-byte loads.
242 */
243static uint64_t load_atom_extract_al8x2(void *pv)
244{
245    uintptr_t pi = (uintptr_t)pv;
246    int sh = (pi & 7) * 8;
247    uint64_t a, b;
248
249    pv = (void *)(pi & ~7);
250    a = load_atomic8(pv);
251    b = load_atomic8(pv + 8);
252
253    if (HOST_BIG_ENDIAN) {
254        return (a << sh) | (b >> (-sh & 63));
255    } else {
256        return (a >> sh) | (b << (-sh & 63));
257    }
258}
259
260/**
261 * load_atom_extract_al8_or_exit:
262 * @env: cpu context
263 * @ra: host unwind address
264 * @pv: host address
265 * @s: object size in bytes, @s <= 4.
266 *
267 * Atomically load @s bytes from @p, when p % s != 0, and [p, p+s-1] does
268 * not cross an 8-byte boundary.  This means that we can perform an atomic
269 * 8-byte load and extract.
270 * The value is returned in the low bits of a uint32_t.
271 */
272static uint32_t load_atom_extract_al8_or_exit(CPUArchState *env, uintptr_t ra,
273                                              void *pv, int s)
274{
275    uintptr_t pi = (uintptr_t)pv;
276    int o = pi & 7;
277    int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8;
278
279    pv = (void *)(pi & ~7);
280    return load_atomic8_or_exit(env, ra, pv) >> shr;
281}
282
283/**
284 * load_atom_extract_al16_or_exit:
285 * @env: cpu context
286 * @ra: host unwind address
287 * @p: host address
288 * @s: object size in bytes, @s <= 8.
289 *
290 * Atomically load @s bytes from @p, when p % 16 < 8
291 * and p % 16 + s > 8.  I.e. does not cross a 16-byte
292 * boundary, but *does* cross an 8-byte boundary.
293 * This is the slow version, so we must have eliminated
294 * any faster load_atom_extract_al8_or_exit case.
295 *
296 * If this is not possible, longjmp out to restart serially.
297 */
298static uint64_t load_atom_extract_al16_or_exit(CPUArchState *env, uintptr_t ra,
299                                               void *pv, int s)
300{
301    uintptr_t pi = (uintptr_t)pv;
302    int o = pi & 7;
303    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
304    Int128 r;
305
306    /*
307     * Note constraints above: p & 8 must be clear.
308     * Provoke SIGBUS if possible otherwise.
309     */
310    pv = (void *)(pi & ~7);
311    r = load_atomic16_or_exit(env, ra, pv);
312
313    r = int128_urshift(r, shr);
314    return int128_getlo(r);
315}
316
317/**
318 * load_atom_4_by_2:
319 * @pv: host address
320 *
321 * Load 4 bytes from @pv, with two 2-byte atomic loads.
322 */
323static inline uint32_t load_atom_4_by_2(void *pv)
324{
325    uint32_t a = load_atomic2(pv);
326    uint32_t b = load_atomic2(pv + 2);
327
328    if (HOST_BIG_ENDIAN) {
329        return (a << 16) | b;
330    } else {
331        return (b << 16) | a;
332    }
333}
334
335/**
336 * load_atom_8_by_2:
337 * @pv: host address
338 *
339 * Load 8 bytes from @pv, with four 2-byte atomic loads.
340 */
341static inline uint64_t load_atom_8_by_2(void *pv)
342{
343    uint32_t a = load_atom_4_by_2(pv);
344    uint32_t b = load_atom_4_by_2(pv + 4);
345
346    if (HOST_BIG_ENDIAN) {
347        return ((uint64_t)a << 32) | b;
348    } else {
349        return ((uint64_t)b << 32) | a;
350    }
351}
352
353/**
354 * load_atom_8_by_4:
355 * @pv: host address
356 *
357 * Load 8 bytes from @pv, with two 4-byte atomic loads.
358 */
359static inline uint64_t load_atom_8_by_4(void *pv)
360{
361    uint32_t a = load_atomic4(pv);
362    uint32_t b = load_atomic4(pv + 4);
363
364    if (HOST_BIG_ENDIAN) {
365        return ((uint64_t)a << 32) | b;
366    } else {
367        return ((uint64_t)b << 32) | a;
368    }
369}
370
371/**
372 * load_atom_8_by_8_or_4:
373 * @pv: host address
374 *
375 * Load 8 bytes from aligned @pv, with at least 4-byte atomicity.
376 */
377static inline uint64_t load_atom_8_by_8_or_4(void *pv)
378{
379    if (HAVE_al8_fast) {
380        return load_atomic8(pv);
381    } else {
382        return load_atom_8_by_4(pv);
383    }
384}
385
386/**
387 * load_atom_2:
388 * @p: host address
389 * @memop: the full memory op
390 *
391 * Load 2 bytes from @p, honoring the atomicity of @memop.
392 */
393static uint16_t load_atom_2(CPUArchState *env, uintptr_t ra,
394                            void *pv, MemOp memop)
395{
396    uintptr_t pi = (uintptr_t)pv;
397    int atmax;
398
399    if (likely((pi & 1) == 0)) {
400        return load_atomic2(pv);
401    }
402    if (HAVE_ATOMIC128_RO) {
403        return load_atom_extract_al16_or_al8(pv, 2);
404    }
405
406    atmax = required_atomicity(env, pi, memop);
407    switch (atmax) {
408    case MO_8:
409        return lduw_he_p(pv);
410    case MO_16:
411        /* The only case remaining is MO_ATOM_WITHIN16. */
412        if (!HAVE_al8_fast && (pi & 3) == 1) {
413            /* Big or little endian, we want the middle two bytes. */
414            return load_atomic4(pv - 1) >> 8;
415        }
416        if ((pi & 15) != 7) {
417            return load_atom_extract_al8_or_exit(env, ra, pv, 2);
418        }
419        return load_atom_extract_al16_or_exit(env, ra, pv, 2);
420    default:
421        g_assert_not_reached();
422    }
423}
424
425/**
426 * load_atom_4:
427 * @p: host address
428 * @memop: the full memory op
429 *
430 * Load 4 bytes from @p, honoring the atomicity of @memop.
431 */
432static uint32_t load_atom_4(CPUArchState *env, uintptr_t ra,
433                            void *pv, MemOp memop)
434{
435    uintptr_t pi = (uintptr_t)pv;
436    int atmax;
437
438    if (likely((pi & 3) == 0)) {
439        return load_atomic4(pv);
440    }
441    if (HAVE_ATOMIC128_RO) {
442        return load_atom_extract_al16_or_al8(pv, 4);
443    }
444
445    atmax = required_atomicity(env, pi, memop);
446    switch (atmax) {
447    case MO_8:
448    case MO_16:
449    case -MO_16:
450        /*
451         * For MO_ATOM_IFALIGN, this is more atomicity than required,
452         * but it's trivially supported on all hosts, better than 4
453         * individual byte loads (when the host requires alignment),
454         * and overlaps with the MO_ATOM_SUBALIGN case of p % 2 == 0.
455         */
456        return load_atom_extract_al4x2(pv);
457    case MO_32:
458        if (!(pi & 4)) {
459            return load_atom_extract_al8_or_exit(env, ra, pv, 4);
460        }
461        return load_atom_extract_al16_or_exit(env, ra, pv, 4);
462    default:
463        g_assert_not_reached();
464    }
465}
466
467/**
468 * load_atom_8:
469 * @p: host address
470 * @memop: the full memory op
471 *
472 * Load 8 bytes from @p, honoring the atomicity of @memop.
473 */
474static uint64_t load_atom_8(CPUArchState *env, uintptr_t ra,
475                            void *pv, MemOp memop)
476{
477    uintptr_t pi = (uintptr_t)pv;
478    int atmax;
479
480    /*
481     * If the host does not support 8-byte atomics, wait until we have
482     * examined the atomicity parameters below.
483     */
484    if (HAVE_al8 && likely((pi & 7) == 0)) {
485        return load_atomic8(pv);
486    }
487    if (HAVE_ATOMIC128_RO) {
488        return load_atom_extract_al16_or_al8(pv, 8);
489    }
490
491    atmax = required_atomicity(env, pi, memop);
492    if (atmax == MO_64) {
493        if (!HAVE_al8 && (pi & 7) == 0) {
494            load_atomic8_or_exit(env, ra, pv);
495        }
496        return load_atom_extract_al16_or_exit(env, ra, pv, 8);
497    }
498    if (HAVE_al8_fast) {
499        return load_atom_extract_al8x2(pv);
500    }
501    switch (atmax) {
502    case MO_8:
503        return ldq_he_p(pv);
504    case MO_16:
505        return load_atom_8_by_2(pv);
506    case MO_32:
507        return load_atom_8_by_4(pv);
508    case -MO_32:
509        if (HAVE_al8) {
510            return load_atom_extract_al8x2(pv);
511        }
512        cpu_loop_exit_atomic(env_cpu(env), ra);
513    default:
514        g_assert_not_reached();
515    }
516}
517
518/**
519 * load_atom_16:
520 * @p: host address
521 * @memop: the full memory op
522 *
523 * Load 16 bytes from @p, honoring the atomicity of @memop.
524 */
525static Int128 load_atom_16(CPUArchState *env, uintptr_t ra,
526                           void *pv, MemOp memop)
527{
528    uintptr_t pi = (uintptr_t)pv;
529    int atmax;
530    Int128 r;
531    uint64_t a, b;
532
533    /*
534     * If the host does not support 16-byte atomics, wait until we have
535     * examined the atomicity parameters below.
536     */
537    if (HAVE_ATOMIC128_RO && likely((pi & 15) == 0)) {
538        return atomic16_read_ro(pv);
539    }
540
541    atmax = required_atomicity(env, pi, memop);
542    switch (atmax) {
543    case MO_8:
544        memcpy(&r, pv, 16);
545        return r;
546    case MO_16:
547        a = load_atom_8_by_2(pv);
548        b = load_atom_8_by_2(pv + 8);
549        break;
550    case MO_32:
551        a = load_atom_8_by_4(pv);
552        b = load_atom_8_by_4(pv + 8);
553        break;
554    case MO_64:
555        if (!HAVE_al8) {
556            cpu_loop_exit_atomic(env_cpu(env), ra);
557        }
558        a = load_atomic8(pv);
559        b = load_atomic8(pv + 8);
560        break;
561    case -MO_64:
562        if (!HAVE_al8) {
563            cpu_loop_exit_atomic(env_cpu(env), ra);
564        }
565        a = load_atom_extract_al8x2(pv);
566        b = load_atom_extract_al8x2(pv + 8);
567        break;
568    case MO_128:
569        return load_atomic16_or_exit(env, ra, pv);
570    default:
571        g_assert_not_reached();
572    }
573    return int128_make128(HOST_BIG_ENDIAN ? b : a, HOST_BIG_ENDIAN ? a : b);
574}
575
576/**
577 * store_atomic2:
578 * @pv: host address
579 * @val: value to store
580 *
581 * Atomically store 2 aligned bytes to @pv.
582 */
583static inline void store_atomic2(void *pv, uint16_t val)
584{
585    uint16_t *p = __builtin_assume_aligned(pv, 2);
586    qatomic_set(p, val);
587}
588
589/**
590 * store_atomic4:
591 * @pv: host address
592 * @val: value to store
593 *
594 * Atomically store 4 aligned bytes to @pv.
595 */
596static inline void store_atomic4(void *pv, uint32_t val)
597{
598    uint32_t *p = __builtin_assume_aligned(pv, 4);
599    qatomic_set(p, val);
600}
601
602/**
603 * store_atomic8:
604 * @pv: host address
605 * @val: value to store
606 *
607 * Atomically store 8 aligned bytes to @pv.
608 */
609static inline void store_atomic8(void *pv, uint64_t val)
610{
611    uint64_t *p = __builtin_assume_aligned(pv, 8);
612
613    qemu_build_assert(HAVE_al8);
614    qatomic_set__nocheck(p, val);
615}
616
617/**
618 * store_atom_4x2
619 */
620static inline void store_atom_4_by_2(void *pv, uint32_t val)
621{
622    store_atomic2(pv, val >> (HOST_BIG_ENDIAN ? 16 : 0));
623    store_atomic2(pv + 2, val >> (HOST_BIG_ENDIAN ? 0 : 16));
624}
625
626/**
627 * store_atom_8_by_2
628 */
629static inline void store_atom_8_by_2(void *pv, uint64_t val)
630{
631    store_atom_4_by_2(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
632    store_atom_4_by_2(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
633}
634
635/**
636 * store_atom_8_by_4
637 */
638static inline void store_atom_8_by_4(void *pv, uint64_t val)
639{
640    store_atomic4(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
641    store_atomic4(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
642}
643
644/**
645 * store_atom_insert_al4:
646 * @p: host address
647 * @val: shifted value to store
648 * @msk: mask for value to store
649 *
650 * Atomically store @val to @p, masked by @msk.
651 */
652static void store_atom_insert_al4(uint32_t *p, uint32_t val, uint32_t msk)
653{
654    uint32_t old, new;
655
656    p = __builtin_assume_aligned(p, 4);
657    old = qatomic_read(p);
658    do {
659        new = (old & ~msk) | val;
660    } while (!__atomic_compare_exchange_n(p, &old, new, true,
661                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
662}
663
664/**
665 * store_atom_insert_al8:
666 * @p: host address
667 * @val: shifted value to store
668 * @msk: mask for value to store
669 *
670 * Atomically store @val to @p masked by @msk.
671 */
672static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
673{
674    uint64_t old, new;
675
676    qemu_build_assert(HAVE_al8);
677    p = __builtin_assume_aligned(p, 8);
678    old = qatomic_read__nocheck(p);
679    do {
680        new = (old & ~msk) | val;
681    } while (!__atomic_compare_exchange_n(p, &old, new, true,
682                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
683}
684
/**
 * store_bytes_leN:
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * Store @size bytes at @pv, one byte at a time.  The bytes are taken
 * from @val_le in little-endian order (least significant byte first).
 * Return the bytes of @val_le beyond @size that have not been stored.
 */
static uint64_t store_bytes_leN(void *pv, int size, uint64_t val_le)
{
    uint8_t *dst = pv;
    int left = size;

    while (left > 0) {
        *dst++ = val_le;
        val_le >>= 8;
        left--;
    }
    return val_le;
}
702
703/**
704 * store_parts_leN
705 * @pv: host address
706 * @size: number of bytes to store
707 * @val_le: data to store
708 *
709 * As store_bytes_leN, but atomically on each aligned part.
710 */
711G_GNUC_UNUSED
712static uint64_t store_parts_leN(void *pv, int size, uint64_t val_le)
713{
714    do {
715        int n;
716
717        /* Find minimum of alignment and size */
718        switch (((uintptr_t)pv | size) & 7) {
719        case 4:
720            store_atomic4(pv, le32_to_cpu(val_le));
721            val_le >>= 32;
722            n = 4;
723            break;
724        case 2:
725        case 6:
726            store_atomic2(pv, le16_to_cpu(val_le));
727            val_le >>= 16;
728            n = 2;
729            break;
730        default:
731            *(uint8_t *)pv = val_le;
732            val_le >>= 8;
733            n = 1;
734            break;
735        case 0:
736            g_assert_not_reached();
737        }
738        pv += n;
739        size -= n;
740    } while (size != 0);
741
742    return val_le;
743}
744
745/**
746 * store_whole_le4
747 * @pv: host address
748 * @size: number of bytes to store
749 * @val_le: data to store
750 *
751 * As store_bytes_leN, but atomically as a whole.
752 * Four aligned bytes are guaranteed to cover the store.
753 */
754static uint64_t store_whole_le4(void *pv, int size, uint64_t val_le)
755{
756    int sz = size * 8;
757    int o = (uintptr_t)pv & 3;
758    int sh = o * 8;
759    uint32_t m = MAKE_64BIT_MASK(0, sz);
760    uint32_t v;
761
762    if (HOST_BIG_ENDIAN) {
763        v = bswap32(val_le) >> sh;
764        m = bswap32(m) >> sh;
765    } else {
766        v = val_le << sh;
767        m <<= sh;
768    }
769    store_atom_insert_al4(pv - o, v, m);
770    return val_le >> sz;
771}
772
773/**
774 * store_whole_le8
775 * @pv: host address
776 * @size: number of bytes to store
777 * @val_le: data to store
778 *
779 * As store_bytes_leN, but atomically as a whole.
780 * Eight aligned bytes are guaranteed to cover the store.
781 */
782static uint64_t store_whole_le8(void *pv, int size, uint64_t val_le)
783{
784    int sz = size * 8;
785    int o = (uintptr_t)pv & 7;
786    int sh = o * 8;
787    uint64_t m = MAKE_64BIT_MASK(0, sz);
788    uint64_t v;
789
790    qemu_build_assert(HAVE_al8);
791    if (HOST_BIG_ENDIAN) {
792        v = bswap64(val_le) >> sh;
793        m = bswap64(m) >> sh;
794    } else {
795        v = val_le << sh;
796        m <<= sh;
797    }
798    store_atom_insert_al8(pv - o, v, m);
799    return val_le >> sz;
800}
801
802/**
803 * store_whole_le16
804 * @pv: host address
805 * @size: number of bytes to store
806 * @val_le: data to store
807 *
808 * As store_bytes_leN, but atomically as a whole.
809 * 16 aligned bytes are guaranteed to cover the store.
810 */
811static uint64_t store_whole_le16(void *pv, int size, Int128 val_le)
812{
813    int sz = size * 8;
814    int o = (uintptr_t)pv & 15;
815    int sh = o * 8;
816    Int128 m, v;
817
818    qemu_build_assert(HAVE_ATOMIC128_RW);
819
820    /* Like MAKE_64BIT_MASK(0, sz), but larger. */
821    if (sz <= 64) {
822        m = int128_make64(MAKE_64BIT_MASK(0, sz));
823    } else {
824        m = int128_make128(-1, MAKE_64BIT_MASK(0, sz - 64));
825    }
826
827    if (HOST_BIG_ENDIAN) {
828        v = int128_urshift(bswap128(val_le), sh);
829        m = int128_urshift(bswap128(m), sh);
830    } else {
831        v = int128_lshift(val_le, sh);
832        m = int128_lshift(m, sh);
833    }
834    store_atom_insert_al16(pv - o, v, m);
835
836    if (sz <= 64) {
837        return 0;
838    }
839    return int128_gethi(val_le) >> (sz - 64);
840}
841
842/**
843 * store_atom_2:
844 * @p: host address
845 * @val: the value to store
846 * @memop: the full memory op
847 *
848 * Store 2 bytes to @p, honoring the atomicity of @memop.
849 */
850static void store_atom_2(CPUArchState *env, uintptr_t ra,
851                         void *pv, MemOp memop, uint16_t val)
852{
853    uintptr_t pi = (uintptr_t)pv;
854    int atmax;
855
856    if (likely((pi & 1) == 0)) {
857        store_atomic2(pv, val);
858        return;
859    }
860
861    atmax = required_atomicity(env, pi, memop);
862    if (atmax == MO_8) {
863        stw_he_p(pv, val);
864        return;
865    }
866
867    /*
868     * The only case remaining is MO_ATOM_WITHIN16.
869     * Big or little endian, we want the middle two bytes in each test.
870     */
871    if ((pi & 3) == 1) {
872        store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16));
873        return;
874    } else if ((pi & 7) == 3) {
875        if (HAVE_al8) {
876            store_atom_insert_al8(pv - 3, (uint64_t)val << 24, MAKE_64BIT_MASK(24, 16));
877            return;
878        }
879    } else if ((pi & 15) == 7) {
880        if (HAVE_ATOMIC128_RW) {
881            Int128 v = int128_lshift(int128_make64(val), 56);
882            Int128 m = int128_lshift(int128_make64(0xffff), 56);
883            store_atom_insert_al16(pv - 7, v, m);
884            return;
885        }
886    } else {
887        g_assert_not_reached();
888    }
889
890    cpu_loop_exit_atomic(env_cpu(env), ra);
891}
892
893/**
894 * store_atom_4:
895 * @p: host address
896 * @val: the value to store
897 * @memop: the full memory op
898 *
899 * Store 4 bytes to @p, honoring the atomicity of @memop.
900 */
901static void store_atom_4(CPUArchState *env, uintptr_t ra,
902                         void *pv, MemOp memop, uint32_t val)
903{
904    uintptr_t pi = (uintptr_t)pv;
905    int atmax;
906
907    if (likely((pi & 3) == 0)) {
908        store_atomic4(pv, val);
909        return;
910    }
911
912    atmax = required_atomicity(env, pi, memop);
913    switch (atmax) {
914    case MO_8:
915        stl_he_p(pv, val);
916        return;
917    case MO_16:
918        store_atom_4_by_2(pv, val);
919        return;
920    case -MO_16:
921        {
922            uint32_t val_le = cpu_to_le32(val);
923            int s2 = pi & 3;
924            int s1 = 4 - s2;
925
926            switch (s2) {
927            case 1:
928                val_le = store_whole_le4(pv, s1, val_le);
929                *(uint8_t *)(pv + 3) = val_le;
930                break;
931            case 3:
932                *(uint8_t *)pv = val_le;
933                store_whole_le4(pv + 1, s2, val_le >> 8);
934                break;
935            case 0: /* aligned */
936            case 2: /* atmax MO_16 */
937            default:
938                g_assert_not_reached();
939            }
940        }
941        return;
942    case MO_32:
943        if ((pi & 7) < 4) {
944            if (HAVE_al8) {
945                store_whole_le8(pv, 4, cpu_to_le32(val));
946                return;
947            }
948        } else {
949            if (HAVE_ATOMIC128_RW) {
950                store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val)));
951                return;
952            }
953        }
954        cpu_loop_exit_atomic(env_cpu(env), ra);
955    default:
956        g_assert_not_reached();
957    }
958}
959
960/**
961 * store_atom_8:
962 * @p: host address
963 * @val: the value to store
964 * @memop: the full memory op
965 *
966 * Store 8 bytes to @p, honoring the atomicity of @memop.
967 */
968static void store_atom_8(CPUArchState *env, uintptr_t ra,
969                         void *pv, MemOp memop, uint64_t val)
970{
971    uintptr_t pi = (uintptr_t)pv;
972    int atmax;
973
974    if (HAVE_al8 && likely((pi & 7) == 0)) {
975        store_atomic8(pv, val);
976        return;
977    }
978
979    atmax = required_atomicity(env, pi, memop);
980    switch (atmax) {
981    case MO_8:
982        stq_he_p(pv, val);
983        return;
984    case MO_16:
985        store_atom_8_by_2(pv, val);
986        return;
987    case MO_32:
988        store_atom_8_by_4(pv, val);
989        return;
990    case -MO_32:
991        if (HAVE_al8) {
992            uint64_t val_le = cpu_to_le64(val);
993            int s2 = pi & 7;
994            int s1 = 8 - s2;
995
996            switch (s2) {
997            case 1 ... 3:
998                val_le = store_whole_le8(pv, s1, val_le);
999                store_bytes_leN(pv + s1, s2, val_le);
1000                break;
1001            case 5 ... 7:
1002                val_le = store_bytes_leN(pv, s1, val_le);
1003                store_whole_le8(pv + s1, s2, val_le);
1004                break;
1005            case 0: /* aligned */
1006            case 4: /* atmax MO_32 */
1007            default:
1008                g_assert_not_reached();
1009            }
1010            return;
1011        }
1012        break;
1013    case MO_64:
1014        if (HAVE_ATOMIC128_RW) {
1015            store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val)));
1016            return;
1017        }
1018        break;
1019    default:
1020        g_assert_not_reached();
1021    }
1022    cpu_loop_exit_atomic(env_cpu(env), ra);
1023}
1024
1025/**
1026 * store_atom_16:
1027 * @p: host address
1028 * @val: the value to store
1029 * @memop: the full memory op
1030 *
1031 * Store 16 bytes to @p, honoring the atomicity of @memop.
1032 */
1033static void store_atom_16(CPUArchState *env, uintptr_t ra,
1034                          void *pv, MemOp memop, Int128 val)
1035{
1036    uintptr_t pi = (uintptr_t)pv;
1037    uint64_t a, b;
1038    int atmax;
1039
1040    if (HAVE_ATOMIC128_RW && likely((pi & 15) == 0)) {
1041        atomic16_set(pv, val);
1042        return;
1043    }
1044
1045    atmax = required_atomicity(env, pi, memop);
1046
1047    a = HOST_BIG_ENDIAN ? int128_gethi(val) : int128_getlo(val);
1048    b = HOST_BIG_ENDIAN ? int128_getlo(val) : int128_gethi(val);
1049    switch (atmax) {
1050    case MO_8:
1051        memcpy(pv, &val, 16);
1052        return;
1053    case MO_16:
1054        store_atom_8_by_2(pv, a);
1055        store_atom_8_by_2(pv + 8, b);
1056        return;
1057    case MO_32:
1058        store_atom_8_by_4(pv, a);
1059        store_atom_8_by_4(pv + 8, b);
1060        return;
1061    case MO_64:
1062        if (HAVE_al8) {
1063            store_atomic8(pv, a);
1064            store_atomic8(pv + 8, b);
1065            return;
1066        }
1067        break;
1068    case -MO_64:
1069        if (HAVE_ATOMIC128_RW) {
1070            uint64_t val_le;
1071            int s2 = pi & 15;
1072            int s1 = 16 - s2;
1073
1074            if (HOST_BIG_ENDIAN) {
1075                val = bswap128(val);
1076            }
1077            switch (s2) {
1078            case 1 ... 7:
1079                val_le = store_whole_le16(pv, s1, val);
1080                store_bytes_leN(pv + s1, s2, val_le);
1081                break;
1082            case 9 ... 15:
1083                store_bytes_leN(pv, s1, int128_getlo(val));
1084                val = int128_urshift(val, s1 * 8);
1085                store_whole_le16(pv + s1, s2, val);
1086                break;
1087            case 0: /* aligned */
1088            case 8: /* atmax MO_64 */
1089            default:
1090                g_assert_not_reached();
1091            }
1092            return;
1093        }
1094        break;
1095    case MO_128:
1096        if (HAVE_ATOMIC128_RW) {
1097            atomic16_set(pv, val);
1098            return;
1099        }
1100        break;
1101    default:
1102        g_assert_not_reached();
1103    }
1104    cpu_loop_exit_atomic(env_cpu(env), ra);
1105}
1106