/*
 * Routines common to user and system emulation of load/store.
 *
 *  Copyright (c) 2022 Linaro, Ltd.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#ifdef CONFIG_ATOMIC64
# define HAVE_al8          true
#else
# define HAVE_al8          false
#endif
#define HAVE_al8_fast      (ATOMIC_REG_SIZE >= 8)

#if defined(CONFIG_ATOMIC128)
# define HAVE_al16_fast    true
#else
# define HAVE_al16_fast    false
#endif
#if defined(CONFIG_ATOMIC128) || defined(CONFIG_CMPXCHG128)
# define HAVE_al16         true
#else
# define HAVE_al16         false
#endif


/**
 * required_atomicity:
 *
 * Return the lg2 bytes of atomicity required by @memop for @p.
 * If the operation must be split into two operations to be
 * examined separately for atomicity, return -lg2.
 */
static int required_atomicity(CPUArchState *env, uintptr_t p, MemOp memop)
{
    MemOp atom = memop & MO_ATOM_MASK;
    MemOp size = memop & MO_SIZE;
    MemOp half = size ? size - 1 : 0;
    unsigned tmp;
    int atmax;

    switch (atom) {
    case MO_ATOM_NONE:
        atmax = MO_8;
        break;

    case MO_ATOM_IFALIGN_PAIR:
        size = half;
        /* fall through */

    case MO_ATOM_IFALIGN:
        tmp = (1 << size) - 1;
        atmax = p & tmp ? MO_8 : size;
        break;

    case MO_ATOM_WITHIN16:
        tmp = p & 15;
        atmax = (tmp + (1 << size) <= 16 ? size : MO_8);
        break;

    case MO_ATOM_WITHIN16_PAIR:
        tmp = p & 15;
        if (tmp + (1 << size) <= 16) {
            atmax = size;
        } else if (tmp + (1 << half) == 16) {
            /*
             * The pair exactly straddles the boundary.
             * Both halves are naturally aligned and atomic.
             */
            atmax = half;
        } else {
            /*
             * One of the pair crosses the boundary, and is non-atomic.
             * The other of the pair does not cross, and is atomic.
             */
            atmax = -half;
        }
        break;

    case MO_ATOM_SUBALIGN:
        /*
         * Examine the alignment of p to determine if there are subobjects
         * that must be aligned.  Note that we only really need ctz4() --
         * any more significant bits are discarded by the immediately
         * following comparison.
         */
        tmp = ctz32(p);
        atmax = MIN(size, tmp);
        break;

    default:
        g_assert_not_reached();
    }

    /*
     * Here we have the architectural atomicity of the operation.
     * However, when executing in a serial context, we need no extra
     * host atomicity in order to avoid racing.  This reduction
     * avoids looping with cpu_loop_exit_atomic.
     */
    if (cpu_in_serial_context(env_cpu(env))) {
        return MO_8;
    }
    return atmax;
}
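
/*
 * Worked examples:
 * - MO_ATOM_WITHIN16, 8-byte access at p % 16 == 6: the access ends at
 *   offset 14 and stays within the 16-byte block, so atmax is MO_64
 *   even though p is not 8-aligned; at p % 16 == 10 it would cross the
 *   block and atmax falls back to MO_8.
 * - MO_ATOM_WITHIN16_PAIR, 16-byte access at p % 16 == 8: the pair
 *   exactly straddles the boundary and atmax is MO_64 (per half);
 *   at p % 16 == 4 one half crosses the boundary and atmax is -MO_64.
 * - MO_ATOM_SUBALIGN, 8-byte access at p % 8 == 2: ctz32 yields 1,
 *   so atmax is MO_16 (2-byte subobjects must remain atomic).
 */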

/**
 * load_atomic2:
 * @pv: host address
 *
 * Atomically load 2 aligned bytes from @pv.
 */
static inline uint16_t load_atomic2(void *pv)
{
    uint16_t *p = __builtin_assume_aligned(pv, 2);
    return qatomic_read(p);
}

/**
 * load_atomic4:
 * @pv: host address
 *
 * Atomically load 4 aligned bytes from @pv.
 */
static inline uint32_t load_atomic4(void *pv)
{
    uint32_t *p = __builtin_assume_aligned(pv, 4);
    return qatomic_read(p);
}

/**
 * load_atomic8:
 * @pv: host address
 *
 * Atomically load 8 aligned bytes from @pv.
 */
static inline uint64_t load_atomic8(void *pv)
{
    uint64_t *p = __builtin_assume_aligned(pv, 8);

    qemu_build_assert(HAVE_al8);
    return qatomic_read__nocheck(p);
}

/**
 * load_atomic16:
 * @pv: host address
 *
 * Atomically load 16 aligned bytes from @pv.
 */
static inline Int128 load_atomic16(void *pv)
{
#ifdef CONFIG_ATOMIC128
    __uint128_t *p = __builtin_assume_aligned(pv, 16);
    Int128Alias r;

    r.u = qatomic_read__nocheck(p);
    return r.s;
#else
    qemu_build_not_reached();
#endif
}

/**
 * load_atomic8_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 *
 * Atomically load 8 aligned bytes from @pv.
 * If this is not possible, longjmp out to restart serially.
 */
static uint64_t load_atomic8_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
{
    if (HAVE_al8) {
        return load_atomic8(pv);
    }

#ifdef CONFIG_USER_ONLY
    /*
     * If the page is not writable, then assume the value is immutable
     * and requires no locking.  This ignores the case of MAP_SHARED with
     * another process, because the fallback start_exclusive solution
     * provides no protection across processes.
     */
    if (!page_check_range(h2g(pv), 8, PAGE_WRITE)) {
        uint64_t *p = __builtin_assume_aligned(pv, 8);
        return *p;
    }
#endif

    /* Ultimate fallback: re-execute in serial context. */
    cpu_loop_exit_atomic(env_cpu(env), ra);
}

/**
 * load_atomic16_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 *
 * Atomically load 16 aligned bytes from @pv.
 * If this is not possible, longjmp out to restart serially.
 */
static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
{
    Int128 *p = __builtin_assume_aligned(pv, 16);

    if (HAVE_al16_fast) {
        return load_atomic16(p);
    }

#ifdef CONFIG_USER_ONLY
    /*
     * We can only use cmpxchg to emulate a load if the page is writable.
     * If the page is not writable, then assume the value is immutable
     * and requires no locking.  This ignores the case of MAP_SHARED with
     * another process, because the fallback start_exclusive solution
     * provides no protection across processes.
     */
    if (!page_check_range(h2g(p), 16, PAGE_WRITE)) {
        return *p;
    }
#endif

    /*
     * In system mode all guest pages are writable, and for user-only
     * we have just checked writability.  Try cmpxchg.
     */
#if defined(CONFIG_CMPXCHG128)
    /* Swap 0 with 0, with the side-effect of returning the old value. */
    {
        Int128Alias r;
        r.u = __sync_val_compare_and_swap_16((__uint128_t *)p, 0, 0);
        return r.s;
    }
#endif

    /* Ultimate fallback: re-execute in serial context. */
    cpu_loop_exit_atomic(env_cpu(env), ra);
}

/**
 * load_atom_extract_al4x2:
 * @pv: host address
 *
 * Load 4 bytes from @p, from two sequential atomic 4-byte loads.
 */
static uint32_t load_atom_extract_al4x2(void *pv)
{
    uintptr_t pi = (uintptr_t)pv;
    int sh = (pi & 3) * 8;
    uint32_t a, b;

    pv = (void *)(pi & ~3);
    a = load_atomic4(pv);
    b = load_atomic4(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return (a << sh) | (b >> (-sh & 31));
    } else {
        return (a >> sh) | (b << (-sh & 31));
    }
}
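
/*
 * load_atom_extract_al4x2 example: with pi % 4 == 3 on a little-endian
 * host, sh == 24 and the result is (a >> 24) | (b << 8), i.e. the last
 * byte of the first aligned word followed by the first three bytes of
 * the second.  Callers only reach this with pv not 4-aligned, so sh is
 * never zero.
 */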

/**
 * load_atom_extract_al8x2:
 * @pv: host address
 *
 * Load 8 bytes from @p, from two sequential atomic 8-byte loads.
 */
static uint64_t load_atom_extract_al8x2(void *pv)
{
    uintptr_t pi = (uintptr_t)pv;
    int sh = (pi & 7) * 8;
    uint64_t a, b;

    pv = (void *)(pi & ~7);
    a = load_atomic8(pv);
    b = load_atomic8(pv + 8);

    if (HOST_BIG_ENDIAN) {
        return (a << sh) | (b >> (-sh & 63));
    } else {
        return (a >> sh) | (b << (-sh & 63));
    }
}

/**
 * load_atom_extract_al8_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 * @s: object size in bytes, @s <= 4.
 *
 * Atomically load @s bytes from @p, when p % s != 0, and [p, p+s-1] does
 * not cross an 8-byte boundary.  This means that we can perform an atomic
 * 8-byte load and extract.
 * The value is returned in the low bits of a uint32_t.
 */
static uint32_t load_atom_extract_al8_or_exit(CPUArchState *env, uintptr_t ra,
                                              void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8;

    pv = (void *)(pi & ~7);
    return load_atomic8_or_exit(env, ra, pv) >> shr;
}
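
/*
 * load_atom_extract_al8_or_exit example: loading 2 bytes at an offset
 * of 3 within an 8-byte block gives shr == 24 on either endianness,
 * so the two requested bytes end up in the low 16 bits of the
 * atomically loaded 8-byte value.
 */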

/**
 * load_atom_extract_al16_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @p: host address
 * @s: object size in bytes, @s <= 8.
 *
 * Atomically load @s bytes from @p, when p % 16 < 8
 * and p % 16 + s > 8.  I.e. does not cross a 16-byte
 * boundary, but *does* cross an 8-byte boundary.
 * This is the slow version, so we must have eliminated
 * any faster load_atom_extract_al8_or_exit case.
 *
 * If this is not possible, longjmp out to restart serially.
 */
static uint64_t load_atom_extract_al16_or_exit(CPUArchState *env, uintptr_t ra,
                                               void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
    Int128 r;

    /*
     * Note constraints above: p & 8 must be clear.
     * Provoke SIGBUS if possible otherwise.
     */
    pv = (void *)(pi & ~7);
    r = load_atomic16_or_exit(env, ra, pv);

    r = int128_urshift(r, shr);
    return int128_getlo(r);
}

/**
 * load_atom_extract_al16_or_al8:
 * @p: host address
 * @s: object size in bytes, @s <= 8.
 *
 * Load @s bytes from @p, when p % s != 0.  If [p, p+s-1] does not
 * cross a 16-byte boundary then the access must be 16-byte atomic,
 * otherwise the access must be 8-byte atomic.
 */
static inline uint64_t load_atom_extract_al16_or_al8(void *pv, int s)
{
#if defined(CONFIG_ATOMIC128)
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
    __uint128_t r;

    pv = (void *)(pi & ~7);
    if (pi & 8) {
        uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8);
        uint64_t a = qatomic_read__nocheck(p8);
        uint64_t b = qatomic_read__nocheck(p8 + 1);

        if (HOST_BIG_ENDIAN) {
            r = ((__uint128_t)a << 64) | b;
        } else {
            r = ((__uint128_t)b << 64) | a;
        }
    } else {
        __uint128_t *p16 = __builtin_assume_aligned(pv, 16, 0);
        r = qatomic_read__nocheck(p16);
    }
    return r >> shr;
#else
    qemu_build_not_reached();
#endif
}
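
/*
 * load_atom_extract_al16_or_al8 example: an 8-byte value at
 * pi % 16 == 4 lies entirely within one 16-byte block, so the
 * (pi & 8) == 0 path performs a single atomic 16-byte load and
 * extracts from it.  At pi % 16 == 12 the value crosses the 16-byte
 * boundary, so two atomic 8-byte loads provide all the atomicity
 * that the contract above requires.
 */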

/**
 * load_atom_4_by_2:
 * @pv: host address
 *
 * Load 4 bytes from @pv, with two 2-byte atomic loads.
 */
static inline uint32_t load_atom_4_by_2(void *pv)
{
    uint32_t a = load_atomic2(pv);
    uint32_t b = load_atomic2(pv + 2);

    if (HOST_BIG_ENDIAN) {
        return (a << 16) | b;
    } else {
        return (b << 16) | a;
    }
}

/**
 * load_atom_8_by_2:
 * @pv: host address
 *
 * Load 8 bytes from @pv, with four 2-byte atomic loads.
 */
static inline uint64_t load_atom_8_by_2(void *pv)
{
    uint32_t a = load_atom_4_by_2(pv);
    uint32_t b = load_atom_4_by_2(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return ((uint64_t)a << 32) | b;
    } else {
        return ((uint64_t)b << 32) | a;
    }
}

/**
 * load_atom_8_by_4:
 * @pv: host address
 *
 * Load 8 bytes from @pv, with two 4-byte atomic loads.
 */
static inline uint64_t load_atom_8_by_4(void *pv)
{
    uint32_t a = load_atomic4(pv);
    uint32_t b = load_atomic4(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return ((uint64_t)a << 32) | b;
    } else {
        return ((uint64_t)b << 32) | a;
    }
}

/**
 * load_atom_2:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 2 bytes from @p, honoring the atomicity of @memop.
 */
static uint16_t load_atom_2(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 1) == 0)) {
        return load_atomic2(pv);
    }
    if (HAVE_al16_fast) {
        return load_atom_extract_al16_or_al8(pv, 2);
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        return lduw_he_p(pv);
    case MO_16:
        /* The only case remaining is MO_ATOM_WITHIN16. */
        if (!HAVE_al8_fast && (pi & 3) == 1) {
            /* Big or little endian, we want the middle two bytes. */
            return load_atomic4(pv - 1) >> 8;
        }
        if ((pi & 15) != 7) {
            return load_atom_extract_al8_or_exit(env, ra, pv, 2);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 2);
    default:
        g_assert_not_reached();
    }
}
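
/*
 * In the MO_16 case above, pv is unaligned but the two bytes are known
 * not to cross a 16-byte boundary; the only such offset at which they
 * can straddle an 8-byte boundary is pi % 16 == 7, so every other
 * offset is served by the cheaper 8-byte extract.
 */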

/**
 * load_atom_4:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 4 bytes from @p, honoring the atomicity of @memop.
 */
static uint32_t load_atom_4(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 3) == 0)) {
        return load_atomic4(pv);
    }
    if (HAVE_al16_fast) {
        return load_atom_extract_al16_or_al8(pv, 4);
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
    case MO_16:
    case -MO_16:
        /*
         * For MO_ATOM_IFALIGN, this is more atomicity than required,
         * but it's trivially supported on all hosts, better than 4
         * individual byte loads (when the host requires alignment),
         * and overlaps with the MO_ATOM_SUBALIGN case of p % 2 == 0.
         */
        return load_atom_extract_al4x2(pv);
    case MO_32:
        if (!(pi & 4)) {
            return load_atom_extract_al8_or_exit(env, ra, pv, 4);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 4);
    default:
        g_assert_not_reached();
    }
}
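
/*
 * In the MO_32 case above, the value is not 4-aligned yet must be
 * loaded atomically as a whole: with (pi & 4) == 0 all four bytes sit
 * inside one 8-byte unit, so an atomic 8-byte load and extract
 * suffices; otherwise they cross the 8-byte boundary but, per the
 * MO_ATOM_WITHIN16 constraint, not the 16-byte one, so a 16-byte load
 * is required.
 */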

/**
 * load_atom_8:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 8 bytes from @p, honoring the atomicity of @memop.
 */
static uint64_t load_atom_8(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    /*
     * If the host does not support 8-byte atomics, wait until we have
     * examined the atomicity parameters below.
     */
    if (HAVE_al8 && likely((pi & 7) == 0)) {
        return load_atomic8(pv);
    }
    if (HAVE_al16_fast) {
        return load_atom_extract_al16_or_al8(pv, 8);
    }

    atmax = required_atomicity(env, pi, memop);
    if (atmax == MO_64) {
        if (!HAVE_al8 && (pi & 7) == 0) {
            load_atomic8_or_exit(env, ra, pv);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 8);
    }
    if (HAVE_al8_fast) {
        return load_atom_extract_al8x2(pv);
    }
    switch (atmax) {
    case MO_8:
        return ldq_he_p(pv);
    case MO_16:
        return load_atom_8_by_2(pv);
    case MO_32:
        return load_atom_8_by_4(pv);
    case -MO_32:
        if (HAVE_al8) {
            return load_atom_extract_al8x2(pv);
        }
        cpu_loop_exit_atomic(env_cpu(env), ra);
    default:
        g_assert_not_reached();
    }
}
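
/*
 * In the -MO_32 case above, one 4-byte half of the value crosses a
 * 16-byte boundary and needs no atomicity, while the other half does
 * not cross and must be atomic.  The non-crossing half always falls
 * within one of the two aligned 8-byte loads performed by
 * load_atom_extract_al8x2, so that routine is sufficient.
 */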

/**
 * store_atomic2:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 2 aligned bytes to @pv.
 */
static inline void store_atomic2(void *pv, uint16_t val)
{
    uint16_t *p = __builtin_assume_aligned(pv, 2);
    qatomic_set(p, val);
}

/**
 * store_atomic4:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 4 aligned bytes to @pv.
 */
static inline void store_atomic4(void *pv, uint32_t val)
{
    uint32_t *p = __builtin_assume_aligned(pv, 4);
    qatomic_set(p, val);
}

/**
 * store_atomic8:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 8 aligned bytes to @pv.
 */
static inline void store_atomic8(void *pv, uint64_t val)
{
    uint64_t *p = __builtin_assume_aligned(pv, 8);

    qemu_build_assert(HAVE_al8);
    qatomic_set__nocheck(p, val);
}

/**
 * store_atom_4_by_2
 */
static inline void store_atom_4_by_2(void *pv, uint32_t val)
{
    store_atomic2(pv, val >> (HOST_BIG_ENDIAN ? 16 : 0));
    store_atomic2(pv + 2, val >> (HOST_BIG_ENDIAN ? 0 : 16));
}

/**
 * store_atom_8_by_2
 */
static inline void store_atom_8_by_2(void *pv, uint64_t val)
{
    store_atom_4_by_2(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
    store_atom_4_by_2(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
}

/**
 * store_atom_8_by_4
 */
static inline void store_atom_8_by_4(void *pv, uint64_t val)
{
    store_atomic4(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
    store_atomic4(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
}

/**
 * store_atom_insert_al4:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p, masked by @msk.
 */
static void store_atom_insert_al4(uint32_t *p, uint32_t val, uint32_t msk)
{
    uint32_t old, new;

    p = __builtin_assume_aligned(p, 4);
    old = qatomic_read(p);
    do {
        new = (old & ~msk) | val;
    } while (!__atomic_compare_exchange_n(p, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}

/**
 * store_atom_insert_al8:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p masked by @msk.
 */
static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
{
    uint64_t old, new;

    qemu_build_assert(HAVE_al8);
    p = __builtin_assume_aligned(p, 8);
    old = qatomic_read__nocheck(p);
    do {
        new = (old & ~msk) | val;
    } while (!__atomic_compare_exchange_n(p, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}

/**
 * store_atom_insert_al16:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p masked by @msk.
 */
static void store_atom_insert_al16(Int128 *ps, Int128Alias val, Int128Alias msk)
{
#if defined(CONFIG_ATOMIC128)
    __uint128_t *pu, old, new;

    /* With CONFIG_ATOMIC128, we can avoid the memory barriers. */
    pu = __builtin_assume_aligned(ps, 16);
    old = *pu;
    do {
        new = (old & ~msk.u) | val.u;
    } while (!__atomic_compare_exchange_n(pu, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
#elif defined(CONFIG_CMPXCHG128)
    __uint128_t *pu, old, new;

    /*
     * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always
     * defer to libatomic, so we must use __sync_*_compare_and_swap_16
     * and accept the sequential consistency that comes with it.
     */
    pu = __builtin_assume_aligned(ps, 16);
    do {
        old = *pu;
        new = (old & ~msk.u) | val.u;
    } while (!__sync_bool_compare_and_swap_16(pu, old, new));
#else
    qemu_build_not_reached();
#endif
}

/**
 * store_bytes_leN:
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * Store @size bytes at @p.  The bytes to store are extracted in little-endian order
 * from @val_le; return the bytes of @val_le beyond @size that have not been stored.
 */
static uint64_t store_bytes_leN(void *pv, int size, uint64_t val_le)
{
    uint8_t *p = pv;
    for (int i = 0; i < size; i++, val_le >>= 8) {
        p[i] = val_le;
    }
    return val_le;
}
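
/*
 * store_bytes_leN example: with size == 3, the three low-order bytes of
 * val_le are written to p[0..2] and the return value is val_le shifted
 * right by 24 bits, ready for the caller to store the remainder
 * elsewhere.
 */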

/**
 * store_parts_leN
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically on each aligned part.
 */
G_GNUC_UNUSED
static uint64_t store_parts_leN(void *pv, int size, uint64_t val_le)
{
    do {
        int n;

        /* Find minimum of alignment and size */
        switch (((uintptr_t)pv | size) & 7) {
        case 4:
            store_atomic4(pv, le32_to_cpu(val_le));
            val_le >>= 32;
            n = 4;
            break;
        case 2:
        case 6:
            store_atomic2(pv, le16_to_cpu(val_le));
            val_le >>= 16;
            n = 2;
            break;
        default:
            *(uint8_t *)pv = val_le;
            val_le >>= 8;
            n = 1;
            break;
        case 0:
            g_assert_not_reached();
        }
        pv += n;
        size -= n;
    } while (size != 0);

    return val_le;
}
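
/*
 * store_parts_leN example: a 6-byte store at pv % 8 == 2 decomposes
 * into an aligned 2-byte atomic store followed by an aligned 4-byte
 * atomic store, so each naturally aligned part is individually atomic.
 */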

/**
 * store_whole_le4
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * Four aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le4(void *pv, int size, uint64_t val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 3;
    int sh = o * 8;
    uint32_t m = MAKE_64BIT_MASK(0, sz);
    uint32_t v;

    if (HOST_BIG_ENDIAN) {
        v = bswap32(val_le) >> sh;
        m = bswap32(m) >> sh;
    } else {
        v = val_le << sh;
        m <<= sh;
    }
    store_atom_insert_al4(pv - o, v, m);
    return val_le >> sz;
}
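
/*
 * store_whole_le4 example: storing 3 bytes at pv % 4 == 1 (as in the
 * -MO_16 path of store_atom_4 below) uses the mask 0xffffff00 on a
 * little-endian host, atomically replacing bytes 1..3 of the
 * containing aligned word and returning the unstored high byte of
 * val_le.
 */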

/**
 * store_whole_le8
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * Eight aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le8(void *pv, int size, uint64_t val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 7;
    int sh = o * 8;
    uint64_t m = MAKE_64BIT_MASK(0, sz);
    uint64_t v;

    qemu_build_assert(HAVE_al8);
    if (HOST_BIG_ENDIAN) {
        v = bswap64(val_le) >> sh;
        m = bswap64(m) >> sh;
    } else {
        v = val_le << sh;
        m <<= sh;
    }
    store_atom_insert_al8(pv - o, v, m);
    return val_le >> sz;
}

/**
 * store_whole_le16
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * 16 aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le16(void *pv, int size, Int128 val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 15;
    int sh = o * 8;
    Int128 m, v;

    qemu_build_assert(HAVE_al16);

    /* Like MAKE_64BIT_MASK(0, sz), but larger. */
    if (sz <= 64) {
        m = int128_make64(MAKE_64BIT_MASK(0, sz));
    } else {
        m = int128_make128(-1, MAKE_64BIT_MASK(0, sz - 64));
    }

    if (HOST_BIG_ENDIAN) {
        v = int128_urshift(bswap128(val_le), sh);
        m = int128_urshift(bswap128(m), sh);
    } else {
        v = int128_lshift(val_le, sh);
        m = int128_lshift(m, sh);
    }
    store_atom_insert_al16(pv - o, v, m);

    /* Unused if sz <= 64. */
    return int128_gethi(val_le) >> (sz - 64);
}
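
/*
 * Mask construction in store_whole_le16: for a store of up to 8 bytes
 * sz <= 64 and the mask fits in the low half of the Int128; for larger
 * stores the low half is all-ones and the high half covers the
 * remaining sz - 64 bits.  The shifted-out return value is only
 * meaningful in that larger case.
 */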

/**
 * store_atom_2:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 2 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_2(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint16_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 1) == 0)) {
        store_atomic2(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    if (atmax == MO_8) {
        stw_he_p(pv, val);
        return;
    }

    /*
     * The only case remaining is MO_ATOM_WITHIN16.
     * Big or little endian, we want the middle two bytes in each test.
     */
    if ((pi & 3) == 1) {
        store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16));
        return;
    } else if ((pi & 7) == 3) {
        if (HAVE_al8) {
            store_atom_insert_al8(pv - 3, (uint64_t)val << 24, MAKE_64BIT_MASK(24, 16));
            return;
        }
    } else if ((pi & 15) == 7) {
        if (HAVE_al16) {
            Int128 v = int128_lshift(int128_make64(val), 56);
            Int128 m = int128_lshift(int128_make64(0xffff), 56);
            store_atom_insert_al16(pv - 7, v, m);
            return;
        }
    } else {
        g_assert_not_reached();
    }

    cpu_loop_exit_atomic(env_cpu(env), ra);
}
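
/*
 * The three misaligned cases above cover every odd pv that does not
 * cross a 16-byte boundary: the two bytes either stay within an
 * aligned 4-byte word (pi % 4 == 1), straddle only a 4-byte boundary
 * (pi % 8 == 3), or straddle the central 8-byte boundary of the
 * 16-byte block (pi % 16 == 7); each case masks the value into the
 * smallest aligned unit that contains both bytes.
 */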

/**
 * store_atom_4:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 4 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_4(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint32_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 3) == 0)) {
        store_atomic4(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        stl_he_p(pv, val);
        return;
    case MO_16:
        store_atom_4_by_2(pv, val);
        return;
    case -MO_16:
        {
            uint32_t val_le = cpu_to_le32(val);
            int s2 = pi & 3;
            int s1 = 4 - s2;

            switch (s2) {
            case 1:
                val_le = store_whole_le4(pv, s1, val_le);
                *(uint8_t *)(pv + 3) = val_le;
                break;
            case 3:
                *(uint8_t *)pv = val_le;
                store_whole_le4(pv + 1, s2, val_le >> 8);
                break;
            case 0: /* aligned */
            case 2: /* atmax MO_16 */
            default:
                g_assert_not_reached();
            }
        }
        return;
    case MO_32:
        if ((pi & 7) < 4) {
            if (HAVE_al8) {
                store_whole_le8(pv, 4, cpu_to_le32(val));
                return;
            }
        } else {
            if (HAVE_al16) {
                store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val)));
                return;
            }
        }
        cpu_loop_exit_atomic(env_cpu(env), ra);
    default:
        g_assert_not_reached();
    }
}
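
/*
 * In the -MO_16 case above the value is split at the 16-byte boundary:
 * the 2-byte half that stays within the block is covered by an atomic
 * masked insert into its aligned 4-byte word (store_whole_le4), while
 * the remaining byte belongs to the half that crosses the boundary,
 * needs no atomicity, and is written on its own.
 */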

/**
 * store_atom_8:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 8 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_8(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint64_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (HAVE_al8 && likely((pi & 7) == 0)) {
        store_atomic8(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        stq_he_p(pv, val);
        return;
    case MO_16:
        store_atom_8_by_2(pv, val);
        return;
    case MO_32:
        store_atom_8_by_4(pv, val);
        return;
    case -MO_32:
        if (HAVE_al8) {
            uint64_t val_le = cpu_to_le64(val);
            int s2 = pi & 7;
            int s1 = 8 - s2;

            switch (s2) {
            case 1 ... 3:
                val_le = store_whole_le8(pv, s1, val_le);
                store_bytes_leN(pv + s1, s2, val_le);
                break;
            case 5 ... 7:
                val_le = store_bytes_leN(pv, s1, val_le);
                store_whole_le8(pv + s1, s2, val_le);
                break;
            case 0: /* aligned */
            case 4: /* atmax MO_32 */
            default:
                g_assert_not_reached();
            }
            return;
        }
        break;
    case MO_64:
        if (HAVE_al16) {
            store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val)));
            return;
        }
        break;
    default:
        g_assert_not_reached();
    }
    cpu_loop_exit_atomic(env_cpu(env), ra);
}
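
/*
 * In the -MO_32 case above the value is likewise split at a 16-byte
 * boundary: the 4-byte half that stays within the block is covered by
 * an atomic masked insert into its aligned 8-byte word
 * (store_whole_le8), and the leftover bytes, which belong to the half
 * that crosses the boundary and need no atomicity, are written one at
 * a time by store_bytes_leN.
 */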
1058