
 * SPDX-License-Identifier: GPL-2.0-or-later
 * See the COPYING file in the top-level directory.

#include "host/load-extract-al16-al8.h.inc"
#include "host/store-insert-al16.h.inc"

#define HAVE_al8_fast (ATOMIC_REG_SIZE >= 8)
 * examined separately for atomicity, return -lg2.
static int required_atomicity(CPUState *cpu, uintptr_t p, MemOp memop)
    MemOp half = size ? size - 1 : 0;

    tmp = (1 << size) - 1;

     * One of the pair crosses the boundary, and is non-atomic.
    atmax = -half;

     * that must be aligned. Note that we only really need ctz4() --

    if (cpu_in_serial_context(cpu)) {
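
/*
 * Hedged illustration (not part of the original file): one way a caller
 * could interpret the signed convention documented above, where a
 * positive value is the lg2 of the bytes that must be atomic as a unit
 * and a negative value means the access is examined as two halves of
 * (1 << -lg2) bytes each.  The helper name is hypothetical.
 */
static inline int example_atomic_chunk_bytes(int atmax)
{
    /* For a negative -lg2 result, each half is (1 << lg2) bytes. */
    return 1 << (atmax < 0 ? -atmax : atmax);
}
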
 * Atomically load 8 aligned bytes from @pv.
    uint64_t *p = __builtin_assume_aligned(pv, 8);
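
/*
 * Hedged sketch (standalone, not the QEMU helper itself): on hosts where
 * an 8-byte access is natively single-copy atomic (ATOMIC_REG_SIZE >= 8),
 * the aligned load above can be expressed with the compiler's __atomic
 * builtins.  __builtin_assume_aligned tells the compiler that @pv really
 * is 8-aligned.  <stdint.h> is included here for the types used by this
 * and the later sketches.
 */
#include <stdint.h>

static inline uint64_t example_load_aligned8(const void *pv)
{
    const uint64_t *p = __builtin_assume_aligned(pv, 8);
    /* A relaxed atomic load is enough for single-copy atomicity. */
    return __atomic_load_n(p, __ATOMIC_RELAXED);
}
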
 * @cpu: generic cpu state
 * Atomically load 8 aligned bytes from @pv.
static uint64_t load_atomic8_or_exit(CPUState *cpu, uintptr_t ra, void *pv)

    if (!page_check_range(h2g(pv), 8, PAGE_WRITE_ORG)) {
        uint64_t *p = __builtin_assume_aligned(pv, 8);

    /* Ultimate fallback: re-execute in serial context. */
    cpu_loop_exit_atomic(cpu, ra);
 * @cpu: generic cpu state
static Int128 load_atomic16_or_exit(CPUState *cpu, uintptr_t ra, void *pv)

     * the write is complete -- tests/tcg/multiarch/munmap-pthread.c

    /* Ultimate fallback: re-execute in serial context. */
    cpu_loop_exit_atomic(cpu, ra);
 * Load 4 bytes from @p, from two sequential atomic 4-byte loads.
    int sh = (pi & 3) * 8;

        return (a << sh) | (b >> (-sh & 31));
        return (a >> sh) | (b << (-sh & 31));
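
/*
 * Hedged sketch (standalone): reading a misaligned 4-byte value with two
 * aligned atomic 4-byte loads, as in the fragment above.  The caller must
 * guarantee p % 4 != 0 (otherwise the shift-combine would double-count)
 * and that both aligned words are readable.  Little-endian host shown;
 * a big-endian host swaps the shift directions.
 */
static inline uint32_t example_load_4_from_al4x2(const void *pv)
{
    uintptr_t pi = (uintptr_t)pv;
    const uint32_t *al = (const uint32_t *)(pi & ~(uintptr_t)3);
    int sh = (pi & 3) * 8;                 /* bits below the wanted value */
    uint32_t a = __atomic_load_n(al, __ATOMIC_RELAXED);
    uint32_t b = __atomic_load_n(al + 1, __ATOMIC_RELAXED);

    /* Drop the low bytes of a, splice in the low bytes of b. */
    return (a >> sh) | (b << (-sh & 31));
}
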
 * Load 8 bytes from @p, from two sequential atomic 8-byte loads.
    int sh = (pi & 7) * 8;
    b = load_atomic8(pv + 8);

        return (a << sh) | (b >> (-sh & 63));
        return (a >> sh) | (b << (-sh & 63));
 * @cpu: generic cpu state
 * Atomically load @s bytes from @p, when p % s != 0, and [p, p+s-1] does
 * not cross an 8-byte boundary. This means that we can perform an atomic
 * 8-byte load and extract.
static uint32_t load_atom_extract_al8_or_exit(CPUState *cpu, uintptr_t ra,
    int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8;

    return load_atomic8_or_exit(cpu, ra, pv) >> shr;
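
/*
 * Hedged sketch (standalone): extracting an @s-byte value that lies
 * entirely within one aligned 8-byte word, using a single atomic 8-byte
 * load plus a shift, which is the idea described above.  Little-endian
 * host shown; a big-endian host would shift by (8 - s - o) * 8 instead.
 * Assumes 1 <= s <= 4 and that [p, p+s-1] does not cross the 8-byte
 * boundary.
 */
static inline uint32_t example_extract_from_al8(const void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;                        /* byte offset within the word */
    const uint64_t *al = (const uint64_t *)(pi - o);
    uint64_t word = __atomic_load_n(al, __ATOMIC_RELAXED);

    /* Shift the wanted bytes down, then mask to @s bytes. */
    return (word >> (o * 8)) & (s == 4 ? UINT32_MAX : (1u << (s * 8)) - 1);
}
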
 * @cpu: generic cpu state
 * @s: object size in bytes, @s <= 8.
 * Atomically load @s bytes from @p, when p % 16 < 8
 * and p % 16 + s > 8. I.e. does not cross a 16-byte
 * boundary, but *does* cross an 8-byte boundary.
static uint64_t load_atom_extract_al16_or_exit(CPUState *cpu, uintptr_t ra,
    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;

     * Note constraints above: p & 8 must be clear.
    r = load_atomic16_or_exit(cpu, ra, pv);
 * Load 4 bytes from @pv, with two 2-byte atomic loads.

 * Load 8 bytes from @pv, with four 2-byte atomic loads.

 * Load 8 bytes from @pv, with two 4-byte atomic loads.

 * Load 8 bytes from aligned @pv, with at least 4-byte atomicity.
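
/*
 * Hedged sketch (standalone): when only 2-byte atomicity is required, a
 * 4-byte value can be assembled from two individually atomic 2-byte
 * loads, which is what the one-line descriptions above refer to.  The
 * combine order depends on host endianness; little-endian shown.  @pv
 * must be 2-aligned.
 */
static inline uint32_t example_load_4_by_2(const void *pv)
{
    const uint16_t *p = __builtin_assume_aligned(pv, 2);
    uint32_t lo = __atomic_load_n(p, __ATOMIC_RELAXED);
    uint32_t hi = __atomic_load_n(p + 1, __ATOMIC_RELAXED);

    /* Each half is atomic on its own; the pair as a whole is not. */
    return lo | (hi << 16);
}
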
static uint16_t load_atom_2(CPUState *cpu, uintptr_t ra,

    intptr_t left_in_page = -(pi | TARGET_PAGE_MASK);
    if (likely(left_in_page > 8)) {

    atmax = required_atomicity(cpu, pi, memop);

        return load_atomic4(pv - 1) >> 8;
        return load_atom_extract_al8_or_exit(cpu, ra, pv, 2);
        return load_atom_extract_al16_or_exit(cpu, ra, pv, 2);
static uint32_t load_atom_4(CPUState *cpu, uintptr_t ra,

    intptr_t left_in_page = -(pi | TARGET_PAGE_MASK);
    if (likely(left_in_page > 8)) {

    atmax = required_atomicity(cpu, pi, memop);

    case -MO_16:
        return load_atom_extract_al8_or_exit(cpu, ra, pv, 4);
        return load_atom_extract_al16_or_exit(cpu, ra, pv, 4);
 * Load 8 bytes from @p, honoring the atomicity of @memop.
static uint64_t load_atom_8(CPUState *cpu, uintptr_t ra,

     * If the host does not support 8-byte atomics, wait until we have
        return load_atom_extract_al16_or_al8(pv, 8);

    atmax = required_atomicity(cpu, pi, memop);
        load_atomic8_or_exit(cpu, ra, pv);
        return load_atom_extract_al16_or_exit(cpu, ra, pv, 8);
    case -MO_32:
        cpu_loop_exit_atomic(cpu, ra);
static Int128 load_atom_16(CPUState *cpu, uintptr_t ra,

     * If the host does not support 16-byte atomics, wait until we have

    atmax = required_atomicity(cpu, pi, memop);
        b = load_atom_8_by_2(pv + 8);
        b = load_atom_8_by_4(pv + 8);
        cpu_loop_exit_atomic(cpu, ra);
        b = load_atomic8(pv + 8);
    case -MO_64:
        cpu_loop_exit_atomic(cpu, ra);
        b = load_atom_extract_al8x2(pv + 8);
        return load_atomic16_or_exit(cpu, ra, pv);
 * Atomically store 8 aligned bytes to @pv.
    uint64_t *p = __builtin_assume_aligned(pv, 8);

    p = __builtin_assume_aligned(p, 8);
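
/*
 * Hedged sketch (standalone): the store counterpart of the aligned load
 * sketch earlier, again assuming the host can store 8 bytes with
 * single-copy atomicity.
 */
static inline void example_store_aligned8(void *pv, uint64_t val)
{
    uint64_t *p = __builtin_assume_aligned(pv, 8);
    __atomic_store_n(p, val, __ATOMIC_RELAXED);
}
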
 * Store @size bytes at @p. The bytes to store are extracted in little-endian order
    for (int i = 0; i < size; i++, val_le >>= 8) {

        val_le >>= 8;
        size -= n;
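
/*
 * Hedged sketch (standalone): the byte-at-a-time fallback described
 * above -- peel bytes off the little-endian value from the bottom and
 * return whatever has not been stored yet, so the caller can continue
 * with the remainder.  No atomicity is provided beyond single bytes.
 */
static inline uint64_t example_store_bytes_le(void *pv, int size, uint64_t val_le)
{
    uint8_t *p = pv;

    for (int i = 0; i < size; i++, val_le >>= 8) {
        /* Lowest remaining byte goes to the lowest remaining address. */
        p[i] = val_le;
    }
    return val_le;
}
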
    int sz = size * 8;
    int sh = o * 8;
    store_atom_insert_al4(pv - o, v, m);

    int sz = size * 8;
    int sh = o * 8;
    store_atom_insert_al8(pv - o, v, m);

    int sz = size * 8;
    int sh = o * 8;
        m = int128_make128(-1, MAKE_64BIT_MASK(0, sz - 64));
    store_atom_insert_al16(pv - o, v, m);
    return int128_gethi(val_le) >> (sz - 64);
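
/*
 * Hedged sketch (standalone): the kind of masked-insert primitive that
 * the store_whole_le* fragments above rely on.  Bytes selected by @msk
 * are replaced with the corresponding bytes of @val within one aligned
 * 4-byte word, using a compare-and-swap loop so the whole word updates
 * atomically.  The real QEMU helpers use host-specific code; this is
 * only an illustration with the compiler's __atomic builtins.
 */
static inline void example_insert_al4(void *pv, uint32_t val, uint32_t msk)
{
    uint32_t *p = __builtin_assume_aligned(pv, 4);
    uint32_t old = __atomic_load_n(p, __ATOMIC_RELAXED);
    uint32_t upd;

    do {
        /* Keep bytes outside the mask, substitute those inside it. */
        upd = (old & ~msk) | (val & msk);
    } while (!__atomic_compare_exchange_n(p, &old, upd, 1,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}
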
static void store_atom_2(CPUState *cpu, uintptr_t ra,

    atmax = required_atomicity(cpu, pi, memop);

        store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16));
        store_atom_insert_al8(pv - 3, (uint64_t)val << 24, MAKE_64BIT_MASK(24, 16));
        store_atom_insert_al16(pv - 7, v, m);

    cpu_loop_exit_atomic(cpu, ra);
static void store_atom_4(CPUState *cpu, uintptr_t ra,

    atmax = required_atomicity(cpu, pi, memop);

    case -MO_16:
        int s1 = 4 - s2;
        store_whole_le4(pv + 1, s2, val_le >> 8);

    cpu_loop_exit_atomic(cpu, ra);
 * Store 8 bytes to @p, honoring the atomicity of @memop.
static void store_atom_8(CPUState *cpu, uintptr_t ra,

    atmax = required_atomicity(cpu, pi, memop);

    case -MO_32:
        int s1 = 8 - s2;
        store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val)));

    cpu_loop_exit_atomic(cpu, ra);
static void store_atom_16(CPUState *cpu, uintptr_t ra,

    atmax = required_atomicity(cpu, pi, memop);

        store_atom_8_by_2(pv + 8, b);
        store_atom_8_by_4(pv + 8, b);
        store_atomic8(pv + 8, b);
    case -MO_64:
        int s1 = 16 - s2;
        val = int128_urshift(val, s1 * 8);
        case 8: /* atmax MO_64 */

    cpu_loop_exit_atomic(cpu, ra);