/*
 * Routines common to user and system emulation of load/store.
 *
 * Copyright (c) 2022 Linaro, Ltd.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#ifdef CONFIG_ATOMIC64
# define HAVE_al8          true
#else
# define HAVE_al8          false
#endif
#define HAVE_al8_fast      (ATOMIC_REG_SIZE >= 8)

/*
 * If __alignof(unsigned __int128) < 16, GCC may refuse to inline atomics
 * that are supported by the host, e.g. s390x.  We can force the pointer to
 * have our known alignment with __builtin_assume_aligned, however prior to
 * GCC 13 that was only reliable with optimization enabled.  See
 *   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107389
 */
#if defined(CONFIG_ATOMIC128_OPT)
# if !defined(__OPTIMIZE__)
#  define ATTRIBUTE_ATOMIC128_OPT  __attribute__((optimize("O1")))
# endif
# define CONFIG_ATOMIC128
#endif
#ifndef ATTRIBUTE_ATOMIC128_OPT
# define ATTRIBUTE_ATOMIC128_OPT
#endif

#if defined(CONFIG_ATOMIC128)
# define HAVE_al16_fast    true
#else
# define HAVE_al16_fast    false
#endif
#if defined(CONFIG_ATOMIC128) || defined(CONFIG_CMPXCHG128)
# define HAVE_al16         true
#else
# define HAVE_al16         false
#endif


/**
 * required_atomicity:
 *
 * Return the lg2 bytes of atomicity required by @memop for @p.
 * If the operation must be split into two operations to be
 * examined separately for atomicity, return -lg2.
 */
static int required_atomicity(CPUArchState *env, uintptr_t p, MemOp memop)
{
    MemOp atom = memop & MO_ATOM_MASK;
    MemOp size = memop & MO_SIZE;
    MemOp half = size ? size - 1 : 0;
    unsigned tmp;
    int atmax;

    switch (atom) {
    case MO_ATOM_NONE:
        atmax = MO_8;
        break;

    case MO_ATOM_IFALIGN_PAIR:
        size = half;
        /* fall through */

    case MO_ATOM_IFALIGN:
        tmp = (1 << size) - 1;
        atmax = p & tmp ? MO_8 : size;
        break;

    case MO_ATOM_WITHIN16:
        tmp = p & 15;
        atmax = (tmp + (1 << size) <= 16 ? size : MO_8);
        break;

    case MO_ATOM_WITHIN16_PAIR:
        tmp = p & 15;
        if (tmp + (1 << size) <= 16) {
            atmax = size;
        } else if (tmp + (1 << half) == 16) {
            /*
             * The pair exactly straddles the boundary.
             * Both halves are naturally aligned and atomic.
             */
            atmax = half;
        } else {
            /*
             * One of the pair crosses the boundary, and is non-atomic.
             * The other of the pair does not cross, and is atomic.
             */
            atmax = -half;
        }
        break;

    case MO_ATOM_SUBALIGN:
        /*
         * Examine the alignment of p to determine if there are subobjects
         * that must be aligned.  Note that we only really need ctz4() --
         * any more significant bits are discarded by the immediately
         * following comparison.
         */
        tmp = ctz32(p);
        atmax = MIN(size, tmp);
        break;

    default:
        g_assert_not_reached();
    }

    /*
     * Here we have the architectural atomicity of the operation.
     * However, when executing in a serial context, we need no extra
     * host atomicity in order to avoid racing.  This reduction
     * avoids looping with cpu_loop_exit_atomic.
     */
    if (cpu_in_serial_context(env_cpu(env))) {
        return MO_8;
    }
    return atmax;
}

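/*
 * As an illustration of the mapping above (assuming a parallel context,
 * so the cpu_in_serial_context reduction does not apply):
 *
 *   MO_ATOM_IFALIGN | MO_32,        p % 4 == 0   -> MO_32
 *   MO_ATOM_IFALIGN | MO_32,        p % 4 == 2   -> MO_8
 *   MO_ATOM_WITHIN16 | MO_32,       p % 16 == 12 -> MO_32
 *   MO_ATOM_WITHIN16 | MO_32,       p % 16 == 14 -> MO_8
 *   MO_ATOM_WITHIN16_PAIR | MO_64,  p % 16 == 12 -> MO_32  (pair straddles exactly)
 *   MO_ATOM_WITHIN16_PAIR | MO_64,  p % 16 == 10 -> -MO_32 (one half crosses)
 *   MO_ATOM_SUBALIGN | MO_64,       p % 8 == 4   -> MO_32
 */
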
/**
 * load_atomic2:
 * @pv: host address
 *
 * Atomically load 2 aligned bytes from @pv.
 */
static inline uint16_t load_atomic2(void *pv)
{
    uint16_t *p = __builtin_assume_aligned(pv, 2);
    return qatomic_read(p);
}

/**
 * load_atomic4:
 * @pv: host address
 *
 * Atomically load 4 aligned bytes from @pv.
 */
static inline uint32_t load_atomic4(void *pv)
{
    uint32_t *p = __builtin_assume_aligned(pv, 4);
    return qatomic_read(p);
}

/**
 * load_atomic8:
 * @pv: host address
 *
 * Atomically load 8 aligned bytes from @pv.
 */
static inline uint64_t load_atomic8(void *pv)
{
    uint64_t *p = __builtin_assume_aligned(pv, 8);

    qemu_build_assert(HAVE_al8);
    return qatomic_read__nocheck(p);
}

/**
 * load_atomic16:
 * @pv: host address
 *
 * Atomically load 16 aligned bytes from @pv.
 */
static inline Int128 ATTRIBUTE_ATOMIC128_OPT
load_atomic16(void *pv)
{
#ifdef CONFIG_ATOMIC128
    __uint128_t *p = __builtin_assume_aligned(pv, 16);
    Int128Alias r;

    r.u = qatomic_read__nocheck(p);
    return r.s;
#else
    qemu_build_not_reached();
#endif
}

/**
 * load_atomic8_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 *
 * Atomically load 8 aligned bytes from @pv.
 * If this is not possible, longjmp out to restart serially.
 */
static uint64_t load_atomic8_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
{
    if (HAVE_al8) {
        return load_atomic8(pv);
    }

#ifdef CONFIG_USER_ONLY
    /*
     * If the page is not writable, then assume the value is immutable
     * and requires no locking.  This ignores the case of MAP_SHARED with
     * another process, because the fallback start_exclusive solution
     * provides no protection across processes.
     */
    if (!page_check_range(h2g(pv), 8, PAGE_WRITE)) {
        uint64_t *p = __builtin_assume_aligned(pv, 8);
        return *p;
    }
#endif

    /* Ultimate fallback: re-execute in serial context. */
    cpu_loop_exit_atomic(env_cpu(env), ra);
}

/**
 * load_atomic16_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 *
 * Atomically load 16 aligned bytes from @pv.
 * If this is not possible, longjmp out to restart serially.
 */
static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
{
    Int128 *p = __builtin_assume_aligned(pv, 16);

    if (HAVE_al16_fast) {
        return load_atomic16(p);
    }

#ifdef CONFIG_USER_ONLY
    /*
     * We can only use cmpxchg to emulate a load if the page is writable.
     * If the page is not writable, then assume the value is immutable
     * and requires no locking.  This ignores the case of MAP_SHARED with
     * another process, because the fallback start_exclusive solution
     * provides no protection across processes.
     */
    if (!page_check_range(h2g(p), 16, PAGE_WRITE)) {
        return *p;
    }
#endif

    /*
     * In system mode all guest pages are writable, and for user-only
     * we have just checked writability.  Try cmpxchg.
     */
#if defined(CONFIG_CMPXCHG128)
    /* Swap 0 with 0, with the side-effect of returning the old value. */
    {
        Int128Alias r;
        r.u = __sync_val_compare_and_swap_16((__uint128_t *)p, 0, 0);
        return r.s;
    }
#endif

    /* Ultimate fallback: re-execute in serial context. */
    cpu_loop_exit_atomic(env_cpu(env), ra);
}

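/*
 * Note on the cmpxchg-based load above: __sync_val_compare_and_swap_16
 * always returns the previous contents of *p.  If *p happened to equal
 * zero it is rewritten with zero, otherwise the compare fails and memory
 * is left untouched; either way the return value is the current contents,
 * which is all that is needed for an atomic load.
 */
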
/**
 * load_atom_extract_al4x2:
 * @pv: host address
 *
 * Load 4 bytes from @p, from two sequential atomic 4-byte loads.
 */
static uint32_t load_atom_extract_al4x2(void *pv)
{
    uintptr_t pi = (uintptr_t)pv;
    int sh = (pi & 3) * 8;
    uint32_t a, b;

    pv = (void *)(pi & ~3);
    a = load_atomic4(pv);
    b = load_atomic4(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return (a << sh) | (b >> (-sh & 31));
    } else {
        return (a >> sh) | (b << (-sh & 31));
    }
}

/**
 * load_atom_extract_al8x2:
 * @pv: host address
 *
 * Load 8 bytes from @p, from two sequential atomic 8-byte loads.
 */
static uint64_t load_atom_extract_al8x2(void *pv)
{
    uintptr_t pi = (uintptr_t)pv;
    int sh = (pi & 7) * 8;
    uint64_t a, b;

    pv = (void *)(pi & ~7);
    a = load_atomic8(pv);
    b = load_atomic8(pv + 8);

    if (HOST_BIG_ENDIAN) {
        return (a << sh) | (b >> (-sh & 63));
    } else {
        return (a >> sh) | (b << (-sh & 63));
    }
}

/**
 * load_atom_extract_al8_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 * @s: object size in bytes, @s <= 4.
 *
 * Atomically load @s bytes from @p, when p % s != 0, and [p, p+s-1] does
 * not cross an 8-byte boundary.  This means that we can perform an atomic
 * 8-byte load and extract.
 * The value is returned in the low bits of a uint32_t.
 */
static uint32_t load_atom_extract_al8_or_exit(CPUArchState *env, uintptr_t ra,
                                              void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8;

    pv = (void *)(pi & ~7);
    return load_atomic8_or_exit(env, ra, pv) >> shr;
}

/**
 * load_atom_extract_al16_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @p: host address
 * @s: object size in bytes, @s <= 8.
 *
 * Atomically load @s bytes from @p, when p % 16 < 8
 * and p % 16 + s > 8.  I.e. does not cross a 16-byte
 * boundary, but *does* cross an 8-byte boundary.
 * This is the slow version, so we must have eliminated
 * any faster load_atom_extract_al8_or_exit case.
 *
 * If this is not possible, longjmp out to restart serially.
 */
static uint64_t load_atom_extract_al16_or_exit(CPUArchState *env, uintptr_t ra,
                                               void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
    Int128 r;

    /*
     * Note constraints above: p & 8 must be clear.
     * Provoke SIGBUS if possible otherwise.
     */
    pv = (void *)(pi & ~7);
    r = load_atomic16_or_exit(env, ra, pv);

    r = int128_urshift(r, shr);
    return int128_getlo(r);
}

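/*
 * To illustrate the extraction arithmetic above: a 2-byte load at an
 * address with p % 8 == 1 is serviced by the aligned 8-byte load at
 * p - 1.  On a little-endian host the wanted bytes occupy bits [8:23]
 * of that word, so shr == o * 8 == 8; on a big-endian host they occupy
 * bits [40:55], so shr == (8 - s - o) * 8 == 40.  In both cases the
 * result ends up in the low 16 bits, in host byte order.
 */
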
/**
 * load_atom_extract_al16_or_al8:
 * @p: host address
 * @s: object size in bytes, @s <= 8.
 *
 * Load @s bytes from @p, when p % s != 0.  If [p, p+s-1] does not
 * cross a 16-byte boundary then the access must be 16-byte atomic,
 * otherwise the access must be 8-byte atomic.
 */
static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
load_atom_extract_al16_or_al8(void *pv, int s)
{
#if defined(CONFIG_ATOMIC128)
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
    __uint128_t r;

    pv = (void *)(pi & ~7);
    if (pi & 8) {
        uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8);
        uint64_t a = qatomic_read__nocheck(p8);
        uint64_t b = qatomic_read__nocheck(p8 + 1);

        if (HOST_BIG_ENDIAN) {
            r = ((__uint128_t)a << 64) | b;
        } else {
            r = ((__uint128_t)b << 64) | a;
        }
    } else {
        __uint128_t *p16 = __builtin_assume_aligned(pv, 16, 0);
        r = qatomic_read__nocheck(p16);
    }
    return r >> shr;
#else
    qemu_build_not_reached();
#endif
}

/**
 * load_atom_4_by_2:
 * @pv: host address
 *
 * Load 4 bytes from @pv, with two 2-byte atomic loads.
 */
static inline uint32_t load_atom_4_by_2(void *pv)
{
    uint32_t a = load_atomic2(pv);
    uint32_t b = load_atomic2(pv + 2);

    if (HOST_BIG_ENDIAN) {
        return (a << 16) | b;
    } else {
        return (b << 16) | a;
    }
}

/**
 * load_atom_8_by_2:
 * @pv: host address
 *
 * Load 8 bytes from @pv, with four 2-byte atomic loads.
 */
static inline uint64_t load_atom_8_by_2(void *pv)
{
    uint32_t a = load_atom_4_by_2(pv);
    uint32_t b = load_atom_4_by_2(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return ((uint64_t)a << 32) | b;
    } else {
        return ((uint64_t)b << 32) | a;
    }
}

/**
 * load_atom_8_by_4:
 * @pv: host address
 *
 * Load 8 bytes from @pv, with two 4-byte atomic loads.
 */
static inline uint64_t load_atom_8_by_4(void *pv)
{
    uint32_t a = load_atomic4(pv);
    uint32_t b = load_atomic4(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return ((uint64_t)a << 32) | b;
    } else {
        return ((uint64_t)b << 32) | a;
    }
}

/**
 * load_atom_8_by_8_or_4:
 * @pv: host address
 *
 * Load 8 bytes from aligned @pv, with at least 4-byte atomicity.
 */
static inline uint64_t load_atom_8_by_8_or_4(void *pv)
{
    if (HAVE_al8_fast) {
        return load_atomic8(pv);
    } else {
        return load_atom_8_by_4(pv);
    }
}

/**
 * load_atom_2:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 2 bytes from @p, honoring the atomicity of @memop.
 */
static uint16_t load_atom_2(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 1) == 0)) {
        return load_atomic2(pv);
    }
    if (HAVE_al16_fast) {
        return load_atom_extract_al16_or_al8(pv, 2);
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        return lduw_he_p(pv);
    case MO_16:
        /* The only case remaining is MO_ATOM_WITHIN16. */
        if (!HAVE_al8_fast && (pi & 3) == 1) {
            /* Big or little endian, we want the middle two bytes. */
            return load_atomic4(pv - 1) >> 8;
        }
        if ((pi & 15) != 7) {
            return load_atom_extract_al8_or_exit(env, ra, pv, 2);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 2);
    default:
        g_assert_not_reached();
    }
}

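/*
 * A worked example of the "middle two bytes" case in load_atom_2: for a
 * 2-byte load at p % 4 == 1, the aligned word at p - 1 holds the bytes
 * p-1, p, p+1, p+2.  The two wanted bytes occupy bits [8:23] of that word
 * on both big- and little-endian hosts, so shifting right by 8 and
 * truncating to uint16_t returns them in host byte order.
 */
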
/**
 * load_atom_4:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 4 bytes from @p, honoring the atomicity of @memop.
 */
static uint32_t load_atom_4(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 3) == 0)) {
        return load_atomic4(pv);
    }
    if (HAVE_al16_fast) {
        return load_atom_extract_al16_or_al8(pv, 4);
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
    case MO_16:
    case -MO_16:
        /*
         * For MO_ATOM_IFALIGN, this is more atomicity than required,
         * but it's trivially supported on all hosts, better than 4
         * individual byte loads (when the host requires alignment),
         * and overlaps with the MO_ATOM_SUBALIGN case of p % 2 == 0.
         */
        return load_atom_extract_al4x2(pv);
    case MO_32:
        if (!(pi & 4)) {
            return load_atom_extract_al8_or_exit(env, ra, pv, 4);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 4);
    default:
        g_assert_not_reached();
    }
}

/**
 * load_atom_8:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 8 bytes from @p, honoring the atomicity of @memop.
 */
static uint64_t load_atom_8(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    /*
     * If the host does not support 8-byte atomics, wait until we have
     * examined the atomicity parameters below.
     */
    if (HAVE_al8 && likely((pi & 7) == 0)) {
        return load_atomic8(pv);
    }
    if (HAVE_al16_fast) {
        return load_atom_extract_al16_or_al8(pv, 8);
    }

    atmax = required_atomicity(env, pi, memop);
    if (atmax == MO_64) {
        if (!HAVE_al8 && (pi & 7) == 0) {
            return load_atomic8_or_exit(env, ra, pv);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 8);
    }
    if (HAVE_al8_fast) {
        return load_atom_extract_al8x2(pv);
    }
    switch (atmax) {
    case MO_8:
        return ldq_he_p(pv);
    case MO_16:
        return load_atom_8_by_2(pv);
    case MO_32:
        return load_atom_8_by_4(pv);
    case -MO_32:
        if (HAVE_al8) {
            return load_atom_extract_al8x2(pv);
        }
        cpu_loop_exit_atomic(env_cpu(env), ra);
    default:
        g_assert_not_reached();
    }
}

/**
 * load_atom_16:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 16 bytes from @p, honoring the atomicity of @memop.
 */
static Int128 load_atom_16(CPUArchState *env, uintptr_t ra,
                           void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;
    Int128 r;
    uint64_t a, b;

    /*
     * If the host does not support 16-byte atomics, wait until we have
     * examined the atomicity parameters below.
     */
    if (HAVE_al16_fast && likely((pi & 15) == 0)) {
        return load_atomic16(pv);
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        memcpy(&r, pv, 16);
        return r;
    case MO_16:
        a = load_atom_8_by_2(pv);
        b = load_atom_8_by_2(pv + 8);
        break;
    case MO_32:
        a = load_atom_8_by_4(pv);
        b = load_atom_8_by_4(pv + 8);
        break;
    case MO_64:
        if (!HAVE_al8) {
            cpu_loop_exit_atomic(env_cpu(env), ra);
        }
        a = load_atomic8(pv);
        b = load_atomic8(pv + 8);
        break;
    case -MO_64:
        if (!HAVE_al8) {
            cpu_loop_exit_atomic(env_cpu(env), ra);
        }
        a = load_atom_extract_al8x2(pv);
        b = load_atom_extract_al8x2(pv + 8);
        break;
    case MO_128:
        return load_atomic16_or_exit(env, ra, pv);
    default:
        g_assert_not_reached();
    }
    return int128_make128(HOST_BIG_ENDIAN ? b : a, HOST_BIG_ENDIAN ? a : b);
}

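/*
 * Illustration of a negative atmax, as returned by required_atomicity:
 * an 8-byte MO_ATOM_WITHIN16_PAIR load at p % 16 == 10 yields -MO_32.
 * The low half [p, p+3] stays within the 16-byte block and must be
 * atomic; the high half [p+4, p+7] crosses it and need not be.  Two
 * consecutive aligned 8-byte loads (load_atom_extract_al8x2) satisfy
 * this, since the half requiring atomicity is contained in one of them.
 */
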
/**
 * store_atomic2:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 2 aligned bytes to @pv.
 */
static inline void store_atomic2(void *pv, uint16_t val)
{
    uint16_t *p = __builtin_assume_aligned(pv, 2);
    qatomic_set(p, val);
}

/**
 * store_atomic4:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 4 aligned bytes to @pv.
 */
static inline void store_atomic4(void *pv, uint32_t val)
{
    uint32_t *p = __builtin_assume_aligned(pv, 4);
    qatomic_set(p, val);
}

/**
 * store_atomic8:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 8 aligned bytes to @pv.
 */
static inline void store_atomic8(void *pv, uint64_t val)
{
    uint64_t *p = __builtin_assume_aligned(pv, 8);

    qemu_build_assert(HAVE_al8);
    qatomic_set__nocheck(p, val);
}

/**
 * store_atomic16:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 16 aligned bytes to @pv.
 */
static inline void ATTRIBUTE_ATOMIC128_OPT
store_atomic16(void *pv, Int128Alias val)
{
#if defined(CONFIG_ATOMIC128)
    __uint128_t *pu = __builtin_assume_aligned(pv, 16);
    qatomic_set__nocheck(pu, val.u);
#elif defined(CONFIG_CMPXCHG128)
    __uint128_t *pu = __builtin_assume_aligned(pv, 16);
    __uint128_t o;

    /*
     * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always
     * defer to libatomic, so we must use __sync_*_compare_and_swap_16
     * and accept the sequential consistency that comes with it.
     */
    do {
        o = *pu;
    } while (!__sync_bool_compare_and_swap_16(pu, o, val.u));
#else
    qemu_build_not_reached();
#endif
}

/**
 * store_atom_4_by_2:
 * @pv: host address
 * @val: value to store
 *
 * Store 4 bytes to @pv, with two 2-byte atomic stores.
 */
static inline void store_atom_4_by_2(void *pv, uint32_t val)
{
    store_atomic2(pv, val >> (HOST_BIG_ENDIAN ? 16 : 0));
    store_atomic2(pv + 2, val >> (HOST_BIG_ENDIAN ? 0 : 16));
}

/**
 * store_atom_8_by_2:
 * @pv: host address
 * @val: value to store
 *
 * Store 8 bytes to @pv, with four 2-byte atomic stores.
 */
static inline void store_atom_8_by_2(void *pv, uint64_t val)
{
    store_atom_4_by_2(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
    store_atom_4_by_2(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
}

/**
 * store_atom_8_by_4:
 * @pv: host address
 * @val: value to store
 *
 * Store 8 bytes to @pv, with two 4-byte atomic stores.
 */
static inline void store_atom_8_by_4(void *pv, uint64_t val)
{
    store_atomic4(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
    store_atomic4(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
}

/**
 * store_atom_insert_al4:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p, masked by @msk.
 */
static void store_atom_insert_al4(uint32_t *p, uint32_t val, uint32_t msk)
{
    uint32_t old, new;

    p = __builtin_assume_aligned(p, 4);
    old = qatomic_read(p);
    do {
        new = (old & ~msk) | val;
    } while (!__atomic_compare_exchange_n(p, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}

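/*
 * Example of the insert helpers: to store a 16-bit value at an address
 * with pv % 4 == 1, store_atom_2 below calls
 *     store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16));
 * i.e. @val carries the new bytes already shifted into position within
 * the aligned word and @msk selects exactly those bytes, so the
 * compare-and-swap loop rewrites the two middle bytes and preserves the
 * outer two.
 */
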
/**
 * store_atom_insert_al8:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p masked by @msk.
 */
static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
{
    uint64_t old, new;

    qemu_build_assert(HAVE_al8);
    p = __builtin_assume_aligned(p, 8);
    old = qatomic_read__nocheck(p);
    do {
        new = (old & ~msk) | val;
    } while (!__atomic_compare_exchange_n(p, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}

/**
 * store_atom_insert_al16:
 * @ps: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @ps masked by @msk.
 */
static void ATTRIBUTE_ATOMIC128_OPT
store_atom_insert_al16(Int128 *ps, Int128Alias val, Int128Alias msk)
{
#if defined(CONFIG_ATOMIC128)
    __uint128_t *pu, old, new;

    /* With CONFIG_ATOMIC128, we can avoid the memory barriers. */
    pu = __builtin_assume_aligned(ps, 16);
    old = *pu;
    do {
        new = (old & ~msk.u) | val.u;
    } while (!__atomic_compare_exchange_n(pu, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
#elif defined(CONFIG_CMPXCHG128)
    __uint128_t *pu, old, new;

    /*
     * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always
     * defer to libatomic, so we must use __sync_*_compare_and_swap_16
     * and accept the sequential consistency that comes with it.
     */
    pu = __builtin_assume_aligned(ps, 16);
    do {
        old = *pu;
        new = (old & ~msk.u) | val.u;
    } while (!__sync_bool_compare_and_swap_16(pu, old, new));
#else
    qemu_build_not_reached();
#endif
}

/**
 * store_bytes_leN:
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * Store @size bytes at @p.  The bytes to store are extracted in little-endian
 * order from @val_le; return the bytes of @val_le beyond @size that have not
 * been stored.
 */
static uint64_t store_bytes_leN(void *pv, int size, uint64_t val_le)
{
    uint8_t *p = pv;
    for (int i = 0; i < size; i++, val_le >>= 8) {
        p[i] = val_le;
    }
    return val_le;
}

/**
 * store_parts_leN
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically on each aligned part.
 */
G_GNUC_UNUSED
static uint64_t store_parts_leN(void *pv, int size, uint64_t val_le)
{
    do {
        int n;

        /* Find minimum of alignment and size */
        switch (((uintptr_t)pv | size) & 7) {
        case 4:
            store_atomic4(pv, le32_to_cpu(val_le));
            val_le >>= 32;
            n = 4;
            break;
        case 2:
        case 6:
            store_atomic2(pv, le16_to_cpu(val_le));
            val_le >>= 16;
            n = 2;
            break;
        default:
            *(uint8_t *)pv = val_le;
            val_le >>= 8;
            n = 1;
            break;
        case 0:
            g_assert_not_reached();
        }
        pv += n;
        size -= n;
    } while (size != 0);

    return val_le;
}

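/*
 * For illustration of the decomposition above: storing 6 bytes at an
 * address with pv % 8 == 2 first sees (2 | 6) & 7 == 6, so a 2-byte
 * atomic store consumes the two low bytes of @val_le; the remainder is
 * then 4 bytes at pv % 8 == 4, and (4 | 4) & 7 == 4 finishes with one
 * 4-byte atomic store.  Each piece is stored atomically, but the store
 * as a whole is not single-copy atomic.
 */
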
/**
 * store_whole_le4
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * Four aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le4(void *pv, int size, uint64_t val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 3;
    int sh = o * 8;
    uint32_t m = MAKE_64BIT_MASK(0, sz);
    uint32_t v;

    if (HOST_BIG_ENDIAN) {
        v = bswap32(val_le) >> sh;
        m = bswap32(m) >> sh;
    } else {
        v = val_le << sh;
        m <<= sh;
    }
    store_atom_insert_al4(pv - o, v, m);
    return val_le >> sz;
}

/**
 * store_whole_le8
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * Eight aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le8(void *pv, int size, uint64_t val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 7;
    int sh = o * 8;
    uint64_t m = MAKE_64BIT_MASK(0, sz);
    uint64_t v;

    qemu_build_assert(HAVE_al8);
    if (HOST_BIG_ENDIAN) {
        v = bswap64(val_le) >> sh;
        m = bswap64(m) >> sh;
    } else {
        v = val_le << sh;
        m <<= sh;
    }
    store_atom_insert_al8(pv - o, v, m);
    return val_le >> sz;
}

/**
 * store_whole_le16
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * 16 aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le16(void *pv, int size, Int128 val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 15;
    int sh = o * 8;
    Int128 m, v;

    qemu_build_assert(HAVE_al16);

    /* Like MAKE_64BIT_MASK(0, sz), but larger. */
    if (sz <= 64) {
        m = int128_make64(MAKE_64BIT_MASK(0, sz));
    } else {
        m = int128_make128(-1, MAKE_64BIT_MASK(0, sz - 64));
    }

    if (HOST_BIG_ENDIAN) {
        v = int128_urshift(bswap128(val_le), sh);
        m = int128_urshift(bswap128(m), sh);
    } else {
        v = int128_lshift(val_le, sh);
        m = int128_lshift(m, sh);
    }
    store_atom_insert_al16(pv - o, v, m);

    /* Unused if sz <= 64. */
    return int128_gethi(val_le) >> (sz - 64);
}

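/*
 * Worked example for store_whole_le4 on a little-endian host: a 3-byte
 * store at pv % 4 == 1 has sz == 24, o == 1, sh == 8.  The mask becomes
 * 0x00ffffff << 8 == 0xffffff00 and the value is shifted likewise, so the
 * insert rewrites bytes 1..3 of the aligned word at pv - 1 and leaves
 * byte 0 intact.  The return value val_le >> 24 hands back whatever the
 * caller still has to store elsewhere.
 */
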
/**
 * store_atom_2:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 2 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_2(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint16_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 1) == 0)) {
        store_atomic2(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    if (atmax == MO_8) {
        stw_he_p(pv, val);
        return;
    }

    /*
     * The only case remaining is MO_ATOM_WITHIN16.
     * Big or little endian, we want the middle two bytes in each test.
     */
    if ((pi & 3) == 1) {
        store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16));
        return;
    } else if ((pi & 7) == 3) {
        if (HAVE_al8) {
            store_atom_insert_al8(pv - 3, (uint64_t)val << 24, MAKE_64BIT_MASK(24, 16));
            return;
        }
    } else if ((pi & 15) == 7) {
        if (HAVE_al16) {
            Int128 v = int128_lshift(int128_make64(val), 56);
            Int128 m = int128_lshift(int128_make64(0xffff), 56);
            store_atom_insert_al16(pv - 7, v, m);
            return;
        }
    } else {
        g_assert_not_reached();
    }

    cpu_loop_exit_atomic(env_cpu(env), ra);
}

/**
 * store_atom_4:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 4 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_4(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint32_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 3) == 0)) {
        store_atomic4(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        stl_he_p(pv, val);
        return;
    case MO_16:
        store_atom_4_by_2(pv, val);
        return;
    case -MO_16:
        {
            uint32_t val_le = cpu_to_le32(val);
            int s2 = pi & 3;
            int s1 = 4 - s2;

            switch (s2) {
            case 1:
                val_le = store_whole_le4(pv, s1, val_le);
                *(uint8_t *)(pv + 3) = val_le;
                break;
            case 3:
                *(uint8_t *)pv = val_le;
                store_whole_le4(pv + 1, s2, val_le >> 8);
                break;
            case 0: /* aligned */
            case 2: /* atmax MO_16 */
            default:
                g_assert_not_reached();
            }
        }
        return;
    case MO_32:
        if ((pi & 7) < 4) {
            if (HAVE_al8) {
                store_whole_le8(pv, 4, cpu_to_le32(val));
                return;
            }
        } else {
            if (HAVE_al16) {
                store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val)));
                return;
            }
        }
        cpu_loop_exit_atomic(env_cpu(env), ra);
    default:
        g_assert_not_reached();
    }
}

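/*
 * For illustration of the -MO_16 split in store_atom_4: with pi % 4 == 1
 * the low three bytes [p, p+2] lie within one aligned 4-byte word and are
 * stored atomically via store_whole_le4, while the final byte at p + 3
 * starts a new word and may be stored on its own, matching the pair
 * atomicity that required_atomicity promised.
 */
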
/**
 * store_atom_8:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 8 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_8(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint64_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (HAVE_al8 && likely((pi & 7) == 0)) {
        store_atomic8(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        stq_he_p(pv, val);
        return;
    case MO_16:
        store_atom_8_by_2(pv, val);
        return;
    case MO_32:
        store_atom_8_by_4(pv, val);
        return;
    case -MO_32:
        if (HAVE_al8) {
            uint64_t val_le = cpu_to_le64(val);
            int s2 = pi & 7;
            int s1 = 8 - s2;

            switch (s2) {
            case 1 ... 3:
                val_le = store_whole_le8(pv, s1, val_le);
                store_bytes_leN(pv + s1, s2, val_le);
                break;
            case 5 ... 7:
                val_le = store_bytes_leN(pv, s1, val_le);
                store_whole_le8(pv + s1, s2, val_le);
                break;
            case 0: /* aligned */
            case 4: /* atmax MO_32 */
            default:
                g_assert_not_reached();
            }
            return;
        }
        break;
    case MO_64:
        if (HAVE_al16) {
            store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val)));
            return;
        }
        break;
    default:
        g_assert_not_reached();
    }
    cpu_loop_exit_atomic(env_cpu(env), ra);
}

/**
 * store_atom_16:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 16 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_16(CPUArchState *env, uintptr_t ra,
                          void *pv, MemOp memop, Int128 val)
{
    uintptr_t pi = (uintptr_t)pv;
    uint64_t a, b;
    int atmax;

    if (HAVE_al16_fast && likely((pi & 15) == 0)) {
        store_atomic16(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);

    a = HOST_BIG_ENDIAN ? int128_gethi(val) : int128_getlo(val);
    b = HOST_BIG_ENDIAN ? int128_getlo(val) : int128_gethi(val);
    switch (atmax) {
    case MO_8:
        memcpy(pv, &val, 16);
        return;
    case MO_16:
        store_atom_8_by_2(pv, a);
        store_atom_8_by_2(pv + 8, b);
        return;
    case MO_32:
        store_atom_8_by_4(pv, a);
        store_atom_8_by_4(pv + 8, b);
        return;
    case MO_64:
        if (HAVE_al8) {
            store_atomic8(pv, a);
            store_atomic8(pv + 8, b);
            return;
        }
        break;
    case -MO_64:
        if (HAVE_al16) {
            uint64_t val_le;
            int s2 = pi & 15;
            int s1 = 16 - s2;

            if (HOST_BIG_ENDIAN) {
                val = bswap128(val);
            }
            switch (s2) {
            case 1 ... 7:
                val_le = store_whole_le16(pv, s1, val);
                store_bytes_leN(pv + s1, s2, val_le);
                break;
            case 9 ... 15:
                store_bytes_leN(pv, s1, int128_getlo(val));
                val = int128_urshift(val, s1 * 8);
                store_whole_le16(pv + s1, s2, val);
                break;
            case 0: /* aligned */
            case 8: /* atmax MO_64 */
            default:
                g_assert_not_reached();
            }
            return;
        }
        break;
    case MO_128:
        if (HAVE_al16) {
            store_atomic16(pv, val);
            return;
        }
        break;
    default:
        g_assert_not_reached();
    }
    cpu_loop_exit_atomic(env_cpu(env), ra);
}

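/*
 * For illustration of the -MO_64 split in store_atom_16: with pi % 16 == 5
 * the first 11 bytes fit within the aligned 16-byte block and are stored
 * as one atomic masked insert via store_whole_le16, while the remaining
 * 5 bytes cross into the next block and are written bytewise, which is
 * all that the pair atomicity requires for the crossing half.
 */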