/*
 * Routines common to user and system emulation of load/store.
 *
 * Copyright (c) 2022 Linaro, Ltd.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#ifdef CONFIG_ATOMIC64
# define HAVE_al8          true
#else
# define HAVE_al8          false
#endif
#define HAVE_al8_fast      (ATOMIC_REG_SIZE >= 8)

#if defined(CONFIG_ATOMIC128)
# define HAVE_al16_fast    true
#else
# define HAVE_al16_fast    false
#endif
#if defined(CONFIG_ATOMIC128) || defined(CONFIG_CMPXCHG128)
# define HAVE_al16         true
#else
# define HAVE_al16         false
#endif


/**
 * required_atomicity:
 *
 * Return the lg2 bytes of atomicity required by @memop for @p.
 * If the operation must be split into two operations to be
 * examined separately for atomicity, return -lg2.
 */
static int required_atomicity(CPUArchState *env, uintptr_t p, MemOp memop)
{
    MemOp atom = memop & MO_ATOM_MASK;
    MemOp size = memop & MO_SIZE;
    MemOp half = size ? size - 1 : 0;
    unsigned tmp;
    int atmax;

    switch (atom) {
    case MO_ATOM_NONE:
        atmax = MO_8;
        break;

    case MO_ATOM_IFALIGN_PAIR:
        size = half;
        /* fall through */

    case MO_ATOM_IFALIGN:
        tmp = (1 << size) - 1;
        atmax = p & tmp ? MO_8 : size;
        break;

    case MO_ATOM_WITHIN16:
        tmp = p & 15;
        atmax = (tmp + (1 << size) <= 16 ? size : MO_8);
        break;

    case MO_ATOM_WITHIN16_PAIR:
        tmp = p & 15;
        if (tmp + (1 << size) <= 16) {
            atmax = size;
        } else if (tmp + (1 << half) == 16) {
            /*
             * The pair exactly straddles the boundary.
             * Both halves are naturally aligned and atomic.
             */
            atmax = half;
        } else {
            /*
             * One of the pair crosses the boundary, and is non-atomic.
             * The other of the pair does not cross, and is atomic.
             */
            atmax = -half;
        }
        break;

    case MO_ATOM_SUBALIGN:
        /*
         * Examine the alignment of p to determine if there are subobjects
         * that must be aligned.  Note that we only really need ctz4() --
         * any more significant bits are discarded by the immediately
         * following comparison.
         */
        tmp = ctz32(p);
        atmax = MIN(size, tmp);
        break;

    default:
        g_assert_not_reached();
    }

    /*
     * Here we have the architectural atomicity of the operation.
     * However, when executing in a serial context, we need no extra
     * host atomicity in order to avoid racing.  This reduction
     * avoids looping with cpu_loop_exit_atomic.
     */
    if (cpu_in_serial_context(env_cpu(env))) {
        return MO_8;
    }
    return atmax;
}
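
/*
 * Worked example for the MO_ATOM_WITHIN16_PAIR case above, with an
 * 8-byte access (size == MO_64, half == MO_32):
 *   p % 16 == 8:  the whole access fits within 16 bytes; atmax = MO_64.
 *   p % 16 == 12: the pair exactly straddles the 16-byte boundary and
 *                 both 4-byte halves are aligned; atmax = MO_32.
 *   p % 16 == 11: one 4-byte half crosses the boundary; atmax = -MO_32,
 *                 so the caller must treat the two halves separately.
 */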

/**
 * load_atomic2:
 * @pv: host address
 *
 * Atomically load 2 aligned bytes from @pv.
 */
static inline uint16_t load_atomic2(void *pv)
{
    uint16_t *p = __builtin_assume_aligned(pv, 2);
    return qatomic_read(p);
}

/**
 * load_atomic4:
 * @pv: host address
 *
 * Atomically load 4 aligned bytes from @pv.
 */
static inline uint32_t load_atomic4(void *pv)
{
    uint32_t *p = __builtin_assume_aligned(pv, 4);
    return qatomic_read(p);
}

/**
 * load_atomic8:
 * @pv: host address
 *
 * Atomically load 8 aligned bytes from @pv.
 */
static inline uint64_t load_atomic8(void *pv)
{
    uint64_t *p = __builtin_assume_aligned(pv, 8);

    qemu_build_assert(HAVE_al8);
    return qatomic_read__nocheck(p);
}

/**
 * load_atomic16:
 * @pv: host address
 *
 * Atomically load 16 aligned bytes from @pv.
 */
static inline Int128 load_atomic16(void *pv)
{
#ifdef CONFIG_ATOMIC128
    __uint128_t *p = __builtin_assume_aligned(pv, 16);
    Int128Alias r;

    r.u = qatomic_read__nocheck(p);
    return r.s;
#else
    qemu_build_not_reached();
#endif
}

/**
 * load_atomic8_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 *
 * Atomically load 8 aligned bytes from @pv.
 * If this is not possible, longjmp out to restart serially.
 */
static uint64_t load_atomic8_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
{
    if (HAVE_al8) {
        return load_atomic8(pv);
    }

#ifdef CONFIG_USER_ONLY
    /*
     * If the page is not writable, then assume the value is immutable
     * and requires no locking.  This ignores the case of MAP_SHARED with
     * another process, because the fallback start_exclusive solution
     * provides no protection across processes.
     */
    if (!page_check_range(h2g(pv), 8, PAGE_WRITE)) {
        uint64_t *p = __builtin_assume_aligned(pv, 8);
        return *p;
    }
#endif

    /* Ultimate fallback: re-execute in serial context. */
    cpu_loop_exit_atomic(env_cpu(env), ra);
}

/**
 * load_atomic16_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 *
 * Atomically load 16 aligned bytes from @pv.
 * If this is not possible, longjmp out to restart serially.
 */
static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
{
    Int128 *p = __builtin_assume_aligned(pv, 16);

    if (HAVE_al16_fast) {
        return load_atomic16(p);
    }

#ifdef CONFIG_USER_ONLY
    /*
     * We can only use cmpxchg to emulate a load if the page is writable.
     * If the page is not writable, then assume the value is immutable
     * and requires no locking.  This ignores the case of MAP_SHARED with
     * another process, because the fallback start_exclusive solution
     * provides no protection across processes.
     */
    if (!page_check_range(h2g(p), 16, PAGE_WRITE)) {
        return *p;
    }
#endif

    /*
     * In system mode all guest pages are writable, and for user-only
     * we have just checked writability.  Try cmpxchg.
     */
#if defined(CONFIG_CMPXCHG128)
    /* Swap 0 with 0, with the side-effect of returning the old value. */
    {
        Int128Alias r;
        r.u = __sync_val_compare_and_swap_16((__uint128_t *)p, 0, 0);
        return r.s;
    }
#endif

    /* Ultimate fallback: re-execute in serial context. */
    cpu_loop_exit_atomic(env_cpu(env), ra);
}

/**
 * load_atom_extract_al4x2:
 * @pv: host address
 *
 * Load 4 bytes from @p, from two sequential atomic 4-byte loads.
 */
static uint32_t load_atom_extract_al4x2(void *pv)
{
    uintptr_t pi = (uintptr_t)pv;
    int sh = (pi & 3) * 8;
    uint32_t a, b;

    pv = (void *)(pi & ~3);
    a = load_atomic4(pv);
    b = load_atomic4(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return (a << sh) | (b >> (-sh & 31));
    } else {
        return (a >> sh) | (b << (-sh & 31));
    }
}
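
/*
 * Example for load_atom_extract_al4x2 on a little-endian host: with
 * pv % 4 == 3, sh == 24, so the result is (a >> 24) | (b << 8), i.e. the
 * top byte of the aligned word at pv - 3 combined with the low three bytes
 * of the following word.  Each 4-byte load is individually atomic, which
 * is sufficient when at most 2-byte atomicity is required.
 */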

/**
 * load_atom_extract_al8x2:
 * @pv: host address
 *
 * Load 8 bytes from @p, from two sequential atomic 8-byte loads.
 */
static uint64_t load_atom_extract_al8x2(void *pv)
{
    uintptr_t pi = (uintptr_t)pv;
    int sh = (pi & 7) * 8;
    uint64_t a, b;

    pv = (void *)(pi & ~7);
    a = load_atomic8(pv);
    b = load_atomic8(pv + 8);

    if (HOST_BIG_ENDIAN) {
        return (a << sh) | (b >> (-sh & 63));
    } else {
        return (a >> sh) | (b << (-sh & 63));
    }
}

/**
 * load_atom_extract_al8_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 * @s: object size in bytes, @s <= 4.
 *
 * Atomically load @s bytes from @p, when p % s != 0, and [p, p+s-1] does
 * not cross an 8-byte boundary.  This means that we can perform an atomic
 * 8-byte load and extract.
 * The value is returned in the low bits of a uint32_t.
 */
static uint32_t load_atom_extract_al8_or_exit(CPUArchState *env, uintptr_t ra,
                                              void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8;

    pv = (void *)(pi & ~7);
    return load_atomic8_or_exit(env, ra, pv) >> shr;
}

/**
 * load_atom_extract_al16_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @p: host address
 * @s: object size in bytes, @s <= 8.
 *
 * Atomically load @s bytes from @p, when p % 16 < 8
 * and p % 16 + s > 8.  I.e. does not cross a 16-byte
 * boundary, but *does* cross an 8-byte boundary.
 * This is the slow version, so we must have eliminated
 * any faster load_atom_extract_al8_or_exit case.
 *
 * If this is not possible, longjmp out to restart serially.
 */
static uint64_t load_atom_extract_al16_or_exit(CPUArchState *env, uintptr_t ra,
                                               void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
    Int128 r;

    /*
     * Note constraints above: p & 8 must be clear.
     * Provoke SIGBUS if possible otherwise.
     */
    pv = (void *)(pi & ~7);
    r = load_atomic16_or_exit(env, ra, pv);

    r = int128_urshift(r, shr);
    return int128_getlo(r);
}

/**
 * load_atom_extract_al16_or_al8:
 * @p: host address
 * @s: object size in bytes, @s <= 8.
 *
 * Load @s bytes from @p, when p % s != 0.  If [p, p+s-1] does not
 * cross a 16-byte boundary then the access must be 16-byte atomic,
 * otherwise the access must be 8-byte atomic.
 */
static inline uint64_t load_atom_extract_al16_or_al8(void *pv, int s)
{
#if defined(CONFIG_ATOMIC128)
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
    __uint128_t r;

    pv = (void *)(pi & ~7);
    if (pi & 8) {
        uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8);
        uint64_t a = qatomic_read__nocheck(p8);
        uint64_t b = qatomic_read__nocheck(p8 + 1);

        if (HOST_BIG_ENDIAN) {
            r = ((__uint128_t)a << 64) | b;
        } else {
            r = ((__uint128_t)b << 64) | a;
        }
    } else {
        __uint128_t *p16 = __builtin_assume_aligned(pv, 16, 0);
        r = qatomic_read__nocheck(p16);
    }
    return r >> shr;
#else
    qemu_build_not_reached();
#endif
}
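
/*
 * Example for load_atom_extract_al16_or_al8 with s == 4 on a little-endian
 * host: at pv % 16 == 6 the access stays within one 16-byte block
 * (bytes 6..9), so a single aligned 16-byte atomic load is shifted right
 * by 48 bits; at pv % 16 == 14 the access extends into the next 16-byte
 * block (bytes 14..17), so the upper quadword of the current block and the
 * first quadword of the next are each loaded with 8-byte atomicity and
 * combined instead.
 */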

/**
 * load_atom_4_by_2:
 * @pv: host address
 *
 * Load 4 bytes from @pv, with two 2-byte atomic loads.
 */
static inline uint32_t load_atom_4_by_2(void *pv)
{
    uint32_t a = load_atomic2(pv);
    uint32_t b = load_atomic2(pv + 2);

    if (HOST_BIG_ENDIAN) {
        return (a << 16) | b;
    } else {
        return (b << 16) | a;
    }
}

/**
 * load_atom_8_by_2:
 * @pv: host address
 *
 * Load 8 bytes from @pv, with four 2-byte atomic loads.
 */
static inline uint64_t load_atom_8_by_2(void *pv)
{
    uint32_t a = load_atom_4_by_2(pv);
    uint32_t b = load_atom_4_by_2(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return ((uint64_t)a << 32) | b;
    } else {
        return ((uint64_t)b << 32) | a;
    }
}

/**
 * load_atom_8_by_4:
 * @pv: host address
 *
 * Load 8 bytes from @pv, with two 4-byte atomic loads.
 */
static inline uint64_t load_atom_8_by_4(void *pv)
{
    uint32_t a = load_atomic4(pv);
    uint32_t b = load_atomic4(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return ((uint64_t)a << 32) | b;
    } else {
        return ((uint64_t)b << 32) | a;
    }
}

/**
 * load_atom_2:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 2 bytes from @p, honoring the atomicity of @memop.
 */
static uint16_t load_atom_2(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 1) == 0)) {
        return load_atomic2(pv);
    }
    if (HAVE_al16_fast) {
        return load_atom_extract_al16_or_al8(pv, 2);
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        return lduw_he_p(pv);
    case MO_16:
        /* The only case remaining is MO_ATOM_WITHIN16. */
        if (!HAVE_al8_fast && (pi & 3) == 1) {
            /* Big or little endian, we want the middle two bytes. */
            return load_atomic4(pv - 1) >> 8;
        }
        if ((pi & 15) != 7) {
            return load_atom_extract_al8_or_exit(env, ra, pv, 2);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 2);
    default:
        g_assert_not_reached();
    }
}

/**
 * load_atom_4:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 4 bytes from @p, honoring the atomicity of @memop.
 */
static uint32_t load_atom_4(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 3) == 0)) {
        return load_atomic4(pv);
    }
    if (HAVE_al16_fast) {
        return load_atom_extract_al16_or_al8(pv, 4);
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
    case MO_16:
    case -MO_16:
        /*
         * For MO_ATOM_IFALIGN, this is more atomicity than required,
         * but it's trivially supported on all hosts, better than 4
         * individual byte loads (when the host requires alignment),
         * and overlaps with the MO_ATOM_SUBALIGN case of p % 2 == 0.
         */
        return load_atom_extract_al4x2(pv);
    case MO_32:
        if (!(pi & 4)) {
            return load_atom_extract_al8_or_exit(env, ra, pv, 4);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 4);
    default:
        g_assert_not_reached();
    }
}
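
/*
 * Example of the load_atom_4 dispatch above: a misaligned 4-byte load
 * needing full MO_32 atomicity at pv % 8 == 3 has (pi & 4) == 0, so the
 * bytes fit within one aligned quadword and load_atom_extract_al8_or_exit
 * is used; at pv % 8 == 6 the bytes cross the quadword boundary and the
 * slower load_atom_extract_al16_or_exit is used, which in turn requires
 * that the access not cross a 16-byte boundary.
 */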

/**
 * load_atom_8:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 8 bytes from @p, honoring the atomicity of @memop.
 */
static uint64_t load_atom_8(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    /*
     * If the host does not support 8-byte atomics, wait until we have
     * examined the atomicity parameters below.
     */
    if (HAVE_al8 && likely((pi & 7) == 0)) {
        return load_atomic8(pv);
    }
    if (HAVE_al16_fast) {
        return load_atom_extract_al16_or_al8(pv, 8);
    }

    atmax = required_atomicity(env, pi, memop);
    if (atmax == MO_64) {
        if (!HAVE_al8 && (pi & 7) == 0) {
            return load_atomic8_or_exit(env, ra, pv);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 8);
    }
    if (HAVE_al8_fast) {
        return load_atom_extract_al8x2(pv);
    }
    switch (atmax) {
    case MO_8:
        return ldq_he_p(pv);
    case MO_16:
        return load_atom_8_by_2(pv);
    case MO_32:
        return load_atom_8_by_4(pv);
    case -MO_32:
        if (HAVE_al8) {
            return load_atom_extract_al8x2(pv);
        }
        cpu_loop_exit_atomic(env_cpu(env), ra);
    default:
        g_assert_not_reached();
    }
}

/**
 * store_atomic2:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 2 aligned bytes to @pv.
 */
static inline void store_atomic2(void *pv, uint16_t val)
{
    uint16_t *p = __builtin_assume_aligned(pv, 2);
    qatomic_set(p, val);
}

/**
 * store_atomic4:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 4 aligned bytes to @pv.
 */
static inline void store_atomic4(void *pv, uint32_t val)
{
    uint32_t *p = __builtin_assume_aligned(pv, 4);
    qatomic_set(p, val);
}

/**
 * store_atomic8:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 8 aligned bytes to @pv.
 */
static inline void store_atomic8(void *pv, uint64_t val)
{
    uint64_t *p = __builtin_assume_aligned(pv, 8);

    qemu_build_assert(HAVE_al8);
    qatomic_set__nocheck(p, val);
}

/**
 * store_atom_4_by_2
 */
static inline void store_atom_4_by_2(void *pv, uint32_t val)
{
    store_atomic2(pv, val >> (HOST_BIG_ENDIAN ? 16 : 0));
    store_atomic2(pv + 2, val >> (HOST_BIG_ENDIAN ? 0 : 16));
}

/**
 * store_atom_8_by_2
 */
static inline void store_atom_8_by_2(void *pv, uint64_t val)
{
    store_atom_4_by_2(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
    store_atom_4_by_2(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
}

/**
 * store_atom_8_by_4
 */
static inline void store_atom_8_by_4(void *pv, uint64_t val)
{
    store_atomic4(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
    store_atomic4(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
}

/**
 * store_atom_insert_al4:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p, masked by @msk.
 */
static void store_atom_insert_al4(uint32_t *p, uint32_t val, uint32_t msk)
{
    uint32_t old, new;

    p = __builtin_assume_aligned(p, 4);
    old = qatomic_read(p);
    do {
        new = (old & ~msk) | val;
    } while (!__atomic_compare_exchange_n(p, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}
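
/*
 * store_atom_insert_al4 example: to update only bits 8..23 of an aligned
 * word, pass val already shifted into position (e.g. x << 8) together with
 * msk == MAKE_64BIT_MASK(8, 16).  The compare-and-swap loop rewrites the
 * whole word, so bits outside the mask are preserved even if another
 * thread updates them concurrently.
 */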

/**
 * store_atom_insert_al8:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p masked by @msk.
 */
static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
{
    uint64_t old, new;

    qemu_build_assert(HAVE_al8);
    p = __builtin_assume_aligned(p, 8);
    old = qatomic_read__nocheck(p);
    do {
        new = (old & ~msk) | val;
    } while (!__atomic_compare_exchange_n(p, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}

/**
 * store_atom_insert_al16:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p masked by @msk.
 */
static void store_atom_insert_al16(Int128 *ps, Int128Alias val, Int128Alias msk)
{
#if defined(CONFIG_ATOMIC128)
    __uint128_t *pu, old, new;

    /* With CONFIG_ATOMIC128, we can avoid the memory barriers. */
    pu = __builtin_assume_aligned(ps, 16);
    old = *pu;
    do {
        new = (old & ~msk.u) | val.u;
    } while (!__atomic_compare_exchange_n(pu, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
#elif defined(CONFIG_CMPXCHG128)
    __uint128_t *pu, old, new;

    /*
     * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always
     * defer to libatomic, so we must use __sync_*_compare_and_swap_16
     * and accept the sequential consistency that comes with it.
     */
    pu = __builtin_assume_aligned(ps, 16);
    do {
        old = *pu;
        new = (old & ~msk.u) | val.u;
    } while (!__sync_bool_compare_and_swap_16(pu, old, new));
#else
    qemu_build_not_reached();
#endif
}

/**
 * store_bytes_leN:
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * Store @size bytes at @p.  The bytes to store are extracted in little-endian order
 * from @val_le; return the bytes of @val_le beyond @size that have not been stored.
 */
static uint64_t store_bytes_leN(void *pv, int size, uint64_t val_le)
{
    uint8_t *p = pv;
    for (int i = 0; i < size; i++, val_le >>= 8) {
        p[i] = val_le;
    }
    return val_le;
}

/**
 * store_parts_leN
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically on each aligned part.
 */
G_GNUC_UNUSED
static uint64_t store_parts_leN(void *pv, int size, uint64_t val_le)
{
    do {
        int n;

        /* Find minimum of alignment and size */
        switch (((uintptr_t)pv | size) & 7) {
        case 4:
            store_atomic4(pv, le32_to_cpu(val_le));
            val_le >>= 32;
            n = 4;
            break;
        case 2:
        case 6:
            store_atomic2(pv, le16_to_cpu(val_le));
            val_le >>= 16;
            n = 2;
            break;
        default:
            *(uint8_t *)pv = val_le;
            val_le >>= 8;
            n = 1;
            break;
        case 0:
            g_assert_not_reached();
        }
        pv += n;
        size -= n;
    } while (size != 0);

    return val_le;
}
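
/*
 * store_parts_leN example: storing 7 bytes at pv % 8 == 1 proceeds as
 * 1 + 2 + 4 -- ((pv | size) & 7) selects a byte store first, then a 2-byte
 * store at the now 2-aligned address, then a 4-byte store at the 4-aligned
 * address.  Each part is individually atomic, and the bytes are consumed
 * from the low end of val_le in ascending address order.
 */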

/**
 * store_whole_le4
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * Four aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le4(void *pv, int size, uint64_t val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 3;
    int sh = o * 8;
    uint32_t m = MAKE_64BIT_MASK(0, sz);
    uint32_t v;

    if (HOST_BIG_ENDIAN) {
        v = bswap32(val_le) >> sh;
        m = bswap32(m) >> sh;
    } else {
        v = val_le << sh;
        m <<= sh;
    }
    store_atom_insert_al4(pv - o, v, m);
    return val_le >> sz;
}

/**
 * store_whole_le8
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * Eight aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le8(void *pv, int size, uint64_t val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 7;
    int sh = o * 8;
    uint64_t m = MAKE_64BIT_MASK(0, sz);
    uint64_t v;

    qemu_build_assert(HAVE_al8);
    if (HOST_BIG_ENDIAN) {
        v = bswap64(val_le) >> sh;
        m = bswap64(m) >> sh;
    } else {
        v = val_le << sh;
        m <<= sh;
    }
    store_atom_insert_al8(pv - o, v, m);
    return val_le >> sz;
}

/**
 * store_whole_le16
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * 16 aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le16(void *pv, int size, Int128 val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 15;
    int sh = o * 8;
    Int128 m, v;

    qemu_build_assert(HAVE_al16);

    /* Like MAKE_64BIT_MASK(0, sz), but larger. */
    if (sz <= 64) {
        m = int128_make64(MAKE_64BIT_MASK(0, sz));
    } else {
        m = int128_make128(-1, MAKE_64BIT_MASK(0, sz - 64));
    }

    if (HOST_BIG_ENDIAN) {
        v = int128_urshift(bswap128(val_le), sh);
        m = int128_urshift(bswap128(m), sh);
    } else {
        v = int128_lshift(val_le, sh);
        m = int128_lshift(m, sh);
    }
    store_atom_insert_al16(pv - o, v, m);

    /* Unused if sz <= 64. */
    return int128_gethi(val_le) >> (sz - 64);
}
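
/*
 * store_whole_le4 example on a little-endian host: storing size == 3 bytes
 * at pv % 4 == 1 gives sz == 24, o == 1, sh == 8; the 24-bit mask and value
 * are shifted up by one byte and inserted atomically into the aligned word
 * at pv - 1, leaving that word's first byte untouched.  The return value,
 * val_le >> 24, is the remainder left for the caller to store elsewhere.
 */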

/**
 * store_atom_2:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 2 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_2(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint16_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 1) == 0)) {
        store_atomic2(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    if (atmax == MO_8) {
        stw_he_p(pv, val);
        return;
    }

    /*
     * The only case remaining is MO_ATOM_WITHIN16.
     * Big or little endian, we want the middle two bytes in each test.
     */
    if ((pi & 3) == 1) {
        store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16));
        return;
    } else if ((pi & 7) == 3) {
        if (HAVE_al8) {
            store_atom_insert_al8(pv - 3, (uint64_t)val << 24, MAKE_64BIT_MASK(24, 16));
            return;
        }
    } else if ((pi & 15) == 7) {
        if (HAVE_al16) {
            Int128 v = int128_lshift(int128_make64(val), 56);
            Int128 m = int128_lshift(int128_make64(0xffff), 56);
            store_atom_insert_al16(pv - 7, v, m);
            return;
        }
    } else {
        g_assert_not_reached();
    }

    cpu_loop_exit_atomic(env_cpu(env), ra);
}

/**
 * store_atom_4:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 4 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_4(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint32_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 3) == 0)) {
        store_atomic4(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        stl_he_p(pv, val);
        return;
    case MO_16:
        store_atom_4_by_2(pv, val);
        return;
    case -MO_16:
        {
            uint32_t val_le = cpu_to_le32(val);
            int s2 = pi & 3;
            int s1 = 4 - s2;

            switch (s2) {
            case 1:
                val_le = store_whole_le4(pv, s1, val_le);
                *(uint8_t *)(pv + 3) = val_le;
                break;
            case 3:
                *(uint8_t *)pv = val_le;
                store_whole_le4(pv + 1, s2, val_le >> 8);
                break;
            case 0: /* aligned */
            case 2: /* atmax MO_16 */
            default:
                g_assert_not_reached();
            }
        }
        return;
    case MO_32:
        if ((pi & 7) < 4) {
            if (HAVE_al8) {
                store_whole_le8(pv, 4, cpu_to_le32(val));
                return;
            }
        } else {
            if (HAVE_al16) {
                store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val)));
                return;
            }
        }
        cpu_loop_exit_atomic(env_cpu(env), ra);
    default:
        g_assert_not_reached();
    }
}

/**
 * store_atom_8:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 8 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_8(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint64_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (HAVE_al8 && likely((pi & 7) == 0)) {
        store_atomic8(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        stq_he_p(pv, val);
        return;
    case MO_16:
        store_atom_8_by_2(pv, val);
        return;
    case MO_32:
        store_atom_8_by_4(pv, val);
        return;
    case -MO_32:
        if (HAVE_al8) {
            uint64_t val_le = cpu_to_le64(val);
            int s2 = pi & 7;
            int s1 = 8 - s2;

            switch (s2) {
            case 1 ... 3:
                val_le = store_whole_le8(pv, s1, val_le);
                store_bytes_leN(pv + s1, s2, val_le);
                break;
            case 5 ... 7:
                val_le = store_bytes_leN(pv, s1, val_le);
                store_whole_le8(pv + s1, s2, val_le);
                break;
            case 0: /* aligned */
            case 4: /* atmax MO_32 */
            default:
                g_assert_not_reached();
            }
            return;
        }
        break;
    case MO_64:
        if (HAVE_al16) {
            store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val)));
            return;
        }
        break;
    default:
        g_assert_not_reached();
    }
    cpu_loop_exit_atomic(env_cpu(env), ra);
}
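
/*
 * store_atom_8 example for the -MO_32 case with pv % 8 == 3 (s2 == 3,
 * s1 == 5): the first five bytes complete the aligned quadword containing
 * pv and are inserted with one atomic store_whole_le8; the remaining three
 * bytes begin the next quadword and are written byte by byte, which is
 * acceptable because the half of the pair that crosses the 16-byte
 * boundary need not be atomic.
 */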