/*
 * Routines common to user and system emulation of load/store.
 *
 * Copyright (c) 2022 Linaro, Ltd.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#ifdef CONFIG_ATOMIC64
# define HAVE_al8 true
#else
# define HAVE_al8 false
#endif
#define HAVE_al8_fast (ATOMIC_REG_SIZE >= 8)

/**
 * required_atomicity:
 *
 * Return the lg2 bytes of atomicity required by @memop for @p.
 * If the operation must be split into two operations to be
 * examined separately for atomicity, return -lg2.
 */
static int required_atomicity(CPUArchState *env, uintptr_t p, MemOp memop)
{
    MemOp atom = memop & MO_ATOM_MASK;
    MemOp size = memop & MO_SIZE;
    MemOp half = size ? size - 1 : 0;
    unsigned tmp;
    int atmax;

    switch (atom) {
    case MO_ATOM_NONE:
        atmax = MO_8;
        break;

    case MO_ATOM_IFALIGN_PAIR:
        size = half;
        /* fall through */

    case MO_ATOM_IFALIGN:
        tmp = (1 << size) - 1;
        atmax = p & tmp ? MO_8 : size;
        break;

    case MO_ATOM_WITHIN16:
        tmp = p & 15;
        atmax = (tmp + (1 << size) <= 16 ? size : MO_8);
        break;

    case MO_ATOM_WITHIN16_PAIR:
        tmp = p & 15;
        if (tmp + (1 << size) <= 16) {
            atmax = size;
        } else if (tmp + (1 << half) == 16) {
            /*
             * The pair exactly straddles the boundary.
             * Both halves are naturally aligned and atomic.
             */
            atmax = half;
        } else {
            /*
             * One of the pair crosses the boundary, and is non-atomic.
             * The other of the pair does not cross, and is atomic.
             */
            atmax = -half;
        }
        break;

    case MO_ATOM_SUBALIGN:
        /*
         * Examine the alignment of p to determine if there are subobjects
         * that must be aligned.  Note that we only really need ctz4() --
         * any more significant bits are discarded by the immediately
         * following comparison.
         */
        tmp = ctz32(p);
        atmax = MIN(size, tmp);
        break;

    default:
        g_assert_not_reached();
    }

    /*
     * Here we have the architectural atomicity of the operation.
     * However, when executing in a serial context, we need no extra
     * host atomicity in order to avoid racing.  This reduction
     * avoids looping with cpu_loop_exit_atomic.
     */
    if (cpu_in_serial_context(env_cpu(env))) {
        return MO_8;
    }
    return atmax;
}

/**
 * load_atomic2:
 * @pv: host address
 *
 * Atomically load 2 aligned bytes from @pv.
 */
static inline uint16_t load_atomic2(void *pv)
{
    uint16_t *p = __builtin_assume_aligned(pv, 2);
    return qatomic_read(p);
}

/**
 * load_atomic4:
 * @pv: host address
 *
 * Atomically load 4 aligned bytes from @pv.
 */
static inline uint32_t load_atomic4(void *pv)
{
    uint32_t *p = __builtin_assume_aligned(pv, 4);
    return qatomic_read(p);
}

/**
 * load_atomic8:
 * @pv: host address
 *
 * Atomically load 8 aligned bytes from @pv.
 */
static inline uint64_t load_atomic8(void *pv)
{
    uint64_t *p = __builtin_assume_aligned(pv, 8);

    qemu_build_assert(HAVE_al8);
    return qatomic_read__nocheck(p);
}

/**
 * load_atomic8_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 *
 * Atomically load 8 aligned bytes from @pv.
 * If this is not possible, longjmp out to restart serially.
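 *
 * Note that this function may not return: when no lock-free 8-byte
 * load is available, cpu_loop_exit_atomic() unwinds back to the main
 * loop so that the access can be replayed in a serial context.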
 */
static uint64_t load_atomic8_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
{
    if (HAVE_al8) {
        return load_atomic8(pv);
    }

#ifdef CONFIG_USER_ONLY
    /*
     * If the page is not writable, then assume the value is immutable
     * and requires no locking.  This ignores the case of MAP_SHARED with
     * another process, because the fallback start_exclusive solution
     * provides no protection across processes.
     */
    if (!page_check_range(h2g(pv), 8, PAGE_WRITE)) {
        uint64_t *p = __builtin_assume_aligned(pv, 8);
        return *p;
    }
#endif

    /* Ultimate fallback: re-execute in serial context. */
    cpu_loop_exit_atomic(env_cpu(env), ra);
}

/**
 * load_atomic16_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 *
 * Atomically load 16 aligned bytes from @pv.
 * If this is not possible, longjmp out to restart serially.
 */
static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
{
    Int128 *p = __builtin_assume_aligned(pv, 16);

    if (HAVE_ATOMIC128_RO) {
        return atomic16_read_ro(p);
    }

#ifdef CONFIG_USER_ONLY
    /*
     * We can only use cmpxchg to emulate a load if the page is writable.
     * If the page is not writable, then assume the value is immutable
     * and requires no locking.  This ignores the case of MAP_SHARED with
     * another process, because the fallback start_exclusive solution
     * provides no protection across processes.
     */
    if (!page_check_range(h2g(p), 16, PAGE_WRITE)) {
        return *p;
    }
#endif

    /*
     * In system mode all guest pages are writable, and for user-only
     * we have just checked writability.  Try cmpxchg.
     */
    if (HAVE_ATOMIC128_RW) {
        return atomic16_read_rw(p);
    }

    /* Ultimate fallback: re-execute in serial context. */
    cpu_loop_exit_atomic(env_cpu(env), ra);
}

/**
 * load_atom_extract_al4x2:
 * @pv: host address
 *
 * Load 4 bytes from @p, from two sequential atomic 4-byte loads.
 */
static uint32_t load_atom_extract_al4x2(void *pv)
{
    uintptr_t pi = (uintptr_t)pv;
    int sh = (pi & 3) * 8;
    uint32_t a, b;

    pv = (void *)(pi & ~3);
    a = load_atomic4(pv);
    b = load_atomic4(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return (a << sh) | (b >> (-sh & 31));
    } else {
        return (a >> sh) | (b << (-sh & 31));
    }
}

/**
 * load_atom_extract_al8x2:
 * @pv: host address
 *
 * Load 8 bytes from @p, from two sequential atomic 8-byte loads.
 */
static uint64_t load_atom_extract_al8x2(void *pv)
{
    uintptr_t pi = (uintptr_t)pv;
    int sh = (pi & 7) * 8;
    uint64_t a, b;

    pv = (void *)(pi & ~7);
    a = load_atomic8(pv);
    b = load_atomic8(pv + 8);

    if (HOST_BIG_ENDIAN) {
        return (a << sh) | (b >> (-sh & 63));
    } else {
        return (a >> sh) | (b << (-sh & 63));
    }
}

/**
 * load_atom_extract_al8_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 * @s: object size in bytes, @s <= 4.
 *
 * Atomically load @s bytes from @p, when p % s != 0, and [p, p+s-1] does
 * not cross an 8-byte boundary.  This means that we can perform an atomic
 * 8-byte load and extract.
 * The value is returned in the low bits of a uint32_t.
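 * The shift applied to the aligned 8-byte load selects the bytes that
 * belong to the object, according to host endianness.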
 */
static uint32_t load_atom_extract_al8_or_exit(CPUArchState *env, uintptr_t ra,
                                              void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8;

    pv = (void *)(pi & ~7);
    return load_atomic8_or_exit(env, ra, pv) >> shr;
}

/**
 * load_atom_extract_al16_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @p: host address
 * @s: object size in bytes, @s <= 8.
 *
 * Atomically load @s bytes from @p, when p % 16 < 8
 * and p % 16 + s > 8.  I.e. does not cross a 16-byte
 * boundary, but *does* cross an 8-byte boundary.
 * This is the slow version, so we must have eliminated
 * any faster load_atom_extract_al8_or_exit case.
 *
 * If this is not possible, longjmp out to restart serially.
 */
static uint64_t load_atom_extract_al16_or_exit(CPUArchState *env, uintptr_t ra,
                                               void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
    Int128 r;

    /*
     * Note constraints above: p & 8 must be clear.
     * Provoke SIGBUS if possible otherwise.
     */
    pv = (void *)(pi & ~7);
    r = load_atomic16_or_exit(env, ra, pv);

    r = int128_urshift(r, shr);
    return int128_getlo(r);
}

/**
 * load_atom_extract_al16_or_al8:
 * @p: host address
 * @s: object size in bytes, @s <= 8.
 *
 * Load @s bytes from @p, when p % s != 0.  If [p, p+s-1] does not
 * cross a 16-byte boundary then the access must be 16-byte atomic,
 * otherwise the access must be 8-byte atomic.
 */
static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
load_atom_extract_al16_or_al8(void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
    Int128 r;

    pv = (void *)(pi & ~7);
    if (pi & 8) {
        uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8);
        uint64_t a = qatomic_read__nocheck(p8);
        uint64_t b = qatomic_read__nocheck(p8 + 1);

        if (HOST_BIG_ENDIAN) {
            r = int128_make128(b, a);
        } else {
            r = int128_make128(a, b);
        }
    } else {
        r = atomic16_read_ro(pv);
    }
    return int128_getlo(int128_urshift(r, shr));
}

/**
 * load_atom_4_by_2:
 * @pv: host address
 *
 * Load 4 bytes from @pv, with two 2-byte atomic loads.
 */
static inline uint32_t load_atom_4_by_2(void *pv)
{
    uint32_t a = load_atomic2(pv);
    uint32_t b = load_atomic2(pv + 2);

    if (HOST_BIG_ENDIAN) {
        return (a << 16) | b;
    } else {
        return (b << 16) | a;
    }
}

/**
 * load_atom_8_by_2:
 * @pv: host address
 *
 * Load 8 bytes from @pv, with four 2-byte atomic loads.
 */
static inline uint64_t load_atom_8_by_2(void *pv)
{
    uint32_t a = load_atom_4_by_2(pv);
    uint32_t b = load_atom_4_by_2(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return ((uint64_t)a << 32) | b;
    } else {
        return ((uint64_t)b << 32) | a;
    }
}

/**
 * load_atom_8_by_4:
 * @pv: host address
 *
 * Load 8 bytes from @pv, with two 4-byte atomic loads.
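 * This provides only 4-byte atomicity, which suffices when the guest
 * access requires no more than MO_32.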
 */
static inline uint64_t load_atom_8_by_4(void *pv)
{
    uint32_t a = load_atomic4(pv);
    uint32_t b = load_atomic4(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return ((uint64_t)a << 32) | b;
    } else {
        return ((uint64_t)b << 32) | a;
    }
}

/**
 * load_atom_8_by_8_or_4:
 * @pv: host address
 *
 * Load 8 bytes from aligned @pv, with at least 4-byte atomicity.
 */
static inline uint64_t load_atom_8_by_8_or_4(void *pv)
{
    if (HAVE_al8_fast) {
        return load_atomic8(pv);
    } else {
        return load_atom_8_by_4(pv);
    }
}

/**
 * load_atom_2:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 2 bytes from @p, honoring the atomicity of @memop.
 */
static uint16_t load_atom_2(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 1) == 0)) {
        return load_atomic2(pv);
    }
    if (HAVE_ATOMIC128_RO) {
        return load_atom_extract_al16_or_al8(pv, 2);
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        return lduw_he_p(pv);
    case MO_16:
        /* The only case remaining is MO_ATOM_WITHIN16. */
        if (!HAVE_al8_fast && (pi & 3) == 1) {
            /* Big or little endian, we want the middle two bytes. */
            return load_atomic4(pv - 1) >> 8;
        }
        if ((pi & 15) != 7) {
            return load_atom_extract_al8_or_exit(env, ra, pv, 2);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 2);
    default:
        g_assert_not_reached();
    }
}

/**
 * load_atom_4:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 4 bytes from @p, honoring the atomicity of @memop.
 */
static uint32_t load_atom_4(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 3) == 0)) {
        return load_atomic4(pv);
    }
    if (HAVE_ATOMIC128_RO) {
        return load_atom_extract_al16_or_al8(pv, 4);
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
    case MO_16:
    case -MO_16:
        /*
         * For MO_ATOM_IFALIGN, this is more atomicity than required,
         * but it's trivially supported on all hosts, better than 4
         * individual byte loads (when the host requires alignment),
         * and overlaps with the MO_ATOM_SUBALIGN case of p % 2 == 0.
         */
        return load_atom_extract_al4x2(pv);
    case MO_32:
        if (!(pi & 4)) {
            return load_atom_extract_al8_or_exit(env, ra, pv, 4);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 4);
    default:
        g_assert_not_reached();
    }
}

/**
 * load_atom_8:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 8 bytes from @p, honoring the atomicity of @memop.
 */
static uint64_t load_atom_8(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    /*
     * If the host does not support 8-byte atomics, wait until we have
     * examined the atomicity parameters below.
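     * Likewise for a misaligned pointer: the required atomicity may
     * still be satisfied by a wider, aligned host access.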
     */
    if (HAVE_al8 && likely((pi & 7) == 0)) {
        return load_atomic8(pv);
    }
    if (HAVE_ATOMIC128_RO) {
        return load_atom_extract_al16_or_al8(pv, 8);
    }

    atmax = required_atomicity(env, pi, memop);
    if (atmax == MO_64) {
        if (!HAVE_al8 && (pi & 7) == 0) {
            load_atomic8_or_exit(env, ra, pv);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 8);
    }
    if (HAVE_al8_fast) {
        return load_atom_extract_al8x2(pv);
    }
    switch (atmax) {
    case MO_8:
        return ldq_he_p(pv);
    case MO_16:
        return load_atom_8_by_2(pv);
    case MO_32:
        return load_atom_8_by_4(pv);
    case -MO_32:
        if (HAVE_al8) {
            return load_atom_extract_al8x2(pv);
        }
        cpu_loop_exit_atomic(env_cpu(env), ra);
    default:
        g_assert_not_reached();
    }
}

/**
 * load_atom_16:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 16 bytes from @p, honoring the atomicity of @memop.
 */
static Int128 load_atom_16(CPUArchState *env, uintptr_t ra,
                           void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;
    Int128 r;
    uint64_t a, b;

    /*
     * If the host does not support 16-byte atomics, wait until we have
     * examined the atomicity parameters below.
     */
    if (HAVE_ATOMIC128_RO && likely((pi & 15) == 0)) {
        return atomic16_read_ro(pv);
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        memcpy(&r, pv, 16);
        return r;
    case MO_16:
        a = load_atom_8_by_2(pv);
        b = load_atom_8_by_2(pv + 8);
        break;
    case MO_32:
        a = load_atom_8_by_4(pv);
        b = load_atom_8_by_4(pv + 8);
        break;
    case MO_64:
        if (!HAVE_al8) {
            cpu_loop_exit_atomic(env_cpu(env), ra);
        }
        a = load_atomic8(pv);
        b = load_atomic8(pv + 8);
        break;
    case -MO_64:
        if (!HAVE_al8) {
            cpu_loop_exit_atomic(env_cpu(env), ra);
        }
        a = load_atom_extract_al8x2(pv);
        b = load_atom_extract_al8x2(pv + 8);
        break;
    case MO_128:
        return load_atomic16_or_exit(env, ra, pv);
    default:
        g_assert_not_reached();
    }
    return int128_make128(HOST_BIG_ENDIAN ? b : a, HOST_BIG_ENDIAN ? a : b);
}

/**
 * store_atomic2:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 2 aligned bytes to @pv.
 */
static inline void store_atomic2(void *pv, uint16_t val)
{
    uint16_t *p = __builtin_assume_aligned(pv, 2);
    qatomic_set(p, val);
}

/**
 * store_atomic4:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 4 aligned bytes to @pv.
 */
static inline void store_atomic4(void *pv, uint32_t val)
{
    uint32_t *p = __builtin_assume_aligned(pv, 4);
    qatomic_set(p, val);
}

/**
 * store_atomic8:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 8 aligned bytes to @pv.
 */
static inline void store_atomic8(void *pv, uint64_t val)
{
    uint64_t *p = __builtin_assume_aligned(pv, 8);

    qemu_build_assert(HAVE_al8);
    qatomic_set__nocheck(p, val);
}

/**
 * store_atom_4_by_2
 */
static inline void store_atom_4_by_2(void *pv, uint32_t val)
{
    store_atomic2(pv, val >> (HOST_BIG_ENDIAN ? 16 : 0));
    store_atomic2(pv + 2, val >> (HOST_BIG_ENDIAN ? 0 : 16));
}

/**
 * store_atom_8_by_2
 */
static inline void store_atom_8_by_2(void *pv, uint64_t val)
{
    store_atom_4_by_2(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
    store_atom_4_by_2(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
}

/**
 * store_atom_8_by_4
 */
static inline void store_atom_8_by_4(void *pv, uint64_t val)
{
    store_atomic4(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
    store_atomic4(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
}

/**
 * store_atom_insert_al4:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p, masked by @msk.
 */
static void store_atom_insert_al4(uint32_t *p, uint32_t val, uint32_t msk)
{
    uint32_t old, new;

    p = __builtin_assume_aligned(p, 4);
    old = qatomic_read(p);
    do {
        new = (old & ~msk) | val;
    } while (!__atomic_compare_exchange_n(p, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}

/**
 * store_atom_insert_al8:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p masked by @msk.
 */
static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
{
    uint64_t old, new;

    qemu_build_assert(HAVE_al8);
    p = __builtin_assume_aligned(p, 8);
    old = qatomic_read__nocheck(p);
    do {
        new = (old & ~msk) | val;
    } while (!__atomic_compare_exchange_n(p, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}

/**
 * store_atom_insert_al16:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p masked by @msk.
 */
static void ATTRIBUTE_ATOMIC128_OPT
store_atom_insert_al16(Int128 *ps, Int128Alias val, Int128Alias msk)
{
#if defined(CONFIG_ATOMIC128)
    __uint128_t *pu, old, new;

    /* With CONFIG_ATOMIC128, we can avoid the memory barriers. */
    pu = __builtin_assume_aligned(ps, 16);
    old = *pu;
    do {
        new = (old & ~msk.u) | val.u;
    } while (!__atomic_compare_exchange_n(pu, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
#elif defined(CONFIG_CMPXCHG128)
    __uint128_t *pu, old, new;

    /*
     * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always
     * defer to libatomic, so we must use __sync_*_compare_and_swap_16
     * and accept the sequential consistency that comes with it.
     */
    pu = __builtin_assume_aligned(ps, 16);
    do {
        old = *pu;
        new = (old & ~msk.u) | val.u;
    } while (!__sync_bool_compare_and_swap_16(pu, old, new));
#else
    qemu_build_not_reached();
#endif
}

/**
 * store_bytes_leN:
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * Store @size bytes at @p.  The bytes to store are extracted in little-endian order
 * from @val_le; return the bytes of @val_le beyond @size that have not been stored.
 */
static uint64_t store_bytes_leN(void *pv, int size, uint64_t val_le)
{
    uint8_t *p = pv;
    for (int i = 0; i < size; i++, val_le >>= 8) {
        p[i] = val_le;
    }
    return val_le;
}

/**
 * store_parts_leN
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically on each aligned part.
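 * Each iteration stores the largest piece that is naturally aligned
 * and still fits within the remaining @size.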
 */
G_GNUC_UNUSED
static uint64_t store_parts_leN(void *pv, int size, uint64_t val_le)
{
    do {
        int n;

        /* Find minimum of alignment and size */
        switch (((uintptr_t)pv | size) & 7) {
        case 4:
            store_atomic4(pv, le32_to_cpu(val_le));
            val_le >>= 32;
            n = 4;
            break;
        case 2:
        case 6:
            store_atomic2(pv, le16_to_cpu(val_le));
            val_le >>= 16;
            n = 2;
            break;
        default:
            *(uint8_t *)pv = val_le;
            val_le >>= 8;
            n = 1;
            break;
        case 0:
            g_assert_not_reached();
        }
        pv += n;
        size -= n;
    } while (size != 0);

    return val_le;
}

/**
 * store_whole_le4
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * Four aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le4(void *pv, int size, uint64_t val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 3;
    int sh = o * 8;
    uint32_t m = MAKE_64BIT_MASK(0, sz);
    uint32_t v;

    if (HOST_BIG_ENDIAN) {
        v = bswap32(val_le) >> sh;
        m = bswap32(m) >> sh;
    } else {
        v = val_le << sh;
        m <<= sh;
    }
    store_atom_insert_al4(pv - o, v, m);
    return val_le >> sz;
}

/**
 * store_whole_le8
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * Eight aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le8(void *pv, int size, uint64_t val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 7;
    int sh = o * 8;
    uint64_t m = MAKE_64BIT_MASK(0, sz);
    uint64_t v;

    qemu_build_assert(HAVE_al8);
    if (HOST_BIG_ENDIAN) {
        v = bswap64(val_le) >> sh;
        m = bswap64(m) >> sh;
    } else {
        v = val_le << sh;
        m <<= sh;
    }
    store_atom_insert_al8(pv - o, v, m);
    return val_le >> sz;
}

/**
 * store_whole_le16
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * 16 aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le16(void *pv, int size, Int128 val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 15;
    int sh = o * 8;
    Int128 m, v;

    qemu_build_assert(HAVE_ATOMIC128_RW);

    /* Like MAKE_64BIT_MASK(0, sz), but larger. */
    if (sz <= 64) {
        m = int128_make64(MAKE_64BIT_MASK(0, sz));
    } else {
        m = int128_make128(-1, MAKE_64BIT_MASK(0, sz - 64));
    }

    if (HOST_BIG_ENDIAN) {
        v = int128_urshift(bswap128(val_le), sh);
        m = int128_urshift(bswap128(m), sh);
    } else {
        v = int128_lshift(val_le, sh);
        m = int128_lshift(m, sh);
    }
    store_atom_insert_al16(pv - o, v, m);

    /* Unused if sz <= 64. */
    return int128_gethi(val_le) >> (sz - 64);
}

/**
 * store_atom_2:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 2 bytes to @p, honoring the atomicity of @memop.
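 * A misaligned store that must remain atomic is implemented by
 * inserting the two bytes into a wider aligned word with a
 * compare-and-swap; see store_atom_insert_al4/al8/al16.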
 */
static void store_atom_2(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint16_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 1) == 0)) {
        store_atomic2(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    if (atmax == MO_8) {
        stw_he_p(pv, val);
        return;
    }

    /*
     * The only case remaining is MO_ATOM_WITHIN16.
     * Big or little endian, we want the middle two bytes in each test.
     */
    if ((pi & 3) == 1) {
        store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16));
        return;
    } else if ((pi & 7) == 3) {
        if (HAVE_al8) {
            store_atom_insert_al8(pv - 3, (uint64_t)val << 24, MAKE_64BIT_MASK(24, 16));
            return;
        }
    } else if ((pi & 15) == 7) {
        if (HAVE_ATOMIC128_RW) {
            Int128 v = int128_lshift(int128_make64(val), 56);
            Int128 m = int128_lshift(int128_make64(0xffff), 56);
            store_atom_insert_al16(pv - 7, v, m);
            return;
        }
    } else {
        g_assert_not_reached();
    }

    cpu_loop_exit_atomic(env_cpu(env), ra);
}

/**
 * store_atom_4:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 4 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_4(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint32_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 3) == 0)) {
        store_atomic4(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        stl_he_p(pv, val);
        return;
    case MO_16:
        store_atom_4_by_2(pv, val);
        return;
    case -MO_16:
        {
            uint32_t val_le = cpu_to_le32(val);
            int s2 = pi & 3;
            int s1 = 4 - s2;

            switch (s2) {
            case 1:
                val_le = store_whole_le4(pv, s1, val_le);
                *(uint8_t *)(pv + 3) = val_le;
                break;
            case 3:
                *(uint8_t *)pv = val_le;
                store_whole_le4(pv + 1, s2, val_le >> 8);
                break;
            case 0: /* aligned */
            case 2: /* atmax MO_16 */
            default:
                g_assert_not_reached();
            }
        }
        return;
    case MO_32:
        if ((pi & 7) < 4) {
            if (HAVE_al8) {
                store_whole_le8(pv, 4, cpu_to_le32(val));
                return;
            }
        } else {
            if (HAVE_ATOMIC128_RW) {
                store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val)));
                return;
            }
        }
        cpu_loop_exit_atomic(env_cpu(env), ra);
    default:
        g_assert_not_reached();
    }
}

/**
 * store_atom_8:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 8 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_8(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint64_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (HAVE_al8 && likely((pi & 7) == 0)) {
        store_atomic8(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        stq_he_p(pv, val);
        return;
    case MO_16:
        store_atom_8_by_2(pv, val);
        return;
    case MO_32:
        store_atom_8_by_4(pv, val);
        return;
    case -MO_32:
        if (HAVE_al8) {
            uint64_t val_le = cpu_to_le64(val);
            int s2 = pi & 7;
            int s1 = 8 - s2;

            switch (s2) {
            case 1 ... 3:
                val_le = store_whole_le8(pv, s1, val_le);
                store_bytes_leN(pv + s1, s2, val_le);
                break;
            case 5 ... 7:
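                /*
                 * Store the leading misaligned bytes one at a time, then
                 * merge the trailing bytes into their aligned 8-byte word
                 * with a single atomic insert.
                 */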
                val_le = store_bytes_leN(pv, s1, val_le);
                store_whole_le8(pv + s1, s2, val_le);
                break;
            case 0: /* aligned */
            case 4: /* atmax MO_32 */
            default:
                g_assert_not_reached();
            }
            return;
        }
        break;
    case MO_64:
        if (HAVE_ATOMIC128_RW) {
            store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val)));
            return;
        }
        break;
    default:
        g_assert_not_reached();
    }
    cpu_loop_exit_atomic(env_cpu(env), ra);
}

/**
 * store_atom_16:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 16 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_16(CPUArchState *env, uintptr_t ra,
                          void *pv, MemOp memop, Int128 val)
{
    uintptr_t pi = (uintptr_t)pv;
    uint64_t a, b;
    int atmax;

    if (HAVE_ATOMIC128_RW && likely((pi & 15) == 0)) {
        atomic16_set(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);

    a = HOST_BIG_ENDIAN ? int128_gethi(val) : int128_getlo(val);
    b = HOST_BIG_ENDIAN ? int128_getlo(val) : int128_gethi(val);
    switch (atmax) {
    case MO_8:
        memcpy(pv, &val, 16);
        return;
    case MO_16:
        store_atom_8_by_2(pv, a);
        store_atom_8_by_2(pv + 8, b);
        return;
    case MO_32:
        store_atom_8_by_4(pv, a);
        store_atom_8_by_4(pv + 8, b);
        return;
    case MO_64:
        if (HAVE_al8) {
            store_atomic8(pv, a);
            store_atomic8(pv + 8, b);
            return;
        }
        break;
    case -MO_64:
        if (HAVE_ATOMIC128_RW) {
            uint64_t val_le;
            int s2 = pi & 15;
            int s1 = 16 - s2;

            if (HOST_BIG_ENDIAN) {
                val = bswap128(val);
            }
            switch (s2) {
            case 1 ... 7:
                val_le = store_whole_le16(pv, s1, val);
                store_bytes_leN(pv + s1, s2, val_le);
                break;
            case 9 ... 15:
                store_bytes_leN(pv, s1, int128_getlo(val));
                val = int128_urshift(val, s1 * 8);
                store_whole_le16(pv + s1, s2, val);
                break;
            case 0: /* aligned */
            case 8: /* atmax MO_64 */
            default:
                g_assert_not_reached();
            }
            return;
        }
        break;
    case MO_128:
        if (HAVE_ATOMIC128_RW) {
            atomic16_set(pv, val);
            return;
        }
        break;
    default:
        g_assert_not_reached();
    }
    cpu_loop_exit_atomic(env_cpu(env), ra);
}