1/* 2 * Routines common to user and system emulation of load/store. 3 * 4 * Copyright (c) 2022 Linaro, Ltd. 5 * 6 * SPDX-License-Identifier: GPL-2.0-or-later 7 * 8 * This work is licensed under the terms of the GNU GPL, version 2 or later. 9 * See the COPYING file in the top-level directory. 10 */ 11 12#include "host/load-extract-al16-al8.h" 13#include "host/store-insert-al16.h" 14 15#ifdef CONFIG_ATOMIC64 16# define HAVE_al8 true 17#else 18# define HAVE_al8 false 19#endif 20#define HAVE_al8_fast (ATOMIC_REG_SIZE >= 8) 21 22/** 23 * required_atomicity: 24 * 25 * Return the lg2 bytes of atomicity required by @memop for @p. 26 * If the operation must be split into two operations to be 27 * examined separately for atomicity, return -lg2. 28 */ 29static int required_atomicity(CPUArchState *env, uintptr_t p, MemOp memop) 30{ 31 MemOp atom = memop & MO_ATOM_MASK; 32 MemOp size = memop & MO_SIZE; 33 MemOp half = size ? size - 1 : 0; 34 unsigned tmp; 35 int atmax; 36 37 switch (atom) { 38 case MO_ATOM_NONE: 39 atmax = MO_8; 40 break; 41 42 case MO_ATOM_IFALIGN_PAIR: 43 size = half; 44 /* fall through */ 45 46 case MO_ATOM_IFALIGN: 47 tmp = (1 << size) - 1; 48 atmax = p & tmp ? MO_8 : size; 49 break; 50 51 case MO_ATOM_WITHIN16: 52 tmp = p & 15; 53 atmax = (tmp + (1 << size) <= 16 ? size : MO_8); 54 break; 55 56 case MO_ATOM_WITHIN16_PAIR: 57 tmp = p & 15; 58 if (tmp + (1 << size) <= 16) { 59 atmax = size; 60 } else if (tmp + (1 << half) == 16) { 61 /* 62 * The pair exactly straddles the boundary. 63 * Both halves are naturally aligned and atomic. 64 */ 65 atmax = half; 66 } else { 67 /* 68 * One of the pair crosses the boundary, and is non-atomic. 69 * The other of the pair does not cross, and is atomic. 70 */ 71 atmax = -half; 72 } 73 break; 74 75 case MO_ATOM_SUBALIGN: 76 /* 77 * Examine the alignment of p to determine if there are subobjects 78 * that must be aligned. Note that we only really need ctz4() -- 79 * any more sigificant bits are discarded by the immediately 80 * following comparison. 81 */ 82 tmp = ctz32(p); 83 atmax = MIN(size, tmp); 84 break; 85 86 default: 87 g_assert_not_reached(); 88 } 89 90 /* 91 * Here we have the architectural atomicity of the operation. 92 * However, when executing in a serial context, we need no extra 93 * host atomicity in order to avoid racing. This reduction 94 * avoids looping with cpu_loop_exit_atomic. 95 */ 96 if (cpu_in_serial_context(env_cpu(env))) { 97 return MO_8; 98 } 99 return atmax; 100} 101 102/** 103 * load_atomic2: 104 * @pv: host address 105 * 106 * Atomically load 2 aligned bytes from @pv. 107 */ 108static inline uint16_t load_atomic2(void *pv) 109{ 110 uint16_t *p = __builtin_assume_aligned(pv, 2); 111 return qatomic_read(p); 112} 113 114/** 115 * load_atomic4: 116 * @pv: host address 117 * 118 * Atomically load 4 aligned bytes from @pv. 119 */ 120static inline uint32_t load_atomic4(void *pv) 121{ 122 uint32_t *p = __builtin_assume_aligned(pv, 4); 123 return qatomic_read(p); 124} 125 126/** 127 * load_atomic8: 128 * @pv: host address 129 * 130 * Atomically load 8 aligned bytes from @pv. 131 */ 132static inline uint64_t load_atomic8(void *pv) 133{ 134 uint64_t *p = __builtin_assume_aligned(pv, 8); 135 136 qemu_build_assert(HAVE_al8); 137 return qatomic_read__nocheck(p); 138} 139 140/** 141 * load_atomic8_or_exit: 142 * @env: cpu context 143 * @ra: host unwind address 144 * @pv: host address 145 * 146 * Atomically load 8 aligned bytes from @pv. 147 * If this is not possible, longjmp out to restart serially. 148 */ 149static uint64_t load_atomic8_or_exit(CPUArchState *env, uintptr_t ra, void *pv) 150{ 151 if (HAVE_al8) { 152 return load_atomic8(pv); 153 } 154 155#ifdef CONFIG_USER_ONLY 156 /* 157 * If the page is not writable, then assume the value is immutable 158 * and requires no locking. This ignores the case of MAP_SHARED with 159 * another process, because the fallback start_exclusive solution 160 * provides no protection across processes. 161 */ 162 if (!page_check_range(h2g(pv), 8, PAGE_WRITE_ORG)) { 163 uint64_t *p = __builtin_assume_aligned(pv, 8); 164 return *p; 165 } 166#endif 167 168 /* Ultimate fallback: re-execute in serial context. */ 169 cpu_loop_exit_atomic(env_cpu(env), ra); 170} 171 172/** 173 * load_atomic16_or_exit: 174 * @env: cpu context 175 * @ra: host unwind address 176 * @pv: host address 177 * 178 * Atomically load 16 aligned bytes from @pv. 179 * If this is not possible, longjmp out to restart serially. 180 */ 181static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv) 182{ 183 Int128 *p = __builtin_assume_aligned(pv, 16); 184 185 if (HAVE_ATOMIC128_RO) { 186 return atomic16_read_ro(p); 187 } 188 189#ifdef CONFIG_USER_ONLY 190 /* 191 * We can only use cmpxchg to emulate a load if the page is writable. 192 * If the page is not writable, then assume the value is immutable 193 * and requires no locking. This ignores the case of MAP_SHARED with 194 * another process, because the fallback start_exclusive solution 195 * provides no protection across processes. 196 */ 197 if (!page_check_range(h2g(p), 16, PAGE_WRITE_ORG)) { 198 return *p; 199 } 200#endif 201 202 /* 203 * In system mode all guest pages are writable, and for user-only 204 * we have just checked writability. Try cmpxchg. 205 */ 206 if (HAVE_ATOMIC128_RW) { 207 return atomic16_read_rw(p); 208 } 209 210 /* Ultimate fallback: re-execute in serial context. */ 211 cpu_loop_exit_atomic(env_cpu(env), ra); 212} 213 214/** 215 * load_atom_extract_al4x2: 216 * @pv: host address 217 * 218 * Load 4 bytes from @p, from two sequential atomic 4-byte loads. 219 */ 220static uint32_t load_atom_extract_al4x2(void *pv) 221{ 222 uintptr_t pi = (uintptr_t)pv; 223 int sh = (pi & 3) * 8; 224 uint32_t a, b; 225 226 pv = (void *)(pi & ~3); 227 a = load_atomic4(pv); 228 b = load_atomic4(pv + 4); 229 230 if (HOST_BIG_ENDIAN) { 231 return (a << sh) | (b >> (-sh & 31)); 232 } else { 233 return (a >> sh) | (b << (-sh & 31)); 234 } 235} 236 237/** 238 * load_atom_extract_al8x2: 239 * @pv: host address 240 * 241 * Load 8 bytes from @p, from two sequential atomic 8-byte loads. 242 */ 243static uint64_t load_atom_extract_al8x2(void *pv) 244{ 245 uintptr_t pi = (uintptr_t)pv; 246 int sh = (pi & 7) * 8; 247 uint64_t a, b; 248 249 pv = (void *)(pi & ~7); 250 a = load_atomic8(pv); 251 b = load_atomic8(pv + 8); 252 253 if (HOST_BIG_ENDIAN) { 254 return (a << sh) | (b >> (-sh & 63)); 255 } else { 256 return (a >> sh) | (b << (-sh & 63)); 257 } 258} 259 260/** 261 * load_atom_extract_al8_or_exit: 262 * @env: cpu context 263 * @ra: host unwind address 264 * @pv: host address 265 * @s: object size in bytes, @s <= 4. 266 * 267 * Atomically load @s bytes from @p, when p % s != 0, and [p, p+s-1] does 268 * not cross an 8-byte boundary. This means that we can perform an atomic 269 * 8-byte load and extract. 270 * The value is returned in the low bits of a uint32_t. 271 */ 272static uint32_t load_atom_extract_al8_or_exit(CPUArchState *env, uintptr_t ra, 273 void *pv, int s) 274{ 275 uintptr_t pi = (uintptr_t)pv; 276 int o = pi & 7; 277 int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8; 278 279 pv = (void *)(pi & ~7); 280 return load_atomic8_or_exit(env, ra, pv) >> shr; 281} 282 283/** 284 * load_atom_extract_al16_or_exit: 285 * @env: cpu context 286 * @ra: host unwind address 287 * @p: host address 288 * @s: object size in bytes, @s <= 8. 289 * 290 * Atomically load @s bytes from @p, when p % 16 < 8 291 * and p % 16 + s > 8. I.e. does not cross a 16-byte 292 * boundary, but *does* cross an 8-byte boundary. 293 * This is the slow version, so we must have eliminated 294 * any faster load_atom_extract_al8_or_exit case. 295 * 296 * If this is not possible, longjmp out to restart serially. 297 */ 298static uint64_t load_atom_extract_al16_or_exit(CPUArchState *env, uintptr_t ra, 299 void *pv, int s) 300{ 301 uintptr_t pi = (uintptr_t)pv; 302 int o = pi & 7; 303 int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8; 304 Int128 r; 305 306 /* 307 * Note constraints above: p & 8 must be clear. 308 * Provoke SIGBUS if possible otherwise. 309 */ 310 pv = (void *)(pi & ~7); 311 r = load_atomic16_or_exit(env, ra, pv); 312 313 r = int128_urshift(r, shr); 314 return int128_getlo(r); 315} 316 317/** 318 * load_atom_4_by_2: 319 * @pv: host address 320 * 321 * Load 4 bytes from @pv, with two 2-byte atomic loads. 322 */ 323static inline uint32_t load_atom_4_by_2(void *pv) 324{ 325 uint32_t a = load_atomic2(pv); 326 uint32_t b = load_atomic2(pv + 2); 327 328 if (HOST_BIG_ENDIAN) { 329 return (a << 16) | b; 330 } else { 331 return (b << 16) | a; 332 } 333} 334 335/** 336 * load_atom_8_by_2: 337 * @pv: host address 338 * 339 * Load 8 bytes from @pv, with four 2-byte atomic loads. 340 */ 341static inline uint64_t load_atom_8_by_2(void *pv) 342{ 343 uint32_t a = load_atom_4_by_2(pv); 344 uint32_t b = load_atom_4_by_2(pv + 4); 345 346 if (HOST_BIG_ENDIAN) { 347 return ((uint64_t)a << 32) | b; 348 } else { 349 return ((uint64_t)b << 32) | a; 350 } 351} 352 353/** 354 * load_atom_8_by_4: 355 * @pv: host address 356 * 357 * Load 8 bytes from @pv, with two 4-byte atomic loads. 358 */ 359static inline uint64_t load_atom_8_by_4(void *pv) 360{ 361 uint32_t a = load_atomic4(pv); 362 uint32_t b = load_atomic4(pv + 4); 363 364 if (HOST_BIG_ENDIAN) { 365 return ((uint64_t)a << 32) | b; 366 } else { 367 return ((uint64_t)b << 32) | a; 368 } 369} 370 371/** 372 * load_atom_8_by_8_or_4: 373 * @pv: host address 374 * 375 * Load 8 bytes from aligned @pv, with at least 4-byte atomicity. 376 */ 377static inline uint64_t load_atom_8_by_8_or_4(void *pv) 378{ 379 if (HAVE_al8_fast) { 380 return load_atomic8(pv); 381 } else { 382 return load_atom_8_by_4(pv); 383 } 384} 385 386/** 387 * load_atom_2: 388 * @p: host address 389 * @memop: the full memory op 390 * 391 * Load 2 bytes from @p, honoring the atomicity of @memop. 392 */ 393static uint16_t load_atom_2(CPUArchState *env, uintptr_t ra, 394 void *pv, MemOp memop) 395{ 396 uintptr_t pi = (uintptr_t)pv; 397 int atmax; 398 399 if (likely((pi & 1) == 0)) { 400 return load_atomic2(pv); 401 } 402 if (HAVE_ATOMIC128_RO) { 403 return load_atom_extract_al16_or_al8(pv, 2); 404 } 405 406 atmax = required_atomicity(env, pi, memop); 407 switch (atmax) { 408 case MO_8: 409 return lduw_he_p(pv); 410 case MO_16: 411 /* The only case remaining is MO_ATOM_WITHIN16. */ 412 if (!HAVE_al8_fast && (pi & 3) == 1) { 413 /* Big or little endian, we want the middle two bytes. */ 414 return load_atomic4(pv - 1) >> 8; 415 } 416 if ((pi & 15) != 7) { 417 return load_atom_extract_al8_or_exit(env, ra, pv, 2); 418 } 419 return load_atom_extract_al16_or_exit(env, ra, pv, 2); 420 default: 421 g_assert_not_reached(); 422 } 423} 424 425/** 426 * load_atom_4: 427 * @p: host address 428 * @memop: the full memory op 429 * 430 * Load 4 bytes from @p, honoring the atomicity of @memop. 431 */ 432static uint32_t load_atom_4(CPUArchState *env, uintptr_t ra, 433 void *pv, MemOp memop) 434{ 435 uintptr_t pi = (uintptr_t)pv; 436 int atmax; 437 438 if (likely((pi & 3) == 0)) { 439 return load_atomic4(pv); 440 } 441 if (HAVE_ATOMIC128_RO) { 442 return load_atom_extract_al16_or_al8(pv, 4); 443 } 444 445 atmax = required_atomicity(env, pi, memop); 446 switch (atmax) { 447 case MO_8: 448 case MO_16: 449 case -MO_16: 450 /* 451 * For MO_ATOM_IFALIGN, this is more atomicity than required, 452 * but it's trivially supported on all hosts, better than 4 453 * individual byte loads (when the host requires alignment), 454 * and overlaps with the MO_ATOM_SUBALIGN case of p % 2 == 0. 455 */ 456 return load_atom_extract_al4x2(pv); 457 case MO_32: 458 if (!(pi & 4)) { 459 return load_atom_extract_al8_or_exit(env, ra, pv, 4); 460 } 461 return load_atom_extract_al16_or_exit(env, ra, pv, 4); 462 default: 463 g_assert_not_reached(); 464 } 465} 466 467/** 468 * load_atom_8: 469 * @p: host address 470 * @memop: the full memory op 471 * 472 * Load 8 bytes from @p, honoring the atomicity of @memop. 473 */ 474static uint64_t load_atom_8(CPUArchState *env, uintptr_t ra, 475 void *pv, MemOp memop) 476{ 477 uintptr_t pi = (uintptr_t)pv; 478 int atmax; 479 480 /* 481 * If the host does not support 8-byte atomics, wait until we have 482 * examined the atomicity parameters below. 483 */ 484 if (HAVE_al8 && likely((pi & 7) == 0)) { 485 return load_atomic8(pv); 486 } 487 if (HAVE_ATOMIC128_RO) { 488 return load_atom_extract_al16_or_al8(pv, 8); 489 } 490 491 atmax = required_atomicity(env, pi, memop); 492 if (atmax == MO_64) { 493 if (!HAVE_al8 && (pi & 7) == 0) { 494 load_atomic8_or_exit(env, ra, pv); 495 } 496 return load_atom_extract_al16_or_exit(env, ra, pv, 8); 497 } 498 if (HAVE_al8_fast) { 499 return load_atom_extract_al8x2(pv); 500 } 501 switch (atmax) { 502 case MO_8: 503 return ldq_he_p(pv); 504 case MO_16: 505 return load_atom_8_by_2(pv); 506 case MO_32: 507 return load_atom_8_by_4(pv); 508 case -MO_32: 509 if (HAVE_al8) { 510 return load_atom_extract_al8x2(pv); 511 } 512 cpu_loop_exit_atomic(env_cpu(env), ra); 513 default: 514 g_assert_not_reached(); 515 } 516} 517 518/** 519 * load_atom_16: 520 * @p: host address 521 * @memop: the full memory op 522 * 523 * Load 16 bytes from @p, honoring the atomicity of @memop. 524 */ 525static Int128 load_atom_16(CPUArchState *env, uintptr_t ra, 526 void *pv, MemOp memop) 527{ 528 uintptr_t pi = (uintptr_t)pv; 529 int atmax; 530 Int128 r; 531 uint64_t a, b; 532 533 /* 534 * If the host does not support 16-byte atomics, wait until we have 535 * examined the atomicity parameters below. 536 */ 537 if (HAVE_ATOMIC128_RO && likely((pi & 15) == 0)) { 538 return atomic16_read_ro(pv); 539 } 540 541 atmax = required_atomicity(env, pi, memop); 542 switch (atmax) { 543 case MO_8: 544 memcpy(&r, pv, 16); 545 return r; 546 case MO_16: 547 a = load_atom_8_by_2(pv); 548 b = load_atom_8_by_2(pv + 8); 549 break; 550 case MO_32: 551 a = load_atom_8_by_4(pv); 552 b = load_atom_8_by_4(pv + 8); 553 break; 554 case MO_64: 555 if (!HAVE_al8) { 556 cpu_loop_exit_atomic(env_cpu(env), ra); 557 } 558 a = load_atomic8(pv); 559 b = load_atomic8(pv + 8); 560 break; 561 case -MO_64: 562 if (!HAVE_al8) { 563 cpu_loop_exit_atomic(env_cpu(env), ra); 564 } 565 a = load_atom_extract_al8x2(pv); 566 b = load_atom_extract_al8x2(pv + 8); 567 break; 568 case MO_128: 569 return load_atomic16_or_exit(env, ra, pv); 570 default: 571 g_assert_not_reached(); 572 } 573 return int128_make128(HOST_BIG_ENDIAN ? b : a, HOST_BIG_ENDIAN ? a : b); 574} 575 576/** 577 * store_atomic2: 578 * @pv: host address 579 * @val: value to store 580 * 581 * Atomically store 2 aligned bytes to @pv. 582 */ 583static inline void store_atomic2(void *pv, uint16_t val) 584{ 585 uint16_t *p = __builtin_assume_aligned(pv, 2); 586 qatomic_set(p, val); 587} 588 589/** 590 * store_atomic4: 591 * @pv: host address 592 * @val: value to store 593 * 594 * Atomically store 4 aligned bytes to @pv. 595 */ 596static inline void store_atomic4(void *pv, uint32_t val) 597{ 598 uint32_t *p = __builtin_assume_aligned(pv, 4); 599 qatomic_set(p, val); 600} 601 602/** 603 * store_atomic8: 604 * @pv: host address 605 * @val: value to store 606 * 607 * Atomically store 8 aligned bytes to @pv. 608 */ 609static inline void store_atomic8(void *pv, uint64_t val) 610{ 611 uint64_t *p = __builtin_assume_aligned(pv, 8); 612 613 qemu_build_assert(HAVE_al8); 614 qatomic_set__nocheck(p, val); 615} 616 617/** 618 * store_atom_4x2 619 */ 620static inline void store_atom_4_by_2(void *pv, uint32_t val) 621{ 622 store_atomic2(pv, val >> (HOST_BIG_ENDIAN ? 16 : 0)); 623 store_atomic2(pv + 2, val >> (HOST_BIG_ENDIAN ? 0 : 16)); 624} 625 626/** 627 * store_atom_8_by_2 628 */ 629static inline void store_atom_8_by_2(void *pv, uint64_t val) 630{ 631 store_atom_4_by_2(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0)); 632 store_atom_4_by_2(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32)); 633} 634 635/** 636 * store_atom_8_by_4 637 */ 638static inline void store_atom_8_by_4(void *pv, uint64_t val) 639{ 640 store_atomic4(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0)); 641 store_atomic4(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32)); 642} 643 644/** 645 * store_atom_insert_al4: 646 * @p: host address 647 * @val: shifted value to store 648 * @msk: mask for value to store 649 * 650 * Atomically store @val to @p, masked by @msk. 651 */ 652static void store_atom_insert_al4(uint32_t *p, uint32_t val, uint32_t msk) 653{ 654 uint32_t old, new; 655 656 p = __builtin_assume_aligned(p, 4); 657 old = qatomic_read(p); 658 do { 659 new = (old & ~msk) | val; 660 } while (!__atomic_compare_exchange_n(p, &old, new, true, 661 __ATOMIC_RELAXED, __ATOMIC_RELAXED)); 662} 663 664/** 665 * store_atom_insert_al8: 666 * @p: host address 667 * @val: shifted value to store 668 * @msk: mask for value to store 669 * 670 * Atomically store @val to @p masked by @msk. 671 */ 672static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk) 673{ 674 uint64_t old, new; 675 676 qemu_build_assert(HAVE_al8); 677 p = __builtin_assume_aligned(p, 8); 678 old = qatomic_read__nocheck(p); 679 do { 680 new = (old & ~msk) | val; 681 } while (!__atomic_compare_exchange_n(p, &old, new, true, 682 __ATOMIC_RELAXED, __ATOMIC_RELAXED)); 683} 684 685/** 686 * store_bytes_leN: 687 * @pv: host address 688 * @size: number of bytes to store 689 * @val_le: data to store 690 * 691 * Store @size bytes at @p. The bytes to store are extracted in little-endian order 692 * from @val_le; return the bytes of @val_le beyond @size that have not been stored. 693 */ 694static uint64_t store_bytes_leN(void *pv, int size, uint64_t val_le) 695{ 696 uint8_t *p = pv; 697 for (int i = 0; i < size; i++, val_le >>= 8) { 698 p[i] = val_le; 699 } 700 return val_le; 701} 702 703/** 704 * store_parts_leN 705 * @pv: host address 706 * @size: number of bytes to store 707 * @val_le: data to store 708 * 709 * As store_bytes_leN, but atomically on each aligned part. 710 */ 711G_GNUC_UNUSED 712static uint64_t store_parts_leN(void *pv, int size, uint64_t val_le) 713{ 714 do { 715 int n; 716 717 /* Find minimum of alignment and size */ 718 switch (((uintptr_t)pv | size) & 7) { 719 case 4: 720 store_atomic4(pv, le32_to_cpu(val_le)); 721 val_le >>= 32; 722 n = 4; 723 break; 724 case 2: 725 case 6: 726 store_atomic2(pv, le16_to_cpu(val_le)); 727 val_le >>= 16; 728 n = 2; 729 break; 730 default: 731 *(uint8_t *)pv = val_le; 732 val_le >>= 8; 733 n = 1; 734 break; 735 case 0: 736 g_assert_not_reached(); 737 } 738 pv += n; 739 size -= n; 740 } while (size != 0); 741 742 return val_le; 743} 744 745/** 746 * store_whole_le4 747 * @pv: host address 748 * @size: number of bytes to store 749 * @val_le: data to store 750 * 751 * As store_bytes_leN, but atomically as a whole. 752 * Four aligned bytes are guaranteed to cover the store. 753 */ 754static uint64_t store_whole_le4(void *pv, int size, uint64_t val_le) 755{ 756 int sz = size * 8; 757 int o = (uintptr_t)pv & 3; 758 int sh = o * 8; 759 uint32_t m = MAKE_64BIT_MASK(0, sz); 760 uint32_t v; 761 762 if (HOST_BIG_ENDIAN) { 763 v = bswap32(val_le) >> sh; 764 m = bswap32(m) >> sh; 765 } else { 766 v = val_le << sh; 767 m <<= sh; 768 } 769 store_atom_insert_al4(pv - o, v, m); 770 return val_le >> sz; 771} 772 773/** 774 * store_whole_le8 775 * @pv: host address 776 * @size: number of bytes to store 777 * @val_le: data to store 778 * 779 * As store_bytes_leN, but atomically as a whole. 780 * Eight aligned bytes are guaranteed to cover the store. 781 */ 782static uint64_t store_whole_le8(void *pv, int size, uint64_t val_le) 783{ 784 int sz = size * 8; 785 int o = (uintptr_t)pv & 7; 786 int sh = o * 8; 787 uint64_t m = MAKE_64BIT_MASK(0, sz); 788 uint64_t v; 789 790 qemu_build_assert(HAVE_al8); 791 if (HOST_BIG_ENDIAN) { 792 v = bswap64(val_le) >> sh; 793 m = bswap64(m) >> sh; 794 } else { 795 v = val_le << sh; 796 m <<= sh; 797 } 798 store_atom_insert_al8(pv - o, v, m); 799 return val_le >> sz; 800} 801 802/** 803 * store_whole_le16 804 * @pv: host address 805 * @size: number of bytes to store 806 * @val_le: data to store 807 * 808 * As store_bytes_leN, but atomically as a whole. 809 * 16 aligned bytes are guaranteed to cover the store. 810 */ 811static uint64_t store_whole_le16(void *pv, int size, Int128 val_le) 812{ 813 int sz = size * 8; 814 int o = (uintptr_t)pv & 15; 815 int sh = o * 8; 816 Int128 m, v; 817 818 qemu_build_assert(HAVE_ATOMIC128_RW); 819 820 /* Like MAKE_64BIT_MASK(0, sz), but larger. */ 821 if (sz <= 64) { 822 m = int128_make64(MAKE_64BIT_MASK(0, sz)); 823 } else { 824 m = int128_make128(-1, MAKE_64BIT_MASK(0, sz - 64)); 825 } 826 827 if (HOST_BIG_ENDIAN) { 828 v = int128_urshift(bswap128(val_le), sh); 829 m = int128_urshift(bswap128(m), sh); 830 } else { 831 v = int128_lshift(val_le, sh); 832 m = int128_lshift(m, sh); 833 } 834 store_atom_insert_al16(pv - o, v, m); 835 836 if (sz <= 64) { 837 return 0; 838 } 839 return int128_gethi(val_le) >> (sz - 64); 840} 841 842/** 843 * store_atom_2: 844 * @p: host address 845 * @val: the value to store 846 * @memop: the full memory op 847 * 848 * Store 2 bytes to @p, honoring the atomicity of @memop. 849 */ 850static void store_atom_2(CPUArchState *env, uintptr_t ra, 851 void *pv, MemOp memop, uint16_t val) 852{ 853 uintptr_t pi = (uintptr_t)pv; 854 int atmax; 855 856 if (likely((pi & 1) == 0)) { 857 store_atomic2(pv, val); 858 return; 859 } 860 861 atmax = required_atomicity(env, pi, memop); 862 if (atmax == MO_8) { 863 stw_he_p(pv, val); 864 return; 865 } 866 867 /* 868 * The only case remaining is MO_ATOM_WITHIN16. 869 * Big or little endian, we want the middle two bytes in each test. 870 */ 871 if ((pi & 3) == 1) { 872 store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16)); 873 return; 874 } else if ((pi & 7) == 3) { 875 if (HAVE_al8) { 876 store_atom_insert_al8(pv - 3, (uint64_t)val << 24, MAKE_64BIT_MASK(24, 16)); 877 return; 878 } 879 } else if ((pi & 15) == 7) { 880 if (HAVE_ATOMIC128_RW) { 881 Int128 v = int128_lshift(int128_make64(val), 56); 882 Int128 m = int128_lshift(int128_make64(0xffff), 56); 883 store_atom_insert_al16(pv - 7, v, m); 884 return; 885 } 886 } else { 887 g_assert_not_reached(); 888 } 889 890 cpu_loop_exit_atomic(env_cpu(env), ra); 891} 892 893/** 894 * store_atom_4: 895 * @p: host address 896 * @val: the value to store 897 * @memop: the full memory op 898 * 899 * Store 4 bytes to @p, honoring the atomicity of @memop. 900 */ 901static void store_atom_4(CPUArchState *env, uintptr_t ra, 902 void *pv, MemOp memop, uint32_t val) 903{ 904 uintptr_t pi = (uintptr_t)pv; 905 int atmax; 906 907 if (likely((pi & 3) == 0)) { 908 store_atomic4(pv, val); 909 return; 910 } 911 912 atmax = required_atomicity(env, pi, memop); 913 switch (atmax) { 914 case MO_8: 915 stl_he_p(pv, val); 916 return; 917 case MO_16: 918 store_atom_4_by_2(pv, val); 919 return; 920 case -MO_16: 921 { 922 uint32_t val_le = cpu_to_le32(val); 923 int s2 = pi & 3; 924 int s1 = 4 - s2; 925 926 switch (s2) { 927 case 1: 928 val_le = store_whole_le4(pv, s1, val_le); 929 *(uint8_t *)(pv + 3) = val_le; 930 break; 931 case 3: 932 *(uint8_t *)pv = val_le; 933 store_whole_le4(pv + 1, s2, val_le >> 8); 934 break; 935 case 0: /* aligned */ 936 case 2: /* atmax MO_16 */ 937 default: 938 g_assert_not_reached(); 939 } 940 } 941 return; 942 case MO_32: 943 if ((pi & 7) < 4) { 944 if (HAVE_al8) { 945 store_whole_le8(pv, 4, cpu_to_le32(val)); 946 return; 947 } 948 } else { 949 if (HAVE_ATOMIC128_RW) { 950 store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val))); 951 return; 952 } 953 } 954 cpu_loop_exit_atomic(env_cpu(env), ra); 955 default: 956 g_assert_not_reached(); 957 } 958} 959 960/** 961 * store_atom_8: 962 * @p: host address 963 * @val: the value to store 964 * @memop: the full memory op 965 * 966 * Store 8 bytes to @p, honoring the atomicity of @memop. 967 */ 968static void store_atom_8(CPUArchState *env, uintptr_t ra, 969 void *pv, MemOp memop, uint64_t val) 970{ 971 uintptr_t pi = (uintptr_t)pv; 972 int atmax; 973 974 if (HAVE_al8 && likely((pi & 7) == 0)) { 975 store_atomic8(pv, val); 976 return; 977 } 978 979 atmax = required_atomicity(env, pi, memop); 980 switch (atmax) { 981 case MO_8: 982 stq_he_p(pv, val); 983 return; 984 case MO_16: 985 store_atom_8_by_2(pv, val); 986 return; 987 case MO_32: 988 store_atom_8_by_4(pv, val); 989 return; 990 case -MO_32: 991 if (HAVE_al8) { 992 uint64_t val_le = cpu_to_le64(val); 993 int s2 = pi & 7; 994 int s1 = 8 - s2; 995 996 switch (s2) { 997 case 1 ... 3: 998 val_le = store_whole_le8(pv, s1, val_le); 999 store_bytes_leN(pv + s1, s2, val_le); 1000 break; 1001 case 5 ... 7: 1002 val_le = store_bytes_leN(pv, s1, val_le); 1003 store_whole_le8(pv + s1, s2, val_le); 1004 break; 1005 case 0: /* aligned */ 1006 case 4: /* atmax MO_32 */ 1007 default: 1008 g_assert_not_reached(); 1009 } 1010 return; 1011 } 1012 break; 1013 case MO_64: 1014 if (HAVE_ATOMIC128_RW) { 1015 store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val))); 1016 return; 1017 } 1018 break; 1019 default: 1020 g_assert_not_reached(); 1021 } 1022 cpu_loop_exit_atomic(env_cpu(env), ra); 1023} 1024 1025/** 1026 * store_atom_16: 1027 * @p: host address 1028 * @val: the value to store 1029 * @memop: the full memory op 1030 * 1031 * Store 16 bytes to @p, honoring the atomicity of @memop. 1032 */ 1033static void store_atom_16(CPUArchState *env, uintptr_t ra, 1034 void *pv, MemOp memop, Int128 val) 1035{ 1036 uintptr_t pi = (uintptr_t)pv; 1037 uint64_t a, b; 1038 int atmax; 1039 1040 if (HAVE_ATOMIC128_RW && likely((pi & 15) == 0)) { 1041 atomic16_set(pv, val); 1042 return; 1043 } 1044 1045 atmax = required_atomicity(env, pi, memop); 1046 1047 a = HOST_BIG_ENDIAN ? int128_gethi(val) : int128_getlo(val); 1048 b = HOST_BIG_ENDIAN ? int128_getlo(val) : int128_gethi(val); 1049 switch (atmax) { 1050 case MO_8: 1051 memcpy(pv, &val, 16); 1052 return; 1053 case MO_16: 1054 store_atom_8_by_2(pv, a); 1055 store_atom_8_by_2(pv + 8, b); 1056 return; 1057 case MO_32: 1058 store_atom_8_by_4(pv, a); 1059 store_atom_8_by_4(pv + 8, b); 1060 return; 1061 case MO_64: 1062 if (HAVE_al8) { 1063 store_atomic8(pv, a); 1064 store_atomic8(pv + 8, b); 1065 return; 1066 } 1067 break; 1068 case -MO_64: 1069 if (HAVE_ATOMIC128_RW) { 1070 uint64_t val_le; 1071 int s2 = pi & 15; 1072 int s1 = 16 - s2; 1073 1074 if (HOST_BIG_ENDIAN) { 1075 val = bswap128(val); 1076 } 1077 switch (s2) { 1078 case 1 ... 7: 1079 val_le = store_whole_le16(pv, s1, val); 1080 store_bytes_leN(pv + s1, s2, val_le); 1081 break; 1082 case 9 ... 15: 1083 store_bytes_leN(pv, s1, int128_getlo(val)); 1084 val = int128_urshift(val, s1 * 8); 1085 store_whole_le16(pv + s1, s2, val); 1086 break; 1087 case 0: /* aligned */ 1088 case 8: /* atmax MO_64 */ 1089 default: 1090 g_assert_not_reached(); 1091 } 1092 return; 1093 } 1094 break; 1095 case MO_128: 1096 if (HAVE_ATOMIC128_RW) { 1097 atomic16_set(pv, val); 1098 return; 1099 } 1100 break; 1101 default: 1102 g_assert_not_reached(); 1103 } 1104 cpu_loop_exit_atomic(env_cpu(env), ra); 1105} 1106