/*
 * Routines common to user and system emulation of load/store.
 *
 * Copyright (c) 2022 Linaro, Ltd.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "host/load-extract-al16-al8.h.inc"
#include "host/store-insert-al16.h.inc"

#ifdef CONFIG_ATOMIC64
# define HAVE_al8          true
#else
# define HAVE_al8          false
#endif
#define HAVE_al8_fast      (ATOMIC_REG_SIZE >= 8)

/**
 * required_atomicity:
 *
 * Return the lg2 bytes of atomicity required by @memop for @p.
 * If the operation must be split into two operations to be
 * examined separately for atomicity, return -lg2.
 */
static int required_atomicity(CPUState *cpu, uintptr_t p, MemOp memop)
{
    MemOp atom = memop & MO_ATOM_MASK;
    MemOp size = memop & MO_SIZE;
    MemOp half = size ? size - 1 : 0;
    unsigned tmp;
    int atmax;

    switch (atom) {
    case MO_ATOM_NONE:
        atmax = MO_8;
        break;

    case MO_ATOM_IFALIGN_PAIR:
        size = half;
        /* fall through */

    case MO_ATOM_IFALIGN:
        tmp = (1 << size) - 1;
        atmax = p & tmp ? MO_8 : size;
        break;

    case MO_ATOM_WITHIN16:
        tmp = p & 15;
        atmax = (tmp + (1 << size) <= 16 ? size : MO_8);
        break;

    case MO_ATOM_WITHIN16_PAIR:
        tmp = p & 15;
        if (tmp + (1 << size) <= 16) {
            atmax = size;
        } else if (tmp + (1 << half) == 16) {
            /*
             * The pair exactly straddles the boundary.
             * Both halves are naturally aligned and atomic.
             */
            atmax = half;
        } else {
            /*
             * One of the pair crosses the boundary, and is non-atomic.
             * The other of the pair does not cross, and is atomic.
             */
            atmax = -half;
        }
        break;

    case MO_ATOM_SUBALIGN:
        /*
         * Examine the alignment of p to determine if there are subobjects
         * that must be aligned.  Note that we only really need ctz4() --
         * any more significant bits are discarded by the immediately
         * following comparison.
         */
        tmp = ctz32(p);
        atmax = MIN(size, tmp);
        break;

    default:
        g_assert_not_reached();
    }

    /*
     * Here we have the architectural atomicity of the operation.
     * However, when executing in a serial context, we need no extra
     * host atomicity in order to avoid racing.  This reduction
     * avoids looping with cpu_loop_exit_atomic.
     */
    if (cpu_in_serial_context(cpu)) {
        return MO_8;
    }
    return atmax;
}
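
/*
 * For example, with MO_ATOM_WITHIN16_PAIR and a 16-byte access
 * (size == MO_128, half == MO_64):
 *   p % 16 == 8:  8 + 16 > 16 but 8 + 8 == 16, so the two 8-byte halves
 *                 exactly straddle the boundary and atmax == MO_64.
 *   p % 16 == 12: 12 + 16 > 16 and 12 + 8 != 16, so one half crosses the
 *                 boundary and atmax == -MO_64, i.e. the two halves must
 *                 be examined separately.
 */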

/**
 * load_atomic2:
 * @pv: host address
 *
 * Atomically load 2 aligned bytes from @pv.
 */
static inline uint16_t load_atomic2(void *pv)
{
    uint16_t *p = __builtin_assume_aligned(pv, 2);
    return qatomic_read(p);
}

/**
 * load_atomic4:
 * @pv: host address
 *
 * Atomically load 4 aligned bytes from @pv.
 */
static inline uint32_t load_atomic4(void *pv)
{
    uint32_t *p = __builtin_assume_aligned(pv, 4);
    return qatomic_read(p);
}

/**
 * load_atomic8:
 * @pv: host address
 *
 * Atomically load 8 aligned bytes from @pv.
 */
static inline uint64_t load_atomic8(void *pv)
{
    uint64_t *p = __builtin_assume_aligned(pv, 8);

    qemu_build_assert(HAVE_al8);
    return qatomic_read__nocheck(p);
}

/**
 * load_atomic8_or_exit:
 * @cpu: generic cpu state
 * @ra: host unwind address
 * @pv: host address
 *
 * Atomically load 8 aligned bytes from @pv.
 * If this is not possible, longjmp out to restart serially.
 */
static uint64_t load_atomic8_or_exit(CPUState *cpu, uintptr_t ra, void *pv)
{
    if (HAVE_al8) {
        return load_atomic8(pv);
    }

#ifdef CONFIG_USER_ONLY
    /*
     * If the page is not writable, then assume the value is immutable
     * and requires no locking.  This ignores the case of MAP_SHARED with
     * another process, because the fallback start_exclusive solution
     * provides no protection across processes.
     */
    WITH_MMAP_LOCK_GUARD() {
        if (!page_check_range(h2g(pv), 8, PAGE_WRITE_ORG)) {
            uint64_t *p = __builtin_assume_aligned(pv, 8);
            return *p;
        }
    }
#endif

    /* Ultimate fallback: re-execute in serial context. */
    trace_load_atom8_or_exit_fallback(ra);
    cpu_loop_exit_atomic(cpu, ra);
}

/**
 * load_atomic16_or_exit:
 * @cpu: generic cpu state
 * @ra: host unwind address
 * @pv: host address
 *
 * Atomically load 16 aligned bytes from @pv.
 * If this is not possible, longjmp out to restart serially.
 */
static Int128 load_atomic16_or_exit(CPUState *cpu, uintptr_t ra, void *pv)
{
    Int128 *p = __builtin_assume_aligned(pv, 16);

    if (HAVE_ATOMIC128_RO) {
        return atomic16_read_ro(p);
    }

    /*
     * We can only use cmpxchg to emulate a load if the page is writable.
     * If the page is not writable, then assume the value is immutable
     * and requires no locking.  This ignores the case of MAP_SHARED with
     * another process, because the fallback start_exclusive solution
     * provides no protection across processes.
     *
     * In system mode all guest pages are writable.  For user mode,
     * we must take mmap_lock so that the query remains valid until
     * the write is complete -- tests/tcg/multiarch/munmap-pthread.c
     * is an example that can race.
     */
    WITH_MMAP_LOCK_GUARD() {
#ifdef CONFIG_USER_ONLY
        if (!page_check_range(h2g(p), 16, PAGE_WRITE_ORG)) {
            return *p;
        }
#endif
        if (HAVE_ATOMIC128_RW) {
            return atomic16_read_rw(p);
        }
    }

    /* Ultimate fallback: re-execute in serial context. */
    trace_load_atom16_or_exit_fallback(ra);
    cpu_loop_exit_atomic(cpu, ra);
}

/**
 * load_atom_extract_al4x2:
 * @pv: host address
 *
 * Load 4 bytes from @p, from two sequential atomic 4-byte loads.
 */
static uint32_t load_atom_extract_al4x2(void *pv)
{
    uintptr_t pi = (uintptr_t)pv;
    int sh = (pi & 3) * 8;
    uint32_t a, b;

    pv = (void *)(pi & ~3);
    a = load_atomic4(pv);
    b = load_atomic4(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return (a << sh) | (b >> (-sh & 31));
    } else {
        return (a >> sh) | (b << (-sh & 31));
    }
}

/**
 * load_atom_extract_al8x2:
 * @pv: host address
 *
 * Load 8 bytes from @p, from two sequential atomic 8-byte loads.
 */
static uint64_t load_atom_extract_al8x2(void *pv)
{
    uintptr_t pi = (uintptr_t)pv;
    int sh = (pi & 7) * 8;
    uint64_t a, b;

    pv = (void *)(pi & ~7);
    a = load_atomic8(pv);
    b = load_atomic8(pv + 8);

    if (HOST_BIG_ENDIAN) {
        return (a << sh) | (b >> (-sh & 63));
    } else {
        return (a >> sh) | (b << (-sh & 63));
    }
}
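
/*
 * For example, with p % 4 == 1, sh == 8: @a covers bytes [p-1, p+2] and
 * @b covers [p+3, p+6].  On a little-endian host (a >> 8) | (b << 24)
 * reassembles exactly the four bytes at [p, p+3]; on a big-endian host
 * (a << 8) | (b >> 24) does the same.  load_atom_extract_al8x2 applies
 * the identical construction at 8-byte granularity.
 */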

/**
 * load_atom_extract_al8_or_exit:
 * @cpu: generic cpu state
 * @ra: host unwind address
 * @pv: host address
 * @s: object size in bytes, @s <= 4.
 *
 * Atomically load @s bytes from @p, when p % s != 0, and [p, p+s-1] does
 * not cross an 8-byte boundary.  This means that we can perform an atomic
 * 8-byte load and extract.
 * The value is returned in the low bits of a uint32_t.
 */
static uint32_t load_atom_extract_al8_or_exit(CPUState *cpu, uintptr_t ra,
                                              void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8;

    pv = (void *)(pi & ~7);
    return load_atomic8_or_exit(cpu, ra, pv) >> shr;
}
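
/*
 * For example, with @s == 2 and p % 8 == 1, the aligned 8-byte word at
 * p - 1 is loaded; shr is 8 on a little-endian host and 40 on a
 * big-endian host, and either way the two bytes at [p, p+1] land in the
 * low 16 bits of the result.
 */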

/**
 * load_atom_extract_al16_or_exit:
 * @cpu: generic cpu state
 * @ra: host unwind address
 * @p: host address
 * @s: object size in bytes, @s <= 8.
 *
 * Atomically load @s bytes from @p, when p % 16 < 8
 * and p % 16 + s > 8.  I.e. does not cross a 16-byte
 * boundary, but *does* cross an 8-byte boundary.
 * This is the slow version, so we must have eliminated
 * any faster load_atom_extract_al8_or_exit case.
 *
 * If this is not possible, longjmp out to restart serially.
 */
static uint64_t load_atom_extract_al16_or_exit(CPUState *cpu, uintptr_t ra,
                                               void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
    Int128 r;

    /*
     * Note constraints above: p & 8 must be clear.
     * Provoke SIGBUS if possible otherwise.
     */
    pv = (void *)(pi & ~7);
    r = load_atomic16_or_exit(cpu, ra, pv);

    r = int128_urshift(r, shr);
    return int128_getlo(r);
}

/**
 * load_atom_4_by_2:
 * @pv: host address
 *
 * Load 4 bytes from @pv, with two 2-byte atomic loads.
 */
static inline uint32_t load_atom_4_by_2(void *pv)
{
    uint32_t a = load_atomic2(pv);
    uint32_t b = load_atomic2(pv + 2);

    if (HOST_BIG_ENDIAN) {
        return (a << 16) | b;
    } else {
        return (b << 16) | a;
    }
}

/**
 * load_atom_8_by_2:
 * @pv: host address
 *
 * Load 8 bytes from @pv, with four 2-byte atomic loads.
 */
static inline uint64_t load_atom_8_by_2(void *pv)
{
    uint32_t a = load_atom_4_by_2(pv);
    uint32_t b = load_atom_4_by_2(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return ((uint64_t)a << 32) | b;
    } else {
        return ((uint64_t)b << 32) | a;
    }
}

/**
 * load_atom_8_by_4:
 * @pv: host address
 *
 * Load 8 bytes from @pv, with two 4-byte atomic loads.
 */
static inline uint64_t load_atom_8_by_4(void *pv)
{
    uint32_t a = load_atomic4(pv);
    uint32_t b = load_atomic4(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return ((uint64_t)a << 32) | b;
    } else {
        return ((uint64_t)b << 32) | a;
    }
}

/**
 * load_atom_8_by_8_or_4:
 * @pv: host address
 *
 * Load 8 bytes from aligned @pv, with at least 4-byte atomicity.
 */
static inline uint64_t load_atom_8_by_8_or_4(void *pv)
{
    if (HAVE_al8_fast) {
        return load_atomic8(pv);
    } else {
        return load_atom_8_by_4(pv);
    }
}

/**
 * load_atom_2:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 2 bytes from @p, honoring the atomicity of @memop.
 */
static uint16_t load_atom_2(CPUState *cpu, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 1) == 0)) {
        return load_atomic2(pv);
    }
    if (HAVE_ATOMIC128_RO) {
        intptr_t left_in_page = -(pi | TARGET_PAGE_MASK);
        if (likely(left_in_page > 8)) {
            return load_atom_extract_al16_or_al8(pv, 2);
        }
    }

    atmax = required_atomicity(cpu, pi, memop);
    switch (atmax) {
    case MO_8:
        return lduw_he_p(pv);
    case MO_16:
        /* The only case remaining is MO_ATOM_WITHIN16. */
        if (!HAVE_al8_fast && (pi & 3) == 1) {
            /* Big or little endian, we want the middle two bytes. */
            return load_atomic4(pv - 1) >> 8;
        }
        if ((pi & 15) != 7) {
            return load_atom_extract_al8_or_exit(cpu, ra, pv, 2);
        }
        return load_atom_extract_al16_or_exit(cpu, ra, pv, 2);
    default:
        g_assert_not_reached();
    }
}
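
/*
 * In the MO_16 case above, a misaligned 2-byte access crosses an 8-byte
 * boundary only when p % 8 == 7.  A 2-byte access at p % 16 == 15 crosses
 * the 16-byte line and so never requires 2-byte atomicity, which means
 * (pi & 15) != 7 is enough to know the access sits within a single
 * aligned 8-byte word.
 */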

/**
 * load_atom_4:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 4 bytes from @p, honoring the atomicity of @memop.
 */
static uint32_t load_atom_4(CPUState *cpu, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 3) == 0)) {
        return load_atomic4(pv);
    }
    if (HAVE_ATOMIC128_RO) {
        intptr_t left_in_page = -(pi | TARGET_PAGE_MASK);
        if (likely(left_in_page > 8)) {
            return load_atom_extract_al16_or_al8(pv, 4);
        }
    }

    atmax = required_atomicity(cpu, pi, memop);
    switch (atmax) {
    case MO_8:
    case MO_16:
    case -MO_16:
        /*
         * For MO_ATOM_IFALIGN, this is more atomicity than required,
         * but it's trivially supported on all hosts, better than 4
         * individual byte loads (when the host requires alignment),
         * and overlaps with the MO_ATOM_SUBALIGN case of p % 2 == 0.
         */
        return load_atom_extract_al4x2(pv);
    case MO_32:
        if (!(pi & 4)) {
            return load_atom_extract_al8_or_exit(cpu, ra, pv, 4);
        }
        return load_atom_extract_al16_or_exit(cpu, ra, pv, 4);
    default:
        g_assert_not_reached();
    }
}

/**
 * load_atom_8:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 8 bytes from @p, honoring the atomicity of @memop.
 */
static uint64_t load_atom_8(CPUState *cpu, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    /*
     * If the host does not support 8-byte atomics, wait until we have
     * examined the atomicity parameters below.
     */
    if (HAVE_al8 && likely((pi & 7) == 0)) {
        return load_atomic8(pv);
    }
    if (HAVE_ATOMIC128_RO) {
        return load_atom_extract_al16_or_al8(pv, 8);
    }

    atmax = required_atomicity(cpu, pi, memop);
    if (atmax == MO_64) {
        if (!HAVE_al8 && (pi & 7) == 0) {
            load_atomic8_or_exit(cpu, ra, pv);
        }
        return load_atom_extract_al16_or_exit(cpu, ra, pv, 8);
    }
    if (HAVE_al8_fast) {
        return load_atom_extract_al8x2(pv);
    }
    switch (atmax) {
    case MO_8:
        return ldq_he_p(pv);
    case MO_16:
        return load_atom_8_by_2(pv);
    case MO_32:
        return load_atom_8_by_4(pv);
    case -MO_32:
        if (HAVE_al8) {
            return load_atom_extract_al8x2(pv);
        }
        trace_load_atom8_fallback(memop, ra);
        cpu_loop_exit_atomic(cpu, ra);
    default:
        g_assert_not_reached();
    }
}

/**
 * load_atom_16:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 16 bytes from @p, honoring the atomicity of @memop.
 */
static Int128 load_atom_16(CPUState *cpu, uintptr_t ra,
                           void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;
    Int128 r;
    uint64_t a, b;

    /*
     * If the host does not support 16-byte atomics, wait until we have
     * examined the atomicity parameters below.
     */
    if (HAVE_ATOMIC128_RO && likely((pi & 15) == 0)) {
        return atomic16_read_ro(pv);
    }

    atmax = required_atomicity(cpu, pi, memop);
    switch (atmax) {
    case MO_8:
        memcpy(&r, pv, 16);
        return r;
    case MO_16:
        a = load_atom_8_by_2(pv);
        b = load_atom_8_by_2(pv + 8);
        break;
    case MO_32:
        a = load_atom_8_by_4(pv);
        b = load_atom_8_by_4(pv + 8);
        break;
    case MO_64:
        if (!HAVE_al8) {
            trace_load_atom16_fallback(memop, ra);
            cpu_loop_exit_atomic(cpu, ra);
        }
        a = load_atomic8(pv);
        b = load_atomic8(pv + 8);
        break;
    case -MO_64:
        if (!HAVE_al8) {
            trace_load_atom16_fallback(memop, ra);
            cpu_loop_exit_atomic(cpu, ra);
        }
        a = load_atom_extract_al8x2(pv);
        b = load_atom_extract_al8x2(pv + 8);
        break;
    case MO_128:
        return load_atomic16_or_exit(cpu, ra, pv);
    default:
        g_assert_not_reached();
    }
    return int128_make128(HOST_BIG_ENDIAN ? b : a, HOST_BIG_ENDIAN ? a : b);
}

/**
 * store_atomic2:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 2 aligned bytes to @pv.
 */
static inline void store_atomic2(void *pv, uint16_t val)
{
    uint16_t *p = __builtin_assume_aligned(pv, 2);
    qatomic_set(p, val);
}

/**
 * store_atomic4:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 4 aligned bytes to @pv.
 */
static inline void store_atomic4(void *pv, uint32_t val)
{
    uint32_t *p = __builtin_assume_aligned(pv, 4);
    qatomic_set(p, val);
}

/**
 * store_atomic8:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 8 aligned bytes to @pv.
 */
static inline void store_atomic8(void *pv, uint64_t val)
{
    uint64_t *p = __builtin_assume_aligned(pv, 8);

    qemu_build_assert(HAVE_al8);
    qatomic_set__nocheck(p, val);
}

/**
 * store_atom_4_by_2
 */
static inline void store_atom_4_by_2(void *pv, uint32_t val)
{
    store_atomic2(pv, val >> (HOST_BIG_ENDIAN ? 16 : 0));
    store_atomic2(pv + 2, val >> (HOST_BIG_ENDIAN ? 0 : 16));
}

/**
 * store_atom_8_by_2
 */
static inline void store_atom_8_by_2(void *pv, uint64_t val)
{
    store_atom_4_by_2(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
    store_atom_4_by_2(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
}

/**
 * store_atom_8_by_4
 */
static inline void store_atom_8_by_4(void *pv, uint64_t val)
{
    store_atomic4(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
    store_atomic4(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
}

/**
 * store_atom_insert_al4:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p, masked by @msk.
 */
static void store_atom_insert_al4(uint32_t *p, uint32_t val, uint32_t msk)
{
    uint32_t old, new;

    p = __builtin_assume_aligned(p, 4);
    old = qatomic_read(p);
    do {
        new = (old & ~msk) | val;
    } while (!__atomic_compare_exchange_n(p, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}

/**
 * store_atom_insert_al8:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p masked by @msk.
 */
static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
{
    uint64_t old, new;

    qemu_build_assert(HAVE_al8);
    p = __builtin_assume_aligned(p, 8);
    old = qatomic_read__nocheck(p);
    do {
        new = (old & ~msk) | val;
    } while (!__atomic_compare_exchange_n(p, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}
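
/*
 * For example, store_atom_insert_al4(p, 0x00bbaa00, 0x00ffff00) replaces
 * only the middle two bytes of the aligned word at @p; the
 * compare-and-swap loop retries until it succeeds, so concurrent updates
 * to the unmasked bytes are never lost.
 */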

/**
 * store_bytes_leN:
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * Store @size bytes at @p.  The bytes to store are extracted in
 * little-endian order from @val_le; return the bytes of @val_le
 * beyond @size that have not been stored.
 */
static uint64_t store_bytes_leN(void *pv, int size, uint64_t val_le)
{
    uint8_t *p = pv;

    for (int i = 0; i < size; i++, val_le >>= 8) {
        p[i] = val_le;
    }
    return val_le;
}

/**
 * store_parts_leN
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically on each aligned part.
 */
G_GNUC_UNUSED
static uint64_t store_parts_leN(void *pv, int size, uint64_t val_le)
{
    do {
        int n;

        /* Find minimum of alignment and size */
        switch (((uintptr_t)pv | size) & 7) {
        case 4:
            store_atomic4(pv, le32_to_cpu(val_le));
            val_le >>= 32;
            n = 4;
            break;
        case 2:
        case 6:
            store_atomic2(pv, le16_to_cpu(val_le));
            val_le >>= 16;
            n = 2;
            break;
        default:
            *(uint8_t *)pv = val_le;
            val_le >>= 8;
            n = 1;
            break;
        case 0:
            g_assert_not_reached();
        }
        pv += n;
        size -= n;
    } while (size != 0);

    return val_le;
}
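
/*
 * For example, storing 7 bytes at p % 8 == 1 proceeds as a 1-byte store
 * (1 | 7 is odd, so the default case), then a 2-byte store at offset 2
 * (2 | 6 == 6), then a 4-byte store at offset 4 (4 | 4 == 4), each part
 * aligned and individually atomic.
 */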

/**
 * store_whole_le4
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * Four aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le4(void *pv, int size, uint64_t val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 3;
    int sh = o * 8;
    uint32_t m = MAKE_64BIT_MASK(0, sz);
    uint32_t v;

    if (HOST_BIG_ENDIAN) {
        v = bswap32(val_le) >> sh;
        m = bswap32(m) >> sh;
    } else {
        v = val_le << sh;
        m <<= sh;
    }
    store_atom_insert_al4(pv - o, v, m);
    return val_le >> sz;
}
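
/*
 * For example, store_whole_le4(pv, 3, val_le) with p % 4 == 1 (as used by
 * store_atom_4) has o == 1 and m == 0xffffff: on a little-endian host the
 * low three bytes of @val_le are shifted up by 8 and inserted as bytes
 * 1..3 of the aligned word at p - 1, and val_le >> 24 (the bytes not
 * stored) is returned.
 */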

/**
 * store_whole_le8
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * Eight aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le8(void *pv, int size, uint64_t val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 7;
    int sh = o * 8;
    uint64_t m = MAKE_64BIT_MASK(0, sz);
    uint64_t v;

    qemu_build_assert(HAVE_al8);
    if (HOST_BIG_ENDIAN) {
        v = bswap64(val_le) >> sh;
        m = bswap64(m) >> sh;
    } else {
        v = val_le << sh;
        m <<= sh;
    }
    store_atom_insert_al8(pv - o, v, m);
    return val_le >> sz;
}

/**
 * store_whole_le16
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * 16 aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le16(void *pv, int size, Int128 val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 15;
    int sh = o * 8;
    Int128 m, v;

    qemu_build_assert(HAVE_CMPXCHG128);

    /* Like MAKE_64BIT_MASK(0, sz), but larger. */
    if (sz <= 64) {
        m = int128_make64(MAKE_64BIT_MASK(0, sz));
    } else {
        m = int128_make128(-1, MAKE_64BIT_MASK(0, sz - 64));
    }

    if (HOST_BIG_ENDIAN) {
        v = int128_urshift(bswap128(val_le), sh);
        m = int128_urshift(bswap128(m), sh);
    } else {
        v = int128_lshift(val_le, sh);
        m = int128_lshift(m, sh);
    }
    store_atom_insert_al16(pv - o, v, m);

    if (sz <= 64) {
        return 0;
    }
    return int128_gethi(val_le) >> (sz - 64);
}

/**
 * store_atom_2:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 2 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_2(CPUState *cpu, uintptr_t ra,
                         void *pv, MemOp memop, uint16_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 1) == 0)) {
        store_atomic2(pv, val);
        return;
    }

    atmax = required_atomicity(cpu, pi, memop);
    if (atmax == MO_8) {
        stw_he_p(pv, val);
        return;
    }

    /*
     * The only case remaining is MO_ATOM_WITHIN16.
     * Big or little endian, we want the middle two bytes in each test.
     */
    if ((pi & 3) == 1) {
        store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16));
        return;
    } else if ((pi & 7) == 3) {
        if (HAVE_al8) {
            store_atom_insert_al8(pv - 3, (uint64_t)val << 24, MAKE_64BIT_MASK(24, 16));
            return;
        }
    } else if ((pi & 15) == 7) {
        if (HAVE_CMPXCHG128) {
            Int128 v = int128_lshift(int128_make64(val), 56);
            Int128 m = int128_lshift(int128_make64(0xffff), 56);
            store_atom_insert_al16(pv - 7, v, m);
            return;
        }
    } else {
        g_assert_not_reached();
    }

    trace_store_atom2_fallback(memop, ra);
    cpu_loop_exit_atomic(cpu, ra);
}

/**
 * store_atom_4:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 4 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_4(CPUState *cpu, uintptr_t ra,
                         void *pv, MemOp memop, uint32_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 3) == 0)) {
        store_atomic4(pv, val);
        return;
    }

    atmax = required_atomicity(cpu, pi, memop);
    switch (atmax) {
    case MO_8:
        stl_he_p(pv, val);
        return;
    case MO_16:
        store_atom_4_by_2(pv, val);
        return;
    case -MO_16:
        {
            uint32_t val_le = cpu_to_le32(val);
            int s2 = pi & 3;
            int s1 = 4 - s2;

            switch (s2) {
            case 1:
                val_le = store_whole_le4(pv, s1, val_le);
                *(uint8_t *)(pv + 3) = val_le;
                break;
            case 3:
                *(uint8_t *)pv = val_le;
                store_whole_le4(pv + 1, s2, val_le >> 8);
                break;
            case 0: /* aligned */
            case 2: /* atmax MO_16 */
            default:
                g_assert_not_reached();
            }
        }
        return;
    case MO_32:
        if ((pi & 7) < 4) {
            if (HAVE_al8) {
                store_whole_le8(pv, 4, cpu_to_le32(val));
                return;
            }
        } else {
            if (HAVE_CMPXCHG128) {
                store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val)));
                return;
            }
        }
        trace_store_atom4_fallback(memop, ra);
        cpu_loop_exit_atomic(cpu, ra);
    default:
        g_assert_not_reached();
    }
}
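
/*
 * For example, a 4-byte store at p % 16 == 15 with MO_ATOM_WITHIN16_PAIR
 * arrives here with atmax == -MO_16 and s2 == 3: the low half crosses the
 * 16-byte line and needs no atomicity, so its first byte is stored on its
 * own, while store_whole_le4 covers the remaining three bytes, including
 * the high half, atomically within the next aligned word.
 */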

/**
 * store_atom_8:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 8 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_8(CPUState *cpu, uintptr_t ra,
                         void *pv, MemOp memop, uint64_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (HAVE_al8 && likely((pi & 7) == 0)) {
        store_atomic8(pv, val);
        return;
    }

    atmax = required_atomicity(cpu, pi, memop);
    switch (atmax) {
    case MO_8:
        stq_he_p(pv, val);
        return;
    case MO_16:
        store_atom_8_by_2(pv, val);
        return;
    case MO_32:
        store_atom_8_by_4(pv, val);
        return;
    case -MO_32:
        if (HAVE_al8) {
            uint64_t val_le = cpu_to_le64(val);
            int s2 = pi & 7;
            int s1 = 8 - s2;

            switch (s2) {
            case 1 ... 3:
                val_le = store_whole_le8(pv, s1, val_le);
                store_bytes_leN(pv + s1, s2, val_le);
                break;
            case 5 ... 7:
                val_le = store_bytes_leN(pv, s1, val_le);
                store_whole_le8(pv + s1, s2, val_le);
                break;
            case 0: /* aligned */
            case 4: /* atmax MO_32 */
            default:
                g_assert_not_reached();
            }
            return;
        }
        break;
    case MO_64:
        if (HAVE_CMPXCHG128) {
            store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val)));
            return;
        }
        break;
    default:
        g_assert_not_reached();
    }
    trace_store_atom8_fallback(memop, ra);
    cpu_loop_exit_atomic(cpu, ra);
}

/**
 * store_atom_16:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 16 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_16(CPUState *cpu, uintptr_t ra,
                          void *pv, MemOp memop, Int128 val)
{
    uintptr_t pi = (uintptr_t)pv;
    uint64_t a, b;
    int atmax;

    if (HAVE_ATOMIC128_RW && likely((pi & 15) == 0)) {
        atomic16_set(pv, val);
        return;
    }

    atmax = required_atomicity(cpu, pi, memop);

    a = HOST_BIG_ENDIAN ? int128_gethi(val) : int128_getlo(val);
    b = HOST_BIG_ENDIAN ? int128_getlo(val) : int128_gethi(val);
    switch (atmax) {
    case MO_8:
        memcpy(pv, &val, 16);
        return;
    case MO_16:
        store_atom_8_by_2(pv, a);
        store_atom_8_by_2(pv + 8, b);
        return;
    case MO_32:
        store_atom_8_by_4(pv, a);
        store_atom_8_by_4(pv + 8, b);
        return;
    case MO_64:
        if (HAVE_al8) {
            store_atomic8(pv, a);
            store_atomic8(pv + 8, b);
            return;
        }
        break;
    case -MO_64:
        if (HAVE_CMPXCHG128) {
            uint64_t val_le;
            int s2 = pi & 15;
            int s1 = 16 - s2;

            if (HOST_BIG_ENDIAN) {
                val = bswap128(val);
            }
            switch (s2) {
            case 1 ... 7:
                val_le = store_whole_le16(pv, s1, val);
                store_bytes_leN(pv + s1, s2, val_le);
                break;
            case 9 ... 15:
                store_bytes_leN(pv, s1, int128_getlo(val));
                val = int128_urshift(val, s1 * 8);
                store_whole_le16(pv + s1, s2, val);
                break;
            case 0: /* aligned */
            case 8: /* atmax MO_64 */
            default:
                g_assert_not_reached();
            }
            return;
        }
        break;
    case MO_128:
        break;
    default:
        g_assert_not_reached();
    }
    trace_store_atom16_fallback(memop, ra);
    cpu_loop_exit_atomic(cpu, ra);
}