1/* 2 * Routines common to user and system emulation of load/store. 3 * 4 * Copyright (c) 2022 Linaro, Ltd. 5 * 6 * SPDX-License-Identifier: GPL-2.0-or-later 7 * 8 * This work is licensed under the terms of the GNU GPL, version 2 or later. 9 * See the COPYING file in the top-level directory. 10 */ 11 12#include "host/load-extract-al16-al8.h" 13#include "host/store-insert-al16.h" 14 15#ifdef CONFIG_ATOMIC64 16# define HAVE_al8 true 17#else 18# define HAVE_al8 false 19#endif 20#define HAVE_al8_fast (ATOMIC_REG_SIZE >= 8) 21 22/** 23 * required_atomicity: 24 * 25 * Return the lg2 bytes of atomicity required by @memop for @p. 26 * If the operation must be split into two operations to be 27 * examined separately for atomicity, return -lg2. 28 */ 29static int required_atomicity(CPUState *cpu, uintptr_t p, MemOp memop) 30{ 31 MemOp atom = memop & MO_ATOM_MASK; 32 MemOp size = memop & MO_SIZE; 33 MemOp half = size ? size - 1 : 0; 34 unsigned tmp; 35 int atmax; 36 37 switch (atom) { 38 case MO_ATOM_NONE: 39 atmax = MO_8; 40 break; 41 42 case MO_ATOM_IFALIGN_PAIR: 43 size = half; 44 /* fall through */ 45 46 case MO_ATOM_IFALIGN: 47 tmp = (1 << size) - 1; 48 atmax = p & tmp ? MO_8 : size; 49 break; 50 51 case MO_ATOM_WITHIN16: 52 tmp = p & 15; 53 atmax = (tmp + (1 << size) <= 16 ? size : MO_8); 54 break; 55 56 case MO_ATOM_WITHIN16_PAIR: 57 tmp = p & 15; 58 if (tmp + (1 << size) <= 16) { 59 atmax = size; 60 } else if (tmp + (1 << half) == 16) { 61 /* 62 * The pair exactly straddles the boundary. 63 * Both halves are naturally aligned and atomic. 64 */ 65 atmax = half; 66 } else { 67 /* 68 * One of the pair crosses the boundary, and is non-atomic. 69 * The other of the pair does not cross, and is atomic. 70 */ 71 atmax = -half; 72 } 73 break; 74 75 case MO_ATOM_SUBALIGN: 76 /* 77 * Examine the alignment of p to determine if there are subobjects 78 * that must be aligned. Note that we only really need ctz4() -- 79 * any more sigificant bits are discarded by the immediately 80 * following comparison. 81 */ 82 tmp = ctz32(p); 83 atmax = MIN(size, tmp); 84 break; 85 86 default: 87 g_assert_not_reached(); 88 } 89 90 /* 91 * Here we have the architectural atomicity of the operation. 92 * However, when executing in a serial context, we need no extra 93 * host atomicity in order to avoid racing. This reduction 94 * avoids looping with cpu_loop_exit_atomic. 95 */ 96 if (cpu_in_serial_context(cpu)) { 97 return MO_8; 98 } 99 return atmax; 100} 101 102/** 103 * load_atomic2: 104 * @pv: host address 105 * 106 * Atomically load 2 aligned bytes from @pv. 107 */ 108static inline uint16_t load_atomic2(void *pv) 109{ 110 uint16_t *p = __builtin_assume_aligned(pv, 2); 111 return qatomic_read(p); 112} 113 114/** 115 * load_atomic4: 116 * @pv: host address 117 * 118 * Atomically load 4 aligned bytes from @pv. 119 */ 120static inline uint32_t load_atomic4(void *pv) 121{ 122 uint32_t *p = __builtin_assume_aligned(pv, 4); 123 return qatomic_read(p); 124} 125 126/** 127 * load_atomic8: 128 * @pv: host address 129 * 130 * Atomically load 8 aligned bytes from @pv. 131 */ 132static inline uint64_t load_atomic8(void *pv) 133{ 134 uint64_t *p = __builtin_assume_aligned(pv, 8); 135 136 qemu_build_assert(HAVE_al8); 137 return qatomic_read__nocheck(p); 138} 139 140/** 141 * load_atomic8_or_exit: 142 * @cpu: generic cpu state 143 * @ra: host unwind address 144 * @pv: host address 145 * 146 * Atomically load 8 aligned bytes from @pv. 147 * If this is not possible, longjmp out to restart serially. 148 */ 149static uint64_t load_atomic8_or_exit(CPUState *cpu, uintptr_t ra, void *pv) 150{ 151 if (HAVE_al8) { 152 return load_atomic8(pv); 153 } 154 155#ifdef CONFIG_USER_ONLY 156 /* 157 * If the page is not writable, then assume the value is immutable 158 * and requires no locking. This ignores the case of MAP_SHARED with 159 * another process, because the fallback start_exclusive solution 160 * provides no protection across processes. 161 */ 162 WITH_MMAP_LOCK_GUARD() { 163 if (!page_check_range(h2g(pv), 8, PAGE_WRITE_ORG)) { 164 uint64_t *p = __builtin_assume_aligned(pv, 8); 165 return *p; 166 } 167 } 168#endif 169 170 /* Ultimate fallback: re-execute in serial context. */ 171 cpu_loop_exit_atomic(cpu, ra); 172} 173 174/** 175 * load_atomic16_or_exit: 176 * @cpu: generic cpu state 177 * @ra: host unwind address 178 * @pv: host address 179 * 180 * Atomically load 16 aligned bytes from @pv. 181 * If this is not possible, longjmp out to restart serially. 182 */ 183static Int128 load_atomic16_or_exit(CPUState *cpu, uintptr_t ra, void *pv) 184{ 185 Int128 *p = __builtin_assume_aligned(pv, 16); 186 187 if (HAVE_ATOMIC128_RO) { 188 return atomic16_read_ro(p); 189 } 190 191 /* 192 * We can only use cmpxchg to emulate a load if the page is writable. 193 * If the page is not writable, then assume the value is immutable 194 * and requires no locking. This ignores the case of MAP_SHARED with 195 * another process, because the fallback start_exclusive solution 196 * provides no protection across processes. 197 * 198 * In system mode all guest pages are writable. For user mode, 199 * we must take mmap_lock so that the query remains valid until 200 * the write is complete -- tests/tcg/multiarch/munmap-pthread.c 201 * is an example that can race. 202 */ 203 WITH_MMAP_LOCK_GUARD() { 204#ifdef CONFIG_USER_ONLY 205 if (!page_check_range(h2g(p), 16, PAGE_WRITE_ORG)) { 206 return *p; 207 } 208#endif 209 if (HAVE_ATOMIC128_RW) { 210 return atomic16_read_rw(p); 211 } 212 } 213 214 /* Ultimate fallback: re-execute in serial context. */ 215 cpu_loop_exit_atomic(cpu, ra); 216} 217 218/** 219 * load_atom_extract_al4x2: 220 * @pv: host address 221 * 222 * Load 4 bytes from @p, from two sequential atomic 4-byte loads. 223 */ 224static uint32_t load_atom_extract_al4x2(void *pv) 225{ 226 uintptr_t pi = (uintptr_t)pv; 227 int sh = (pi & 3) * 8; 228 uint32_t a, b; 229 230 pv = (void *)(pi & ~3); 231 a = load_atomic4(pv); 232 b = load_atomic4(pv + 4); 233 234 if (HOST_BIG_ENDIAN) { 235 return (a << sh) | (b >> (-sh & 31)); 236 } else { 237 return (a >> sh) | (b << (-sh & 31)); 238 } 239} 240 241/** 242 * load_atom_extract_al8x2: 243 * @pv: host address 244 * 245 * Load 8 bytes from @p, from two sequential atomic 8-byte loads. 246 */ 247static uint64_t load_atom_extract_al8x2(void *pv) 248{ 249 uintptr_t pi = (uintptr_t)pv; 250 int sh = (pi & 7) * 8; 251 uint64_t a, b; 252 253 pv = (void *)(pi & ~7); 254 a = load_atomic8(pv); 255 b = load_atomic8(pv + 8); 256 257 if (HOST_BIG_ENDIAN) { 258 return (a << sh) | (b >> (-sh & 63)); 259 } else { 260 return (a >> sh) | (b << (-sh & 63)); 261 } 262} 263 264/** 265 * load_atom_extract_al8_or_exit: 266 * @cpu: generic cpu state 267 * @ra: host unwind address 268 * @pv: host address 269 * @s: object size in bytes, @s <= 4. 270 * 271 * Atomically load @s bytes from @p, when p % s != 0, and [p, p+s-1] does 272 * not cross an 8-byte boundary. This means that we can perform an atomic 273 * 8-byte load and extract. 274 * The value is returned in the low bits of a uint32_t. 275 */ 276static uint32_t load_atom_extract_al8_or_exit(CPUState *cpu, uintptr_t ra, 277 void *pv, int s) 278{ 279 uintptr_t pi = (uintptr_t)pv; 280 int o = pi & 7; 281 int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8; 282 283 pv = (void *)(pi & ~7); 284 return load_atomic8_or_exit(cpu, ra, pv) >> shr; 285} 286 287/** 288 * load_atom_extract_al16_or_exit: 289 * @cpu: generic cpu state 290 * @ra: host unwind address 291 * @p: host address 292 * @s: object size in bytes, @s <= 8. 293 * 294 * Atomically load @s bytes from @p, when p % 16 < 8 295 * and p % 16 + s > 8. I.e. does not cross a 16-byte 296 * boundary, but *does* cross an 8-byte boundary. 297 * This is the slow version, so we must have eliminated 298 * any faster load_atom_extract_al8_or_exit case. 299 * 300 * If this is not possible, longjmp out to restart serially. 301 */ 302static uint64_t load_atom_extract_al16_or_exit(CPUState *cpu, uintptr_t ra, 303 void *pv, int s) 304{ 305 uintptr_t pi = (uintptr_t)pv; 306 int o = pi & 7; 307 int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8; 308 Int128 r; 309 310 /* 311 * Note constraints above: p & 8 must be clear. 312 * Provoke SIGBUS if possible otherwise. 313 */ 314 pv = (void *)(pi & ~7); 315 r = load_atomic16_or_exit(cpu, ra, pv); 316 317 r = int128_urshift(r, shr); 318 return int128_getlo(r); 319} 320 321/** 322 * load_atom_4_by_2: 323 * @pv: host address 324 * 325 * Load 4 bytes from @pv, with two 2-byte atomic loads. 326 */ 327static inline uint32_t load_atom_4_by_2(void *pv) 328{ 329 uint32_t a = load_atomic2(pv); 330 uint32_t b = load_atomic2(pv + 2); 331 332 if (HOST_BIG_ENDIAN) { 333 return (a << 16) | b; 334 } else { 335 return (b << 16) | a; 336 } 337} 338 339/** 340 * load_atom_8_by_2: 341 * @pv: host address 342 * 343 * Load 8 bytes from @pv, with four 2-byte atomic loads. 344 */ 345static inline uint64_t load_atom_8_by_2(void *pv) 346{ 347 uint32_t a = load_atom_4_by_2(pv); 348 uint32_t b = load_atom_4_by_2(pv + 4); 349 350 if (HOST_BIG_ENDIAN) { 351 return ((uint64_t)a << 32) | b; 352 } else { 353 return ((uint64_t)b << 32) | a; 354 } 355} 356 357/** 358 * load_atom_8_by_4: 359 * @pv: host address 360 * 361 * Load 8 bytes from @pv, with two 4-byte atomic loads. 362 */ 363static inline uint64_t load_atom_8_by_4(void *pv) 364{ 365 uint32_t a = load_atomic4(pv); 366 uint32_t b = load_atomic4(pv + 4); 367 368 if (HOST_BIG_ENDIAN) { 369 return ((uint64_t)a << 32) | b; 370 } else { 371 return ((uint64_t)b << 32) | a; 372 } 373} 374 375/** 376 * load_atom_8_by_8_or_4: 377 * @pv: host address 378 * 379 * Load 8 bytes from aligned @pv, with at least 4-byte atomicity. 380 */ 381static inline uint64_t load_atom_8_by_8_or_4(void *pv) 382{ 383 if (HAVE_al8_fast) { 384 return load_atomic8(pv); 385 } else { 386 return load_atom_8_by_4(pv); 387 } 388} 389 390/** 391 * load_atom_2: 392 * @p: host address 393 * @memop: the full memory op 394 * 395 * Load 2 bytes from @p, honoring the atomicity of @memop. 396 */ 397static uint16_t load_atom_2(CPUState *cpu, uintptr_t ra, 398 void *pv, MemOp memop) 399{ 400 uintptr_t pi = (uintptr_t)pv; 401 int atmax; 402 403 if (likely((pi & 1) == 0)) { 404 return load_atomic2(pv); 405 } 406 if (HAVE_ATOMIC128_RO) { 407 intptr_t left_in_page = -(pi | TARGET_PAGE_MASK); 408 if (likely(left_in_page > 8)) { 409 return load_atom_extract_al16_or_al8(pv, 2); 410 } 411 } 412 413 atmax = required_atomicity(cpu, pi, memop); 414 switch (atmax) { 415 case MO_8: 416 return lduw_he_p(pv); 417 case MO_16: 418 /* The only case remaining is MO_ATOM_WITHIN16. */ 419 if (!HAVE_al8_fast && (pi & 3) == 1) { 420 /* Big or little endian, we want the middle two bytes. */ 421 return load_atomic4(pv - 1) >> 8; 422 } 423 if ((pi & 15) != 7) { 424 return load_atom_extract_al8_or_exit(cpu, ra, pv, 2); 425 } 426 return load_atom_extract_al16_or_exit(cpu, ra, pv, 2); 427 default: 428 g_assert_not_reached(); 429 } 430} 431 432/** 433 * load_atom_4: 434 * @p: host address 435 * @memop: the full memory op 436 * 437 * Load 4 bytes from @p, honoring the atomicity of @memop. 438 */ 439static uint32_t load_atom_4(CPUState *cpu, uintptr_t ra, 440 void *pv, MemOp memop) 441{ 442 uintptr_t pi = (uintptr_t)pv; 443 int atmax; 444 445 if (likely((pi & 3) == 0)) { 446 return load_atomic4(pv); 447 } 448 if (HAVE_ATOMIC128_RO) { 449 intptr_t left_in_page = -(pi | TARGET_PAGE_MASK); 450 if (likely(left_in_page > 8)) { 451 return load_atom_extract_al16_or_al8(pv, 4); 452 } 453 } 454 455 atmax = required_atomicity(cpu, pi, memop); 456 switch (atmax) { 457 case MO_8: 458 case MO_16: 459 case -MO_16: 460 /* 461 * For MO_ATOM_IFALIGN, this is more atomicity than required, 462 * but it's trivially supported on all hosts, better than 4 463 * individual byte loads (when the host requires alignment), 464 * and overlaps with the MO_ATOM_SUBALIGN case of p % 2 == 0. 465 */ 466 return load_atom_extract_al4x2(pv); 467 case MO_32: 468 if (!(pi & 4)) { 469 return load_atom_extract_al8_or_exit(cpu, ra, pv, 4); 470 } 471 return load_atom_extract_al16_or_exit(cpu, ra, pv, 4); 472 default: 473 g_assert_not_reached(); 474 } 475} 476 477/** 478 * load_atom_8: 479 * @p: host address 480 * @memop: the full memory op 481 * 482 * Load 8 bytes from @p, honoring the atomicity of @memop. 483 */ 484static uint64_t load_atom_8(CPUState *cpu, uintptr_t ra, 485 void *pv, MemOp memop) 486{ 487 uintptr_t pi = (uintptr_t)pv; 488 int atmax; 489 490 /* 491 * If the host does not support 8-byte atomics, wait until we have 492 * examined the atomicity parameters below. 493 */ 494 if (HAVE_al8 && likely((pi & 7) == 0)) { 495 return load_atomic8(pv); 496 } 497 if (HAVE_ATOMIC128_RO) { 498 return load_atom_extract_al16_or_al8(pv, 8); 499 } 500 501 atmax = required_atomicity(cpu, pi, memop); 502 if (atmax == MO_64) { 503 if (!HAVE_al8 && (pi & 7) == 0) { 504 load_atomic8_or_exit(cpu, ra, pv); 505 } 506 return load_atom_extract_al16_or_exit(cpu, ra, pv, 8); 507 } 508 if (HAVE_al8_fast) { 509 return load_atom_extract_al8x2(pv); 510 } 511 switch (atmax) { 512 case MO_8: 513 return ldq_he_p(pv); 514 case MO_16: 515 return load_atom_8_by_2(pv); 516 case MO_32: 517 return load_atom_8_by_4(pv); 518 case -MO_32: 519 if (HAVE_al8) { 520 return load_atom_extract_al8x2(pv); 521 } 522 cpu_loop_exit_atomic(cpu, ra); 523 default: 524 g_assert_not_reached(); 525 } 526} 527 528/** 529 * load_atom_16: 530 * @p: host address 531 * @memop: the full memory op 532 * 533 * Load 16 bytes from @p, honoring the atomicity of @memop. 534 */ 535static Int128 load_atom_16(CPUState *cpu, uintptr_t ra, 536 void *pv, MemOp memop) 537{ 538 uintptr_t pi = (uintptr_t)pv; 539 int atmax; 540 Int128 r; 541 uint64_t a, b; 542 543 /* 544 * If the host does not support 16-byte atomics, wait until we have 545 * examined the atomicity parameters below. 546 */ 547 if (HAVE_ATOMIC128_RO && likely((pi & 15) == 0)) { 548 return atomic16_read_ro(pv); 549 } 550 551 atmax = required_atomicity(cpu, pi, memop); 552 switch (atmax) { 553 case MO_8: 554 memcpy(&r, pv, 16); 555 return r; 556 case MO_16: 557 a = load_atom_8_by_2(pv); 558 b = load_atom_8_by_2(pv + 8); 559 break; 560 case MO_32: 561 a = load_atom_8_by_4(pv); 562 b = load_atom_8_by_4(pv + 8); 563 break; 564 case MO_64: 565 if (!HAVE_al8) { 566 cpu_loop_exit_atomic(cpu, ra); 567 } 568 a = load_atomic8(pv); 569 b = load_atomic8(pv + 8); 570 break; 571 case -MO_64: 572 if (!HAVE_al8) { 573 cpu_loop_exit_atomic(cpu, ra); 574 } 575 a = load_atom_extract_al8x2(pv); 576 b = load_atom_extract_al8x2(pv + 8); 577 break; 578 case MO_128: 579 return load_atomic16_or_exit(cpu, ra, pv); 580 default: 581 g_assert_not_reached(); 582 } 583 return int128_make128(HOST_BIG_ENDIAN ? b : a, HOST_BIG_ENDIAN ? a : b); 584} 585 586/** 587 * store_atomic2: 588 * @pv: host address 589 * @val: value to store 590 * 591 * Atomically store 2 aligned bytes to @pv. 592 */ 593static inline void store_atomic2(void *pv, uint16_t val) 594{ 595 uint16_t *p = __builtin_assume_aligned(pv, 2); 596 qatomic_set(p, val); 597} 598 599/** 600 * store_atomic4: 601 * @pv: host address 602 * @val: value to store 603 * 604 * Atomically store 4 aligned bytes to @pv. 605 */ 606static inline void store_atomic4(void *pv, uint32_t val) 607{ 608 uint32_t *p = __builtin_assume_aligned(pv, 4); 609 qatomic_set(p, val); 610} 611 612/** 613 * store_atomic8: 614 * @pv: host address 615 * @val: value to store 616 * 617 * Atomically store 8 aligned bytes to @pv. 618 */ 619static inline void store_atomic8(void *pv, uint64_t val) 620{ 621 uint64_t *p = __builtin_assume_aligned(pv, 8); 622 623 qemu_build_assert(HAVE_al8); 624 qatomic_set__nocheck(p, val); 625} 626 627/** 628 * store_atom_4x2 629 */ 630static inline void store_atom_4_by_2(void *pv, uint32_t val) 631{ 632 store_atomic2(pv, val >> (HOST_BIG_ENDIAN ? 16 : 0)); 633 store_atomic2(pv + 2, val >> (HOST_BIG_ENDIAN ? 0 : 16)); 634} 635 636/** 637 * store_atom_8_by_2 638 */ 639static inline void store_atom_8_by_2(void *pv, uint64_t val) 640{ 641 store_atom_4_by_2(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0)); 642 store_atom_4_by_2(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32)); 643} 644 645/** 646 * store_atom_8_by_4 647 */ 648static inline void store_atom_8_by_4(void *pv, uint64_t val) 649{ 650 store_atomic4(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0)); 651 store_atomic4(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32)); 652} 653 654/** 655 * store_atom_insert_al4: 656 * @p: host address 657 * @val: shifted value to store 658 * @msk: mask for value to store 659 * 660 * Atomically store @val to @p, masked by @msk. 661 */ 662static void store_atom_insert_al4(uint32_t *p, uint32_t val, uint32_t msk) 663{ 664 uint32_t old, new; 665 666 p = __builtin_assume_aligned(p, 4); 667 old = qatomic_read(p); 668 do { 669 new = (old & ~msk) | val; 670 } while (!__atomic_compare_exchange_n(p, &old, new, true, 671 __ATOMIC_RELAXED, __ATOMIC_RELAXED)); 672} 673 674/** 675 * store_atom_insert_al8: 676 * @p: host address 677 * @val: shifted value to store 678 * @msk: mask for value to store 679 * 680 * Atomically store @val to @p masked by @msk. 681 */ 682static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk) 683{ 684 uint64_t old, new; 685 686 qemu_build_assert(HAVE_al8); 687 p = __builtin_assume_aligned(p, 8); 688 old = qatomic_read__nocheck(p); 689 do { 690 new = (old & ~msk) | val; 691 } while (!__atomic_compare_exchange_n(p, &old, new, true, 692 __ATOMIC_RELAXED, __ATOMIC_RELAXED)); 693} 694 695/** 696 * store_bytes_leN: 697 * @pv: host address 698 * @size: number of bytes to store 699 * @val_le: data to store 700 * 701 * Store @size bytes at @p. The bytes to store are extracted in little-endian order 702 * from @val_le; return the bytes of @val_le beyond @size that have not been stored. 703 */ 704static uint64_t store_bytes_leN(void *pv, int size, uint64_t val_le) 705{ 706 uint8_t *p = pv; 707 for (int i = 0; i < size; i++, val_le >>= 8) { 708 p[i] = val_le; 709 } 710 return val_le; 711} 712 713/** 714 * store_parts_leN 715 * @pv: host address 716 * @size: number of bytes to store 717 * @val_le: data to store 718 * 719 * As store_bytes_leN, but atomically on each aligned part. 720 */ 721G_GNUC_UNUSED 722static uint64_t store_parts_leN(void *pv, int size, uint64_t val_le) 723{ 724 do { 725 int n; 726 727 /* Find minimum of alignment and size */ 728 switch (((uintptr_t)pv | size) & 7) { 729 case 4: 730 store_atomic4(pv, le32_to_cpu(val_le)); 731 val_le >>= 32; 732 n = 4; 733 break; 734 case 2: 735 case 6: 736 store_atomic2(pv, le16_to_cpu(val_le)); 737 val_le >>= 16; 738 n = 2; 739 break; 740 default: 741 *(uint8_t *)pv = val_le; 742 val_le >>= 8; 743 n = 1; 744 break; 745 case 0: 746 g_assert_not_reached(); 747 } 748 pv += n; 749 size -= n; 750 } while (size != 0); 751 752 return val_le; 753} 754 755/** 756 * store_whole_le4 757 * @pv: host address 758 * @size: number of bytes to store 759 * @val_le: data to store 760 * 761 * As store_bytes_leN, but atomically as a whole. 762 * Four aligned bytes are guaranteed to cover the store. 763 */ 764static uint64_t store_whole_le4(void *pv, int size, uint64_t val_le) 765{ 766 int sz = size * 8; 767 int o = (uintptr_t)pv & 3; 768 int sh = o * 8; 769 uint32_t m = MAKE_64BIT_MASK(0, sz); 770 uint32_t v; 771 772 if (HOST_BIG_ENDIAN) { 773 v = bswap32(val_le) >> sh; 774 m = bswap32(m) >> sh; 775 } else { 776 v = val_le << sh; 777 m <<= sh; 778 } 779 store_atom_insert_al4(pv - o, v, m); 780 return val_le >> sz; 781} 782 783/** 784 * store_whole_le8 785 * @pv: host address 786 * @size: number of bytes to store 787 * @val_le: data to store 788 * 789 * As store_bytes_leN, but atomically as a whole. 790 * Eight aligned bytes are guaranteed to cover the store. 791 */ 792static uint64_t store_whole_le8(void *pv, int size, uint64_t val_le) 793{ 794 int sz = size * 8; 795 int o = (uintptr_t)pv & 7; 796 int sh = o * 8; 797 uint64_t m = MAKE_64BIT_MASK(0, sz); 798 uint64_t v; 799 800 qemu_build_assert(HAVE_al8); 801 if (HOST_BIG_ENDIAN) { 802 v = bswap64(val_le) >> sh; 803 m = bswap64(m) >> sh; 804 } else { 805 v = val_le << sh; 806 m <<= sh; 807 } 808 store_atom_insert_al8(pv - o, v, m); 809 return val_le >> sz; 810} 811 812/** 813 * store_whole_le16 814 * @pv: host address 815 * @size: number of bytes to store 816 * @val_le: data to store 817 * 818 * As store_bytes_leN, but atomically as a whole. 819 * 16 aligned bytes are guaranteed to cover the store. 820 */ 821static uint64_t store_whole_le16(void *pv, int size, Int128 val_le) 822{ 823 int sz = size * 8; 824 int o = (uintptr_t)pv & 15; 825 int sh = o * 8; 826 Int128 m, v; 827 828 qemu_build_assert(HAVE_CMPXCHG128); 829 830 /* Like MAKE_64BIT_MASK(0, sz), but larger. */ 831 if (sz <= 64) { 832 m = int128_make64(MAKE_64BIT_MASK(0, sz)); 833 } else { 834 m = int128_make128(-1, MAKE_64BIT_MASK(0, sz - 64)); 835 } 836 837 if (HOST_BIG_ENDIAN) { 838 v = int128_urshift(bswap128(val_le), sh); 839 m = int128_urshift(bswap128(m), sh); 840 } else { 841 v = int128_lshift(val_le, sh); 842 m = int128_lshift(m, sh); 843 } 844 store_atom_insert_al16(pv - o, v, m); 845 846 if (sz <= 64) { 847 return 0; 848 } 849 return int128_gethi(val_le) >> (sz - 64); 850} 851 852/** 853 * store_atom_2: 854 * @p: host address 855 * @val: the value to store 856 * @memop: the full memory op 857 * 858 * Store 2 bytes to @p, honoring the atomicity of @memop. 859 */ 860static void store_atom_2(CPUState *cpu, uintptr_t ra, 861 void *pv, MemOp memop, uint16_t val) 862{ 863 uintptr_t pi = (uintptr_t)pv; 864 int atmax; 865 866 if (likely((pi & 1) == 0)) { 867 store_atomic2(pv, val); 868 return; 869 } 870 871 atmax = required_atomicity(cpu, pi, memop); 872 if (atmax == MO_8) { 873 stw_he_p(pv, val); 874 return; 875 } 876 877 /* 878 * The only case remaining is MO_ATOM_WITHIN16. 879 * Big or little endian, we want the middle two bytes in each test. 880 */ 881 if ((pi & 3) == 1) { 882 store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16)); 883 return; 884 } else if ((pi & 7) == 3) { 885 if (HAVE_al8) { 886 store_atom_insert_al8(pv - 3, (uint64_t)val << 24, MAKE_64BIT_MASK(24, 16)); 887 return; 888 } 889 } else if ((pi & 15) == 7) { 890 if (HAVE_CMPXCHG128) { 891 Int128 v = int128_lshift(int128_make64(val), 56); 892 Int128 m = int128_lshift(int128_make64(0xffff), 56); 893 store_atom_insert_al16(pv - 7, v, m); 894 return; 895 } 896 } else { 897 g_assert_not_reached(); 898 } 899 900 cpu_loop_exit_atomic(cpu, ra); 901} 902 903/** 904 * store_atom_4: 905 * @p: host address 906 * @val: the value to store 907 * @memop: the full memory op 908 * 909 * Store 4 bytes to @p, honoring the atomicity of @memop. 910 */ 911static void store_atom_4(CPUState *cpu, uintptr_t ra, 912 void *pv, MemOp memop, uint32_t val) 913{ 914 uintptr_t pi = (uintptr_t)pv; 915 int atmax; 916 917 if (likely((pi & 3) == 0)) { 918 store_atomic4(pv, val); 919 return; 920 } 921 922 atmax = required_atomicity(cpu, pi, memop); 923 switch (atmax) { 924 case MO_8: 925 stl_he_p(pv, val); 926 return; 927 case MO_16: 928 store_atom_4_by_2(pv, val); 929 return; 930 case -MO_16: 931 { 932 uint32_t val_le = cpu_to_le32(val); 933 int s2 = pi & 3; 934 int s1 = 4 - s2; 935 936 switch (s2) { 937 case 1: 938 val_le = store_whole_le4(pv, s1, val_le); 939 *(uint8_t *)(pv + 3) = val_le; 940 break; 941 case 3: 942 *(uint8_t *)pv = val_le; 943 store_whole_le4(pv + 1, s2, val_le >> 8); 944 break; 945 case 0: /* aligned */ 946 case 2: /* atmax MO_16 */ 947 default: 948 g_assert_not_reached(); 949 } 950 } 951 return; 952 case MO_32: 953 if ((pi & 7) < 4) { 954 if (HAVE_al8) { 955 store_whole_le8(pv, 4, cpu_to_le32(val)); 956 return; 957 } 958 } else { 959 if (HAVE_CMPXCHG128) { 960 store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val))); 961 return; 962 } 963 } 964 cpu_loop_exit_atomic(cpu, ra); 965 default: 966 g_assert_not_reached(); 967 } 968} 969 970/** 971 * store_atom_8: 972 * @p: host address 973 * @val: the value to store 974 * @memop: the full memory op 975 * 976 * Store 8 bytes to @p, honoring the atomicity of @memop. 977 */ 978static void store_atom_8(CPUState *cpu, uintptr_t ra, 979 void *pv, MemOp memop, uint64_t val) 980{ 981 uintptr_t pi = (uintptr_t)pv; 982 int atmax; 983 984 if (HAVE_al8 && likely((pi & 7) == 0)) { 985 store_atomic8(pv, val); 986 return; 987 } 988 989 atmax = required_atomicity(cpu, pi, memop); 990 switch (atmax) { 991 case MO_8: 992 stq_he_p(pv, val); 993 return; 994 case MO_16: 995 store_atom_8_by_2(pv, val); 996 return; 997 case MO_32: 998 store_atom_8_by_4(pv, val); 999 return; 1000 case -MO_32: 1001 if (HAVE_al8) { 1002 uint64_t val_le = cpu_to_le64(val); 1003 int s2 = pi & 7; 1004 int s1 = 8 - s2; 1005 1006 switch (s2) { 1007 case 1 ... 3: 1008 val_le = store_whole_le8(pv, s1, val_le); 1009 store_bytes_leN(pv + s1, s2, val_le); 1010 break; 1011 case 5 ... 7: 1012 val_le = store_bytes_leN(pv, s1, val_le); 1013 store_whole_le8(pv + s1, s2, val_le); 1014 break; 1015 case 0: /* aligned */ 1016 case 4: /* atmax MO_32 */ 1017 default: 1018 g_assert_not_reached(); 1019 } 1020 return; 1021 } 1022 break; 1023 case MO_64: 1024 if (HAVE_CMPXCHG128) { 1025 store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val))); 1026 return; 1027 } 1028 break; 1029 default: 1030 g_assert_not_reached(); 1031 } 1032 cpu_loop_exit_atomic(cpu, ra); 1033} 1034 1035/** 1036 * store_atom_16: 1037 * @p: host address 1038 * @val: the value to store 1039 * @memop: the full memory op 1040 * 1041 * Store 16 bytes to @p, honoring the atomicity of @memop. 1042 */ 1043static void store_atom_16(CPUState *cpu, uintptr_t ra, 1044 void *pv, MemOp memop, Int128 val) 1045{ 1046 uintptr_t pi = (uintptr_t)pv; 1047 uint64_t a, b; 1048 int atmax; 1049 1050 if (HAVE_ATOMIC128_RW && likely((pi & 15) == 0)) { 1051 atomic16_set(pv, val); 1052 return; 1053 } 1054 1055 atmax = required_atomicity(cpu, pi, memop); 1056 1057 a = HOST_BIG_ENDIAN ? int128_gethi(val) : int128_getlo(val); 1058 b = HOST_BIG_ENDIAN ? int128_getlo(val) : int128_gethi(val); 1059 switch (atmax) { 1060 case MO_8: 1061 memcpy(pv, &val, 16); 1062 return; 1063 case MO_16: 1064 store_atom_8_by_2(pv, a); 1065 store_atom_8_by_2(pv + 8, b); 1066 return; 1067 case MO_32: 1068 store_atom_8_by_4(pv, a); 1069 store_atom_8_by_4(pv + 8, b); 1070 return; 1071 case MO_64: 1072 if (HAVE_al8) { 1073 store_atomic8(pv, a); 1074 store_atomic8(pv + 8, b); 1075 return; 1076 } 1077 break; 1078 case -MO_64: 1079 if (HAVE_CMPXCHG128) { 1080 uint64_t val_le; 1081 int s2 = pi & 15; 1082 int s1 = 16 - s2; 1083 1084 if (HOST_BIG_ENDIAN) { 1085 val = bswap128(val); 1086 } 1087 switch (s2) { 1088 case 1 ... 7: 1089 val_le = store_whole_le16(pv, s1, val); 1090 store_bytes_leN(pv + s1, s2, val_le); 1091 break; 1092 case 9 ... 15: 1093 store_bytes_leN(pv, s1, int128_getlo(val)); 1094 val = int128_urshift(val, s1 * 8); 1095 store_whole_le16(pv + s1, s2, val); 1096 break; 1097 case 0: /* aligned */ 1098 case 8: /* atmax MO_64 */ 1099 default: 1100 g_assert_not_reached(); 1101 } 1102 return; 1103 } 1104 break; 1105 case MO_128: 1106 break; 1107 default: 1108 g_assert_not_reached(); 1109 } 1110 cpu_loop_exit_atomic(cpu, ra); 1111} 1112