1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * xsave/xrstor support. 4 * 5 * Author: Suresh Siddha <suresh.b.siddha@intel.com> 6 */ 7 #include <linux/bitops.h> 8 #include <linux/compat.h> 9 #include <linux/cpu.h> 10 #include <linux/mman.h> 11 #include <linux/nospec.h> 12 #include <linux/pkeys.h> 13 #include <linux/seq_file.h> 14 #include <linux/proc_fs.h> 15 #include <linux/vmalloc.h> 16 17 #include <asm/fpu/api.h> 18 #include <asm/fpu/regset.h> 19 #include <asm/fpu/signal.h> 20 #include <asm/fpu/xcr.h> 21 22 #include <asm/tlbflush.h> 23 #include <asm/prctl.h> 24 #include <asm/elf.h> 25 26 #include "context.h" 27 #include "internal.h" 28 #include "legacy.h" 29 #include "xstate.h" 30 31 #define for_each_extended_xfeature(bit, mask) \ 32 (bit) = FIRST_EXTENDED_XFEATURE; \ 33 for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) 34 35 /* 36 * Although we spell it out in here, the Processor Trace 37 * xfeature is completely unused. We use other mechanisms 38 * to save/restore PT state in Linux. 
39 */ 40 static const char *xfeature_names[] = 41 { 42 "x87 floating point registers" , 43 "SSE registers" , 44 "AVX registers" , 45 "MPX bounds registers" , 46 "MPX CSR" , 47 "AVX-512 opmask" , 48 "AVX-512 Hi256" , 49 "AVX-512 ZMM_Hi256" , 50 "Processor Trace (unused)" , 51 "Protection Keys User registers", 52 "PASID state", 53 "unknown xstate feature" , 54 "unknown xstate feature" , 55 "unknown xstate feature" , 56 "unknown xstate feature" , 57 "unknown xstate feature" , 58 "unknown xstate feature" , 59 "AMX Tile config" , 60 "AMX Tile data" , 61 "unknown xstate feature" , 62 }; 63 64 static unsigned short xsave_cpuid_features[] __initdata = { 65 [XFEATURE_FP] = X86_FEATURE_FPU, 66 [XFEATURE_SSE] = X86_FEATURE_XMM, 67 [XFEATURE_YMM] = X86_FEATURE_AVX, 68 [XFEATURE_BNDREGS] = X86_FEATURE_MPX, 69 [XFEATURE_BNDCSR] = X86_FEATURE_MPX, 70 [XFEATURE_OPMASK] = X86_FEATURE_AVX512F, 71 [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, 72 [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, 73 [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, 74 [XFEATURE_PKRU] = X86_FEATURE_PKU, 75 [XFEATURE_PASID] = X86_FEATURE_ENQCMD, 76 [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, 77 [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, 78 }; 79 80 static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = 81 { [ 0 ... XFEATURE_MAX - 1] = -1}; 82 static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = 83 { [ 0 ... XFEATURE_MAX - 1] = -1}; 84 static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; 85 86 #define XSTATE_FLAG_SUPERVISOR BIT(0) 87 #define XSTATE_FLAG_ALIGNED64 BIT(1) 88 89 /* 90 * Return whether the system supports a given xfeature. 
91 * 92 * Also return the name of the (most advanced) feature that the caller requested: 93 */ 94 int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) 95 { 96 u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features; 97 98 if (unlikely(feature_name)) { 99 long xfeature_idx, max_idx; 100 u64 xfeatures_print; 101 /* 102 * So we use FLS here to be able to print the most advanced 103 * feature that was requested but is missing. So if a driver 104 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the 105 * missing AVX feature - this is the most informative message 106 * to users: 107 */ 108 if (xfeatures_missing) 109 xfeatures_print = xfeatures_missing; 110 else 111 xfeatures_print = xfeatures_needed; 112 113 xfeature_idx = fls64(xfeatures_print)-1; 114 max_idx = ARRAY_SIZE(xfeature_names)-1; 115 xfeature_idx = min(xfeature_idx, max_idx); 116 117 *feature_name = xfeature_names[xfeature_idx]; 118 } 119 120 if (xfeatures_missing) 121 return 0; 122 123 return 1; 124 } 125 EXPORT_SYMBOL_GPL(cpu_has_xfeatures); 126 127 static bool xfeature_is_aligned64(int xfeature_nr) 128 { 129 return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64; 130 } 131 132 static bool xfeature_is_supervisor(int xfeature_nr) 133 { 134 return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR; 135 } 136 137 static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) 138 { 139 unsigned int offs, i; 140 141 /* 142 * Non-compacted format and legacy features use the cached fixed 143 * offsets. 144 */ 145 if (!cpu_feature_enabled(X86_FEATURE_XSAVES) || xfeature <= XFEATURE_SSE) 146 return xstate_offsets[xfeature]; 147 148 /* 149 * Compacted format offsets depend on the actual content of the 150 * compacted xsave area which is determined by the xcomp_bv header 151 * field. 
152 */ 153 offs = FXSAVE_SIZE + XSAVE_HDR_SIZE; 154 for_each_extended_xfeature(i, xcomp_bv) { 155 if (xfeature_is_aligned64(i)) 156 offs = ALIGN(offs, 64); 157 if (i == xfeature) 158 break; 159 offs += xstate_sizes[i]; 160 } 161 return offs; 162 } 163 164 /* 165 * Enable the extended processor state save/restore feature. 166 * Called once per CPU onlining. 167 */ 168 void fpu__init_cpu_xstate(void) 169 { 170 if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features) 171 return; 172 173 cr4_set_bits(X86_CR4_OSXSAVE); 174 175 /* 176 * Must happen after CR4 setup and before xsetbv() to allow KVM 177 * lazy passthrough. Write independent of the dynamic state static 178 * key as that does not work on the boot CPU. This also ensures 179 * that any stale state is wiped out from XFD. 180 */ 181 if (cpu_feature_enabled(X86_FEATURE_XFD)) 182 wrmsrl(MSR_IA32_XFD, init_fpstate.xfd); 183 184 /* 185 * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features 186 * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user 187 * states can be set here. 188 */ 189 xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); 190 191 /* 192 * MSR_IA32_XSS sets supervisor states managed by XSAVES. 193 */ 194 if (boot_cpu_has(X86_FEATURE_XSAVES)) { 195 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | 196 xfeatures_mask_independent()); 197 } 198 } 199 200 static bool xfeature_enabled(enum xfeature xfeature) 201 { 202 return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); 203 } 204 205 /* 206 * Record the offsets and sizes of various xstates contained 207 * in the XSAVE state memory layout. 208 */ 209 static void __init setup_xstate_cache(void) 210 { 211 u32 eax, ebx, ecx, edx, i; 212 /* start at the beginning of the "extended state" */ 213 unsigned int last_good_offset = offsetof(struct xregs_state, 214 extended_state_area); 215 /* 216 * The FP xstates and SSE xstates are legacy states. 
They are always 217 * in the fixed offsets in the xsave area in either compacted form 218 * or standard form. 219 */ 220 xstate_offsets[XFEATURE_FP] = 0; 221 xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, 222 xmm_space); 223 224 xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; 225 xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, 226 xmm_space); 227 228 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { 229 cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); 230 231 xstate_sizes[i] = eax; 232 xstate_flags[i] = ecx; 233 234 /* 235 * If an xfeature is supervisor state, the offset in EBX is 236 * invalid, leave it to -1. 237 */ 238 if (xfeature_is_supervisor(i)) 239 continue; 240 241 xstate_offsets[i] = ebx; 242 243 /* 244 * In our xstate size checks, we assume that the highest-numbered 245 * xstate feature has the highest offset in the buffer. Ensure 246 * it does. 247 */ 248 WARN_ONCE(last_good_offset > xstate_offsets[i], 249 "x86/fpu: misordered xstate at %d\n", last_good_offset); 250 251 last_good_offset = xstate_offsets[i]; 252 } 253 } 254 255 static void __init print_xstate_feature(u64 xstate_mask) 256 { 257 const char *feature_name; 258 259 if (cpu_has_xfeatures(xstate_mask, &feature_name)) 260 pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name); 261 } 262 263 /* 264 * Print out all the supported xstate features: 265 */ 266 static void __init print_xstate_features(void) 267 { 268 print_xstate_feature(XFEATURE_MASK_FP); 269 print_xstate_feature(XFEATURE_MASK_SSE); 270 print_xstate_feature(XFEATURE_MASK_YMM); 271 print_xstate_feature(XFEATURE_MASK_BNDREGS); 272 print_xstate_feature(XFEATURE_MASK_BNDCSR); 273 print_xstate_feature(XFEATURE_MASK_OPMASK); 274 print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); 275 print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); 276 print_xstate_feature(XFEATURE_MASK_PKRU); 277 print_xstate_feature(XFEATURE_MASK_PASID); 278 
print_xstate_feature(XFEATURE_MASK_XTILE_CFG); 279 print_xstate_feature(XFEATURE_MASK_XTILE_DATA); 280 } 281 282 /* 283 * This check is important because it is easy to get XSTATE_* 284 * confused with XSTATE_BIT_*. 285 */ 286 #define CHECK_XFEATURE(nr) do { \ 287 WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ 288 WARN_ON(nr >= XFEATURE_MAX); \ 289 } while (0) 290 291 /* 292 * Print out xstate component offsets and sizes 293 */ 294 static void __init print_xstate_offset_size(void) 295 { 296 int i; 297 298 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { 299 pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", 300 i, xfeature_get_offset(fpu_kernel_cfg.max_features, i), 301 i, xstate_sizes[i]); 302 } 303 } 304 305 /* 306 * This function is called only during boot time when x86 caps are not set 307 * up and alternative can not be used yet. 308 */ 309 static __init void os_xrstor_booting(struct xregs_state *xstate) 310 { 311 u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE; 312 u32 lmask = mask; 313 u32 hmask = mask >> 32; 314 int err; 315 316 if (cpu_feature_enabled(X86_FEATURE_XSAVES)) 317 XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); 318 else 319 XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); 320 321 /* 322 * We should never fault when copying from a kernel buffer, and the FPU 323 * state we set at boot time should be valid. 324 */ 325 WARN_ON_FPU(err); 326 } 327 328 /* 329 * All supported features have either init state all zeros or are 330 * handled in setup_init_fpu() individually. This is an explicit 331 * feature list and does not use XFEATURE_MASK*SUPPORTED to catch 332 * newly added supported features at build time and make people 333 * actually look at the init state for the new feature. 
334 */ 335 #define XFEATURES_INIT_FPSTATE_HANDLED \ 336 (XFEATURE_MASK_FP | \ 337 XFEATURE_MASK_SSE | \ 338 XFEATURE_MASK_YMM | \ 339 XFEATURE_MASK_OPMASK | \ 340 XFEATURE_MASK_ZMM_Hi256 | \ 341 XFEATURE_MASK_Hi16_ZMM | \ 342 XFEATURE_MASK_PKRU | \ 343 XFEATURE_MASK_BNDREGS | \ 344 XFEATURE_MASK_BNDCSR | \ 345 XFEATURE_MASK_PASID | \ 346 XFEATURE_MASK_XTILE) 347 348 /* 349 * setup the xstate image representing the init state 350 */ 351 static void __init setup_init_fpu_buf(void) 352 { 353 BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | 354 XFEATURE_MASK_SUPERVISOR_SUPPORTED) != 355 XFEATURES_INIT_FPSTATE_HANDLED); 356 357 if (!boot_cpu_has(X86_FEATURE_XSAVE)) 358 return; 359 360 print_xstate_features(); 361 362 xstate_init_xcomp_bv(&init_fpstate.regs.xsave, fpu_kernel_cfg.max_features); 363 364 /* 365 * Init all the features state with header.xfeatures being 0x0 366 */ 367 os_xrstor_booting(&init_fpstate.regs.xsave); 368 369 /* 370 * All components are now in init state. Read the state back so 371 * that init_fpstate contains all non-zero init state. This only 372 * works with XSAVE, but not with XSAVEOPT and XSAVES because 373 * those use the init optimization which skips writing data for 374 * components in init state. 375 * 376 * XSAVE could be used, but that would require to reshuffle the 377 * data when XSAVES is available because XSAVES uses xstate 378 * compaction. But doing so is a pointless exercise because most 379 * components have an all zeros init state except for the legacy 380 * ones (FP and SSE). Those can be saved with FXSAVE into the 381 * legacy area. Adding new features requires to ensure that init 382 * state is all zeroes or if not to add the necessary handling 383 * here. 
384 */ 385 fxsave(&init_fpstate.regs.fxsave); 386 } 387 388 int xfeature_size(int xfeature_nr) 389 { 390 u32 eax, ebx, ecx, edx; 391 392 CHECK_XFEATURE(xfeature_nr); 393 cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); 394 return eax; 395 } 396 397 /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ 398 static int validate_user_xstate_header(const struct xstate_header *hdr, 399 struct fpstate *fpstate) 400 { 401 /* No unknown or supervisor features may be set */ 402 if (hdr->xfeatures & ~fpstate->user_xfeatures) 403 return -EINVAL; 404 405 /* Userspace must use the uncompacted format */ 406 if (hdr->xcomp_bv) 407 return -EINVAL; 408 409 /* 410 * If 'reserved' is shrunken to add a new field, make sure to validate 411 * that new field here! 412 */ 413 BUILD_BUG_ON(sizeof(hdr->reserved) != 48); 414 415 /* No reserved bits may be set */ 416 if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved))) 417 return -EINVAL; 418 419 return 0; 420 } 421 422 static void __init __xstate_dump_leaves(void) 423 { 424 int i; 425 u32 eax, ebx, ecx, edx; 426 static int should_dump = 1; 427 428 if (!should_dump) 429 return; 430 should_dump = 0; 431 /* 432 * Dump out a few leaves past the ones that we support 433 * just in case there are some goodies up there 434 */ 435 for (i = 0; i < XFEATURE_MAX + 10; i++) { 436 cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); 437 pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", 438 XSTATE_CPUID, i, eax, ebx, ecx, edx); 439 } 440 } 441 442 #define XSTATE_WARN_ON(x) do { \ 443 if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) { \ 444 __xstate_dump_leaves(); \ 445 } \ 446 } while (0) 447 448 #define XCHECK_SZ(sz, nr, nr_macro, __struct) do { \ 449 if ((nr == nr_macro) && \ 450 WARN_ONCE(sz != sizeof(__struct), \ 451 "%s: struct is %zu bytes, cpu state %d bytes\n", \ 452 __stringify(nr_macro), sizeof(__struct), sz)) { \ 453 __xstate_dump_leaves(); \ 454 } \ 455 } while (0) 456 457 
/** 458 * check_xtile_data_against_struct - Check tile data state size. 459 * 460 * Calculate the state size by multiplying the single tile size which is 461 * recorded in a C struct, and the number of tiles that the CPU informs. 462 * Compare the provided size with the calculation. 463 * 464 * @size: The tile data state size 465 * 466 * Returns: 0 on success, -EINVAL on mismatch. 467 */ 468 static int __init check_xtile_data_against_struct(int size) 469 { 470 u32 max_palid, palid, state_size; 471 u32 eax, ebx, ecx, edx; 472 u16 max_tile; 473 474 /* 475 * Check the maximum palette id: 476 * eax: the highest numbered palette subleaf. 477 */ 478 cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx); 479 480 /* 481 * Cross-check each tile size and find the maximum number of 482 * supported tiles. 483 */ 484 for (palid = 1, max_tile = 0; palid <= max_palid; palid++) { 485 u16 tile_size, max; 486 487 /* 488 * Check the tile size info: 489 * eax[31:16]: bytes per title 490 * ebx[31:16]: the max names (or max number of tiles) 491 */ 492 cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx); 493 tile_size = eax >> 16; 494 max = ebx >> 16; 495 496 if (tile_size != sizeof(struct xtile_data)) { 497 pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n", 498 __stringify(XFEATURE_XTILE_DATA), 499 sizeof(struct xtile_data), tile_size); 500 __xstate_dump_leaves(); 501 return -EINVAL; 502 } 503 504 if (max > max_tile) 505 max_tile = max; 506 } 507 508 state_size = sizeof(struct xtile_data) * max_tile; 509 if (size != state_size) { 510 pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n", 511 __stringify(XFEATURE_XTILE_DATA), state_size, size); 512 __xstate_dump_leaves(); 513 return -EINVAL; 514 } 515 return 0; 516 } 517 518 /* 519 * We have a C struct for each 'xstate'. We need to ensure 520 * that our software representation matches what the CPU 521 * tells us about the state's size. 
522 */ 523 static bool __init check_xstate_against_struct(int nr) 524 { 525 /* 526 * Ask the CPU for the size of the state. 527 */ 528 int sz = xfeature_size(nr); 529 /* 530 * Match each CPU state with the corresponding software 531 * structure. 532 */ 533 XCHECK_SZ(sz, nr, XFEATURE_YMM, struct ymmh_struct); 534 XCHECK_SZ(sz, nr, XFEATURE_BNDREGS, struct mpx_bndreg_state); 535 XCHECK_SZ(sz, nr, XFEATURE_BNDCSR, struct mpx_bndcsr_state); 536 XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state); 537 XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state); 538 XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); 539 XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state); 540 XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state); 541 XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg); 542 543 /* The tile data size varies between implementations. */ 544 if (nr == XFEATURE_XTILE_DATA) 545 check_xtile_data_against_struct(sz); 546 547 /* 548 * Make *SURE* to add any feature numbers in below if 549 * there are "holes" in the xsave state component 550 * numbers. 551 */ 552 if ((nr < XFEATURE_YMM) || 553 (nr >= XFEATURE_MAX) || 554 (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || 555 ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) { 556 WARN_ONCE(1, "no structure for xstate: %d\n", nr); 557 XSTATE_WARN_ON(1); 558 return false; 559 } 560 return true; 561 } 562 563 static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) 564 { 565 unsigned int topmost = fls64(xfeatures) - 1; 566 unsigned int offset = xstate_offsets[topmost]; 567 568 if (topmost <= XFEATURE_SSE) 569 return sizeof(struct xregs_state); 570 571 if (compacted) 572 offset = xfeature_get_offset(xfeatures, topmost); 573 return offset + xstate_sizes[topmost]; 574 } 575 576 /* 577 * This essentially double-checks what the cpu told us about 578 * how large the XSAVE buffer needs to be. We are recalculating 579 * it to be safe. 
580 * 581 * Independent XSAVE features allocate their own buffers and are not 582 * covered by these checks. Only the size of the buffer for task->fpu 583 * is checked here. 584 */ 585 static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) 586 { 587 bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); 588 unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; 589 int i; 590 591 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { 592 if (!check_xstate_against_struct(i)) 593 return false; 594 /* 595 * Supervisor state components can be managed only by 596 * XSAVES. 597 */ 598 if (!compacted && xfeature_is_supervisor(i)) { 599 XSTATE_WARN_ON(1); 600 return false; 601 } 602 } 603 size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); 604 XSTATE_WARN_ON(size != kernel_size); 605 return size == kernel_size; 606 } 607 608 /* 609 * Get total size of enabled xstates in XCR0 | IA32_XSS. 610 * 611 * Note the SDM's wording here. "sub-function 0" only enumerates 612 * the size of the *user* states. If we use it to size a buffer 613 * that we use 'XSAVES' on, we could potentially overflow the 614 * buffer because 'XSAVES' saves system states too. 615 */ 616 static unsigned int __init get_xsaves_size(void) 617 { 618 unsigned int eax, ebx, ecx, edx; 619 /* 620 * - CPUID function 0DH, sub-function 1: 621 * EBX enumerates the size (in bytes) required by 622 * the XSAVES instruction for an XSAVE area 623 * containing all the state components 624 * corresponding to bits currently set in 625 * XCR0 | IA32_XSS. 626 */ 627 cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); 628 return ebx; 629 } 630 631 /* 632 * Get the total size of the enabled xstates without the independent supervisor 633 * features. 634 */ 635 static unsigned int __init get_xsaves_size_no_independent(void) 636 { 637 u64 mask = xfeatures_mask_independent(); 638 unsigned int size; 639 640 if (!mask) 641 return get_xsaves_size(); 642 643 /* Disable independent features. 
*/ 644 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); 645 646 /* 647 * Ask the hardware what size is required of the buffer. 648 * This is the size required for the task->fpu buffer. 649 */ 650 size = get_xsaves_size(); 651 652 /* Re-enable independent features so XSAVES will work on them again. */ 653 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); 654 655 return size; 656 } 657 658 static unsigned int __init get_xsave_size_user(void) 659 { 660 unsigned int eax, ebx, ecx, edx; 661 /* 662 * - CPUID function 0DH, sub-function 0: 663 * EBX enumerates the size (in bytes) required by 664 * the XSAVE instruction for an XSAVE area 665 * containing all the *user* state components 666 * corresponding to bits currently set in XCR0. 667 */ 668 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); 669 return ebx; 670 } 671 672 /* 673 * Will the runtime-enumerated 'xstate_size' fit in the init 674 * task's statically-allocated buffer? 675 */ 676 static bool __init is_supported_xstate_size(unsigned int test_xstate_size) 677 { 678 if (test_xstate_size <= sizeof(init_fpstate.regs)) 679 return true; 680 681 pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n", 682 sizeof(init_fpstate.regs), test_xstate_size); 683 return false; 684 } 685 686 static int __init init_xstate_size(void) 687 { 688 /* Recompute the context size for enabled features: */ 689 unsigned int user_size, kernel_size, kernel_default_size; 690 bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); 691 692 /* Uncompacted user space size */ 693 user_size = get_xsave_size_user(); 694 695 /* 696 * XSAVES kernel size includes supervisor states and 697 * uses compacted format when available. 698 * 699 * XSAVE does not support supervisor states so 700 * kernel and user size is identical. 
701 */ 702 if (compacted) 703 kernel_size = get_xsaves_size_no_independent(); 704 else 705 kernel_size = user_size; 706 707 kernel_default_size = 708 xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); 709 710 /* Ensure we have the space to store all default enabled features. */ 711 if (!is_supported_xstate_size(kernel_default_size)) 712 return -EINVAL; 713 714 if (!paranoid_xstate_size_valid(kernel_size)) 715 return -EINVAL; 716 717 fpu_kernel_cfg.max_size = kernel_size; 718 fpu_user_cfg.max_size = user_size; 719 720 fpu_kernel_cfg.default_size = kernel_default_size; 721 fpu_user_cfg.default_size = 722 xstate_calculate_size(fpu_user_cfg.default_features, false); 723 724 return 0; 725 } 726 727 /* 728 * We enabled the XSAVE hardware, but something went wrong and 729 * we can not use it. Disable it. 730 */ 731 static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) 732 { 733 fpu_kernel_cfg.max_features = 0; 734 cr4_clear_bits(X86_CR4_OSXSAVE); 735 setup_clear_cpu_cap(X86_FEATURE_XSAVE); 736 737 /* Restore the legacy size.*/ 738 fpu_kernel_cfg.max_size = legacy_size; 739 fpu_kernel_cfg.default_size = legacy_size; 740 fpu_user_cfg.max_size = legacy_size; 741 fpu_user_cfg.default_size = legacy_size; 742 743 /* 744 * Prevent enabling the static branch which enables writes to the 745 * XFD MSR. 746 */ 747 init_fpstate.xfd = 0; 748 749 fpstate_reset(¤t->thread.fpu); 750 } 751 752 /* 753 * Enable and initialize the xsave feature. 754 * Called once per system bootup. 755 */ 756 void __init fpu__init_system_xstate(unsigned int legacy_size) 757 { 758 unsigned int eax, ebx, ecx, edx; 759 u64 xfeatures; 760 int err; 761 int i; 762 763 if (!boot_cpu_has(X86_FEATURE_FPU)) { 764 pr_info("x86/fpu: No FPU detected\n"); 765 return; 766 } 767 768 if (!boot_cpu_has(X86_FEATURE_XSAVE)) { 769 pr_info("x86/fpu: x87 FPU will use %s\n", 770 boot_cpu_has(X86_FEATURE_FXSR) ? 
"FXSAVE" : "FSAVE"); 771 return; 772 } 773 774 if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { 775 WARN_ON_FPU(1); 776 return; 777 } 778 779 /* 780 * Find user xstates supported by the processor. 781 */ 782 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); 783 fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); 784 785 /* 786 * Find supervisor xstates supported by the processor. 787 */ 788 cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); 789 fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); 790 791 if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { 792 /* 793 * This indicates that something really unexpected happened 794 * with the enumeration. Disable XSAVE and try to continue 795 * booting without it. This is too early to BUG(). 796 */ 797 pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", 798 fpu_kernel_cfg.max_features); 799 goto out_disable; 800 } 801 802 /* 803 * Clear XSAVE features that are disabled in the normal CPUID. 804 */ 805 for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { 806 unsigned short cid = xsave_cpuid_features[i]; 807 808 /* Careful: X86_FEATURE_FPU is 0! 
*/ 809 if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid)) 810 fpu_kernel_cfg.max_features &= ~BIT_ULL(i); 811 } 812 813 if (!cpu_feature_enabled(X86_FEATURE_XFD)) 814 fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; 815 816 fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | 817 XFEATURE_MASK_SUPERVISOR_SUPPORTED; 818 819 fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; 820 fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; 821 822 /* Clean out dynamic features from default */ 823 fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features; 824 fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; 825 826 fpu_user_cfg.default_features = fpu_user_cfg.max_features; 827 fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; 828 829 /* Store it for paranoia check at the end */ 830 xfeatures = fpu_kernel_cfg.max_features; 831 832 /* 833 * Initialize the default XFD state in initfp_state and enable the 834 * dynamic sizing mechanism if dynamic states are available. The 835 * static key cannot be enabled here because this runs before 836 * jump_label_init(). This is delayed to an initcall. 837 */ 838 init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; 839 840 /* Enable xstate instructions to be able to continue with initialization: */ 841 fpu__init_cpu_xstate(); 842 843 /* Cache size, offset and flags for initialization */ 844 setup_xstate_cache(); 845 846 err = init_xstate_size(); 847 if (err) 848 goto out_disable; 849 850 /* Reset the state for the current task */ 851 fpstate_reset(¤t->thread.fpu); 852 853 /* 854 * Update info used for ptrace frames; use standard-format size and no 855 * supervisor xstates: 856 */ 857 update_regset_xstate_info(fpu_user_cfg.max_size, 858 fpu_user_cfg.max_features); 859 860 setup_init_fpu_buf(); 861 862 /* 863 * Paranoia check whether something in the setup modified the 864 * xfeatures mask. 
865 */ 866 if (xfeatures != fpu_kernel_cfg.max_features) { 867 pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n", 868 xfeatures, fpu_kernel_cfg.max_features); 869 goto out_disable; 870 } 871 872 print_xstate_offset_size(); 873 pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", 874 fpu_kernel_cfg.max_features, 875 fpu_kernel_cfg.max_size, 876 boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); 877 return; 878 879 out_disable: 880 /* something went wrong, try to boot without any XSAVE support */ 881 fpu__init_disable_system_xstate(legacy_size); 882 } 883 884 /* 885 * Restore minimal FPU state after suspend: 886 */ 887 void fpu__resume_cpu(void) 888 { 889 /* 890 * Restore XCR0 on xsave capable CPUs: 891 */ 892 if (cpu_feature_enabled(X86_FEATURE_XSAVE)) 893 xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); 894 895 /* 896 * Restore IA32_XSS. The same CPUID bit enumerates support 897 * of XSAVES and MSR_IA32_XSS. 898 */ 899 if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { 900 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | 901 xfeatures_mask_independent()); 902 } 903 904 if (fpu_state_size_dynamic()) 905 wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd); 906 } 907 908 /* 909 * Given an xstate feature nr, calculate where in the xsave 910 * buffer the state is. Callers should ensure that the buffer 911 * is valid. 912 */ 913 static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) 914 { 915 u64 xcomp_bv = xsave->header.xcomp_bv; 916 917 if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) 918 return NULL; 919 920 if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { 921 if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr)))) 922 return NULL; 923 } 924 925 return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr); 926 } 927 928 /* 929 * Given the xsave area and a state inside, this function returns the 930 * address of the state. 
931 * 932 * This is the API that is called to get xstate address in either 933 * standard format or compacted format of xsave area. 934 * 935 * Note that if there is no data for the field in the xsave buffer 936 * this will return NULL. 937 * 938 * Inputs: 939 * xstate: the thread's storage area for all FPU data 940 * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, 941 * XFEATURE_SSE, etc...) 942 * Output: 943 * address of the state in the xsave area, or NULL if the 944 * field is not present in the xsave buffer. 945 */ 946 void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) 947 { 948 /* 949 * Do we even *have* xsave state? 950 */ 951 if (!boot_cpu_has(X86_FEATURE_XSAVE)) 952 return NULL; 953 954 /* 955 * We should not ever be requesting features that we 956 * have not enabled. 957 */ 958 if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) 959 return NULL; 960 961 /* 962 * This assumes the last 'xsave*' instruction to 963 * have requested that 'xfeature_nr' be saved. 964 * If it did not, we might be seeing and old value 965 * of the field in the buffer. 966 * 967 * This can happen because the last 'xsave' did not 968 * request that this feature be saved (unlikely) 969 * or because the "init optimization" caused it 970 * to not be saved. 971 */ 972 if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr))) 973 return NULL; 974 975 return __raw_xsave_addr(xsave, xfeature_nr); 976 } 977 978 #ifdef CONFIG_ARCH_HAS_PKEYS 979 980 /* 981 * This will go out and modify PKRU register to set the access 982 * rights for @pkey to @init_val. 983 */ 984 int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, 985 unsigned long init_val) 986 { 987 u32 old_pkru, new_pkru_bits = 0; 988 int pkey_shift; 989 990 /* 991 * This check implies XSAVE support. OSPKE only gets 992 * set if we enable XSAVE and we enable PKU in XCR0. 
993 */ 994 if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) 995 return -EINVAL; 996 997 /* 998 * This code should only be called with valid 'pkey' 999 * values originating from in-kernel users. Complain 1000 * if a bad value is observed. 1001 */ 1002 if (WARN_ON_ONCE(pkey >= arch_max_pkey())) 1003 return -EINVAL; 1004 1005 /* Set the bits we need in PKRU: */ 1006 if (init_val & PKEY_DISABLE_ACCESS) 1007 new_pkru_bits |= PKRU_AD_BIT; 1008 if (init_val & PKEY_DISABLE_WRITE) 1009 new_pkru_bits |= PKRU_WD_BIT; 1010 1011 /* Shift the bits in to the correct place in PKRU for pkey: */ 1012 pkey_shift = pkey * PKRU_BITS_PER_PKEY; 1013 new_pkru_bits <<= pkey_shift; 1014 1015 /* Get old PKRU and mask off any old bits in place: */ 1016 old_pkru = read_pkru(); 1017 old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); 1018 1019 /* Write old part along with new part: */ 1020 write_pkru(old_pkru | new_pkru_bits); 1021 1022 return 0; 1023 } 1024 #endif /* ! CONFIG_ARCH_HAS_PKEYS */ 1025 1026 static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, 1027 void *init_xstate, unsigned int size) 1028 { 1029 membuf_write(to, from_xstate ? xstate : init_xstate, size); 1030 } 1031 1032 /** 1033 * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer 1034 * @to: membuf descriptor 1035 * @fpstate: The fpstate buffer from which to copy 1036 * @pkru_val: The PKRU value to store in the PKRU component 1037 * @copy_mode: The requested copy mode 1038 * 1039 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming 1040 * format, i.e. from the kernel internal hardware dependent storage format 1041 * to the requested @mode. UABI XSTATE is always uncompacted! 1042 * 1043 * It supports partial copy but @to.pos always starts from zero. 
 */
void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
			       u32 pkru_val, enum xstate_copy_mode copy_mode)
{
	const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
	struct xregs_state *xinit = &init_fpstate.regs.xsave;
	struct xregs_state *xsave = &fpstate->regs.xsave;
	struct xstate_header header;
	unsigned int zerofrom;
	u64 mask;
	int i;

	/* Build a local header: everything zero except the feature bitmap */
	memset(&header, 0, sizeof(header));
	header.xfeatures = xsave->header.xfeatures;

	/* Mask out the feature bits depending on copy mode */
	switch (copy_mode) {
	case XSTATE_COPY_FP:
		header.xfeatures &= XFEATURE_MASK_FP;
		break;

	case XSTATE_COPY_FX:
		header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
		break;

	case XSTATE_COPY_XSAVE:
		header.xfeatures &= fpstate->user_xfeatures;
		break;
	}

	/*
	 * Each copy_feature() call below appends to @to; a component whose
	 * bit is clear in header.xfeatures is taken from init_fpstate.
	 */

	/* Copy FP state up to MXCSR */
	copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
		     &xinit->i387, off_mxcsr);

	/* Copy MXCSR when SSE or YMM are set in the feature mask */
	copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
		     &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
		     MXCSR_AND_FLAGS_SIZE);

	/* Copy the remaining FP state */
	copy_feature(header.xfeatures & XFEATURE_MASK_FP,
		     &to, &xsave->i387.st_space, &xinit->i387.st_space,
		     sizeof(xsave->i387.st_space));

	/* Copy the SSE state - shared with YMM, but independently managed */
	copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
		     &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
		     sizeof(xsave->i387.xmm_space));

	/* Only the XSAVE copy mode emits the header and extended area */
	if (copy_mode != XSTATE_COPY_XSAVE)
		goto out;

	/* Zero the padding area */
	membuf_zero(&to, sizeof(xsave->i387.padding));

	/* Copy xsave->i387.sw_reserved */
	membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));

	/* Copy the user space relevant state of @xsave->header */
	membuf_write(&to, &header, sizeof(header));

	zerofrom = offsetof(struct xregs_state, extended_state_area);

	/*
	 * The ptrace buffer is in non-compacted XSAVE format.  In
	 * non-compacted format disabled features still occupy state space,
	 * but there is no state to copy from in the compacted
	 * init_fpstate. The gap tracking will zero these states.
	 */
	mask = fpstate->user_xfeatures;

	for_each_extended_xfeature(i, mask) {
		/*
		 * If there was a feature or alignment gap, zero the space
		 * in the destination buffer.
		 */
		if (zerofrom < xstate_offsets[i])
			membuf_zero(&to, xstate_offsets[i] - zerofrom);

		if (i == XFEATURE_PKRU) {
			struct pkru_state pkru = {0};
			/*
			 * PKRU is not necessarily up to date in the
			 * XSAVE buffer. Use the provided value.
			 */
			pkru.pkru = pkru_val;
			membuf_write(&to, &pkru, sizeof(pkru));
		} else {
			copy_feature(header.xfeatures & BIT_ULL(i), &to,
				     __raw_xsave_addr(xsave, i),
				     __raw_xsave_addr(xinit, i),
				     xstate_sizes[i]);
		}
		/*
		 * Keep track of the last copied state in the non-compacted
		 * target buffer for gap zeroing.
		 */
		zerofrom = xstate_offsets[i] + xstate_sizes[i];
	}

out:
	/* Zero whatever is left of the destination buffer */
	if (to.left)
		membuf_zero(&to, to.left);
}

/**
 * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
 * @to:		membuf descriptor
 * @tsk:	The task from which to copy the saved xstate
 * @copy_mode:	The requested copy mode
 *
 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
 * format, i.e. from the kernel internal hardware dependent storage format
 * to the requested @copy_mode. UABI XSTATE is always uncompacted!
 *
 * It supports partial copy but @to.pos always starts from zero.
1160 */ 1161 void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, 1162 enum xstate_copy_mode copy_mode) 1163 { 1164 __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, 1165 tsk->thread.pkru, copy_mode); 1166 } 1167 1168 static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, 1169 const void *kbuf, const void __user *ubuf) 1170 { 1171 if (kbuf) { 1172 memcpy(dst, kbuf + offset, size); 1173 } else { 1174 if (copy_from_user(dst, ubuf + offset, size)) 1175 return -EFAULT; 1176 } 1177 return 0; 1178 } 1179 1180 1181 static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, 1182 const void __user *ubuf) 1183 { 1184 struct xregs_state *xsave = &fpstate->regs.xsave; 1185 unsigned int offset, size; 1186 struct xstate_header hdr; 1187 u64 mask; 1188 int i; 1189 1190 offset = offsetof(struct xregs_state, header); 1191 if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf)) 1192 return -EFAULT; 1193 1194 if (validate_user_xstate_header(&hdr, fpstate)) 1195 return -EINVAL; 1196 1197 /* Validate MXCSR when any of the related features is in use */ 1198 mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM; 1199 if (hdr.xfeatures & mask) { 1200 u32 mxcsr[2]; 1201 1202 offset = offsetof(struct fxregs_state, mxcsr); 1203 if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf)) 1204 return -EFAULT; 1205 1206 /* Reserved bits in MXCSR must be zero. */ 1207 if (mxcsr[0] & ~mxcsr_feature_mask) 1208 return -EINVAL; 1209 1210 /* SSE and YMM require MXCSR even when FP is not in use. 
*/ 1211 if (!(hdr.xfeatures & XFEATURE_MASK_FP)) { 1212 xsave->i387.mxcsr = mxcsr[0]; 1213 xsave->i387.mxcsr_mask = mxcsr[1]; 1214 } 1215 } 1216 1217 for (i = 0; i < XFEATURE_MAX; i++) { 1218 u64 mask = ((u64)1 << i); 1219 1220 if (hdr.xfeatures & mask) { 1221 void *dst = __raw_xsave_addr(xsave, i); 1222 1223 offset = xstate_offsets[i]; 1224 size = xstate_sizes[i]; 1225 1226 if (copy_from_buffer(dst, offset, size, kbuf, ubuf)) 1227 return -EFAULT; 1228 } 1229 } 1230 1231 /* 1232 * The state that came in from userspace was user-state only. 1233 * Mask all the user states out of 'xfeatures': 1234 */ 1235 xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; 1236 1237 /* 1238 * Add back in the features that came in from userspace: 1239 */ 1240 xsave->header.xfeatures |= hdr.xfeatures; 1241 1242 return 0; 1243 } 1244 1245 /* 1246 * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] 1247 * format and copy to the target thread. Used by ptrace and KVM. 1248 */ 1249 int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf) 1250 { 1251 return copy_uabi_to_xstate(fpstate, kbuf, NULL); 1252 } 1253 1254 /* 1255 * Convert from a sigreturn standard-format user-space buffer to kernel 1256 * XSAVE[S] format and copy to the target thread. This is called from the 1257 * sigreturn() and rt_sigreturn() system calls. 
1258 */ 1259 int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, 1260 const void __user *ubuf) 1261 { 1262 return copy_uabi_to_xstate(fpstate, NULL, ubuf); 1263 } 1264 1265 static bool validate_independent_components(u64 mask) 1266 { 1267 u64 xchk; 1268 1269 if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES))) 1270 return false; 1271 1272 xchk = ~xfeatures_mask_independent(); 1273 1274 if (WARN_ON_ONCE(!mask || mask & xchk)) 1275 return false; 1276 1277 return true; 1278 } 1279 1280 /** 1281 * xsaves - Save selected components to a kernel xstate buffer 1282 * @xstate: Pointer to the buffer 1283 * @mask: Feature mask to select the components to save 1284 * 1285 * The @xstate buffer must be 64 byte aligned and correctly initialized as 1286 * XSAVES does not write the full xstate header. Before first use the 1287 * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer 1288 * can #GP. 1289 * 1290 * The feature mask must be a subset of the independent features. 1291 */ 1292 void xsaves(struct xregs_state *xstate, u64 mask) 1293 { 1294 int err; 1295 1296 if (!validate_independent_components(mask)) 1297 return; 1298 1299 XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err); 1300 WARN_ON_ONCE(err); 1301 } 1302 1303 /** 1304 * xrstors - Restore selected components from a kernel xstate buffer 1305 * @xstate: Pointer to the buffer 1306 * @mask: Feature mask to select the components to restore 1307 * 1308 * The @xstate buffer must be 64 byte aligned and correctly initialized 1309 * otherwise XRSTORS from that buffer can #GP. 1310 * 1311 * Proper usage is to restore the state which was saved with 1312 * xsaves() into @xstate. 1313 * 1314 * The feature mask must be a subset of the independent features. 
 */
void xrstors(struct xregs_state *xstate, u64 mask)
{
	int err;

	/* Bail out (after warning) when @mask is not acceptable */
	if (!validate_independent_components(mask))
		return;

	XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
	WARN_ON_ONCE(err);
}

#if IS_ENABLED(CONFIG_KVM)
/*
 * Zero the storage of component @xfeature in @fps. Nothing is done when
 * get_xsave_addr() returns NULL for the component.
 */
void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
{
	void *addr = get_xsave_addr(&fps->regs.xsave, xfeature);

	if (addr)
		memset(addr, 0, xstate_sizes[xfeature]);
}
EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
#endif

#ifdef CONFIG_X86_64

#ifdef CONFIG_X86_DEBUG_FPU
/*
 * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
 * can safely operate on the @fpstate buffer.
 */
static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
{
	u64 xfd = __this_cpu_read(xfd_state);

	if (fpstate->xfd == xfd)
		return true;

	 /*
	  * The XFD MSR does not match fpstate->xfd. That's invalid when
	  * the passed in fpstate is current's fpstate.
	  */
	if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
		return false;

	/*
	 * XRSTOR(S) from init_fpstate are always correct as it will just
	 * bring all components into init state and not read from the
	 * buffer. XSAVE(S) raises #PF after init.
	 */
	if (fpstate == &init_fpstate)
		return rstor;

	/*
	 * XSAVE(S): clone(), fpu_swap_kvm_fpu()
	 * XRSTOR(S): fpu_swap_kvm_fpu()
	 */

	/*
	 * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
	 * the buffer area for XFD-disabled state components.
	 */
	mask &= ~xfd;

	/*
	 * Remove features which are valid in fpstate. They
	 * have space allocated in fpstate.
	 */
	mask &= ~fpstate->xfeatures;

	/*
	 * Any remaining state components in 'mask' might be written
	 * by XSAVE/XRSTOR. Fail validation if any are found.
	 */
	return !mask;
}

/* Warn once when xstate_op_valid() rejects the requested operation */
void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
{
	WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
}
#endif /* CONFIG_X86_DEBUG_FPU */

static int __init xfd_update_static_branch(void)
{
	/*
	 * If init_fpstate.xfd has bits set then dynamic features are
	 * available and the dynamic sizing must be enabled.
	 */
	if (init_fpstate.xfd)
		static_branch_enable(&__fpu_state_size_dynamic);
	return 0;
}
arch_initcall(xfd_update_static_branch)

/*
 * Release @fpu's fpstate with vfree() unless it is unset or it is the
 * fpstate embedded in @fpu itself (fpu->__fpstate).
 */
void fpstate_free(struct fpu *fpu)
{
	if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
		vfree(fpu->fpstate);
}

/**
 * fpstate_realloc - Reallocate struct fpstate for the requested new features
 *
 * @xfeatures:	A bitmap of xstate features which extend the enabled features
 *		of that task
 * @ksize:	The required size for the kernel buffer
 * @usize:	The required size for user space buffers
 * @guest_fpu:	Pointer to a guest FPU container. NULL for host allocations
 *
 * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
 * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
 * with large states are likely to live longer.
 *
 * Returns: 0 on success, -ENOMEM on allocation error.
1429 */ 1430 static int fpstate_realloc(u64 xfeatures, unsigned int ksize, 1431 unsigned int usize, struct fpu_guest *guest_fpu) 1432 { 1433 struct fpu *fpu = ¤t->thread.fpu; 1434 struct fpstate *curfps, *newfps = NULL; 1435 unsigned int fpsize; 1436 bool in_use; 1437 1438 fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64); 1439 1440 newfps = vzalloc(fpsize); 1441 if (!newfps) 1442 return -ENOMEM; 1443 newfps->size = ksize; 1444 newfps->user_size = usize; 1445 newfps->is_valloc = true; 1446 1447 /* 1448 * When a guest FPU is supplied, use @guest_fpu->fpstate 1449 * as reference independent whether it is in use or not. 1450 */ 1451 curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate; 1452 1453 /* Determine whether @curfps is the active fpstate */ 1454 in_use = fpu->fpstate == curfps; 1455 1456 if (guest_fpu) { 1457 newfps->is_guest = true; 1458 newfps->is_confidential = curfps->is_confidential; 1459 newfps->in_use = curfps->in_use; 1460 guest_fpu->xfeatures |= xfeatures; 1461 guest_fpu->uabi_size = usize; 1462 } 1463 1464 fpregs_lock(); 1465 /* 1466 * If @curfps is in use, ensure that the current state is in the 1467 * registers before swapping fpstate as that might invalidate it 1468 * due to layout changes. 
1469 */ 1470 if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD)) 1471 fpregs_restore_userregs(); 1472 1473 newfps->xfeatures = curfps->xfeatures | xfeatures; 1474 1475 if (!guest_fpu) 1476 newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; 1477 1478 newfps->xfd = curfps->xfd & ~xfeatures; 1479 1480 /* Do the final updates within the locked region */ 1481 xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures); 1482 1483 if (guest_fpu) { 1484 guest_fpu->fpstate = newfps; 1485 /* If curfps is active, update the FPU fpstate pointer */ 1486 if (in_use) 1487 fpu->fpstate = newfps; 1488 } else { 1489 fpu->fpstate = newfps; 1490 } 1491 1492 if (in_use) 1493 xfd_update_state(fpu->fpstate); 1494 fpregs_unlock(); 1495 1496 /* Only free valloc'ed state */ 1497 if (curfps && curfps->is_valloc) 1498 vfree(curfps); 1499 1500 return 0; 1501 } 1502 1503 static int validate_sigaltstack(unsigned int usize) 1504 { 1505 struct task_struct *thread, *leader = current->group_leader; 1506 unsigned long framesize = get_sigframe_size(); 1507 1508 lockdep_assert_held(¤t->sighand->siglock); 1509 1510 /* get_sigframe_size() is based on fpu_user_cfg.max_size */ 1511 framesize -= fpu_user_cfg.max_size; 1512 framesize += usize; 1513 for_each_thread(leader, thread) { 1514 if (thread->sas_ss_size && thread->sas_ss_size < framesize) 1515 return -ENOSPC; 1516 } 1517 return 0; 1518 } 1519 1520 static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) 1521 { 1522 /* 1523 * This deliberately does not exclude !XSAVES as we still might 1524 * decide to optionally context switch XCR0 or talk the silicon 1525 * vendors into extending XFD for the pre AMX states, especially 1526 * AVX512. 
1527 */ 1528 bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); 1529 struct fpu *fpu = ¤t->group_leader->thread.fpu; 1530 struct fpu_state_perm *perm; 1531 unsigned int ksize, usize; 1532 u64 mask; 1533 int ret = 0; 1534 1535 /* Check whether fully enabled */ 1536 if ((permitted & requested) == requested) 1537 return 0; 1538 1539 /* Calculate the resulting kernel state size */ 1540 mask = permitted | requested; 1541 /* Take supervisor states into account on the host */ 1542 if (!guest) 1543 mask |= xfeatures_mask_supervisor(); 1544 ksize = xstate_calculate_size(mask, compacted); 1545 1546 /* Calculate the resulting user state size */ 1547 mask &= XFEATURE_MASK_USER_SUPPORTED; 1548 usize = xstate_calculate_size(mask, false); 1549 1550 if (!guest) { 1551 ret = validate_sigaltstack(usize); 1552 if (ret) 1553 return ret; 1554 } 1555 1556 perm = guest ? &fpu->guest_perm : &fpu->perm; 1557 /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ 1558 WRITE_ONCE(perm->__state_perm, mask); 1559 /* Protected by sighand lock */ 1560 perm->__state_size = ksize; 1561 perm->__user_state_size = usize; 1562 return ret; 1563 } 1564 1565 /* 1566 * Permissions array to map facilities with more than one component 1567 */ 1568 static const u64 xstate_prctl_req[XFEATURE_MAX] = { 1569 [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA, 1570 }; 1571 1572 static int xstate_request_perm(unsigned long idx, bool guest) 1573 { 1574 u64 permitted, requested; 1575 int ret; 1576 1577 if (idx >= XFEATURE_MAX) 1578 return -EINVAL; 1579 1580 /* 1581 * Look up the facility mask which can require more than 1582 * one xstate component. 
1583 */ 1584 idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req)); 1585 requested = xstate_prctl_req[idx]; 1586 if (!requested) 1587 return -EOPNOTSUPP; 1588 1589 if ((fpu_user_cfg.max_features & requested) != requested) 1590 return -EOPNOTSUPP; 1591 1592 /* Lockless quick check */ 1593 permitted = xstate_get_group_perm(guest); 1594 if ((permitted & requested) == requested) 1595 return 0; 1596 1597 /* Protect against concurrent modifications */ 1598 spin_lock_irq(¤t->sighand->siglock); 1599 permitted = xstate_get_group_perm(guest); 1600 1601 /* First vCPU allocation locks the permissions. */ 1602 if (guest && (permitted & FPU_GUEST_PERM_LOCKED)) 1603 ret = -EBUSY; 1604 else 1605 ret = __xstate_request_perm(permitted, requested, guest); 1606 spin_unlock_irq(¤t->sighand->siglock); 1607 return ret; 1608 } 1609 1610 int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) 1611 { 1612 u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; 1613 struct fpu_state_perm *perm; 1614 unsigned int ksize, usize; 1615 struct fpu *fpu; 1616 1617 if (!xfd_event) { 1618 if (!guest_fpu) 1619 pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); 1620 return 0; 1621 } 1622 1623 /* Protect against concurrent modifications */ 1624 spin_lock_irq(¤t->sighand->siglock); 1625 1626 /* If not permitted let it die */ 1627 if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) { 1628 spin_unlock_irq(¤t->sighand->siglock); 1629 return -EPERM; 1630 } 1631 1632 fpu = ¤t->group_leader->thread.fpu; 1633 perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; 1634 ksize = perm->__state_size; 1635 usize = perm->__user_state_size; 1636 1637 /* 1638 * The feature is permitted. State size is sufficient. Dropping 1639 * the lock is safe here even if more features are added from 1640 * another task, the retrieved buffer sizes are valid for the 1641 * currently requested feature(s). 
1642 */ 1643 spin_unlock_irq(¤t->sighand->siglock); 1644 1645 /* 1646 * Try to allocate a new fpstate. If that fails there is no way 1647 * out. 1648 */ 1649 if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu)) 1650 return -EFAULT; 1651 return 0; 1652 } 1653 1654 int xfd_enable_feature(u64 xfd_err) 1655 { 1656 return __xfd_enable_feature(xfd_err, NULL); 1657 } 1658 1659 #else /* CONFIG_X86_64 */ 1660 static inline int xstate_request_perm(unsigned long idx, bool guest) 1661 { 1662 return -EPERM; 1663 } 1664 #endif /* !CONFIG_X86_64 */ 1665 1666 u64 xstate_get_guest_group_perm(void) 1667 { 1668 return xstate_get_group_perm(true); 1669 } 1670 EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); 1671 1672 /** 1673 * fpu_xstate_prctl - xstate permission operations 1674 * @tsk: Redundant pointer to current 1675 * @option: A subfunction of arch_prctl() 1676 * @arg2: option argument 1677 * Return: 0 if successful; otherwise, an error code 1678 * 1679 * Option arguments: 1680 * 1681 * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info 1682 * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info 1683 * ARCH_REQ_XCOMP_PERM: Facility number requested 1684 * 1685 * For facilities which require more than one XSTATE component, the request 1686 * must be the highest state component number related to that facility, 1687 * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and 1688 * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). 
1689 */ 1690 long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2) 1691 { 1692 u64 __user *uptr = (u64 __user *)arg2; 1693 u64 permitted, supported; 1694 unsigned long idx = arg2; 1695 bool guest = false; 1696 1697 if (tsk != current) 1698 return -EPERM; 1699 1700 switch (option) { 1701 case ARCH_GET_XCOMP_SUPP: 1702 supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; 1703 return put_user(supported, uptr); 1704 1705 case ARCH_GET_XCOMP_PERM: 1706 /* 1707 * Lockless snapshot as it can also change right after the 1708 * dropping the lock. 1709 */ 1710 permitted = xstate_get_host_group_perm(); 1711 permitted &= XFEATURE_MASK_USER_SUPPORTED; 1712 return put_user(permitted, uptr); 1713 1714 case ARCH_GET_XCOMP_GUEST_PERM: 1715 permitted = xstate_get_guest_group_perm(); 1716 permitted &= XFEATURE_MASK_USER_SUPPORTED; 1717 return put_user(permitted, uptr); 1718 1719 case ARCH_REQ_XCOMP_GUEST_PERM: 1720 guest = true; 1721 fallthrough; 1722 1723 case ARCH_REQ_XCOMP_PERM: 1724 if (!IS_ENABLED(CONFIG_X86_64)) 1725 return -EOPNOTSUPP; 1726 1727 return xstate_request_perm(idx, guest); 1728 1729 default: 1730 return -EINVAL; 1731 } 1732 } 1733 1734 #ifdef CONFIG_PROC_PID_ARCH_STATUS 1735 /* 1736 * Report the amount of time elapsed in millisecond since last AVX512 1737 * use in the task. 
1738 */ 1739 static void avx512_status(struct seq_file *m, struct task_struct *task) 1740 { 1741 unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp); 1742 long delta; 1743 1744 if (!timestamp) { 1745 /* 1746 * Report -1 if no AVX512 usage 1747 */ 1748 delta = -1; 1749 } else { 1750 delta = (long)(jiffies - timestamp); 1751 /* 1752 * Cap to LONG_MAX if time difference > LONG_MAX 1753 */ 1754 if (delta < 0) 1755 delta = LONG_MAX; 1756 delta = jiffies_to_msecs(delta); 1757 } 1758 1759 seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta); 1760 seq_putc(m, '\n'); 1761 } 1762 1763 /* 1764 * Report architecture specific information 1765 */ 1766 int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, 1767 struct pid *pid, struct task_struct *task) 1768 { 1769 /* 1770 * Report AVX512 state if the processor and build option supported. 1771 */ 1772 if (cpu_feature_enabled(X86_FEATURE_AVX512F)) 1773 avx512_status(m, task); 1774 1775 return 0; 1776 } 1777 #endif /* CONFIG_PROC_PID_ARCH_STATUS */ 1778