1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * xsave/xrstor support. 4 * 5 * Author: Suresh Siddha <suresh.b.siddha@intel.com> 6 */ 7 #include <linux/bitops.h> 8 #include <linux/compat.h> 9 #include <linux/cpu.h> 10 #include <linux/mman.h> 11 #include <linux/nospec.h> 12 #include <linux/pkeys.h> 13 #include <linux/seq_file.h> 14 #include <linux/proc_fs.h> 15 #include <linux/vmalloc.h> 16 17 #include <asm/fpu/api.h> 18 #include <asm/fpu/regset.h> 19 #include <asm/fpu/signal.h> 20 #include <asm/fpu/xcr.h> 21 22 #include <asm/tlbflush.h> 23 #include <asm/prctl.h> 24 #include <asm/elf.h> 25 26 #include "context.h" 27 #include "internal.h" 28 #include "legacy.h" 29 #include "xstate.h" 30 31 #define for_each_extended_xfeature(bit, mask) \ 32 (bit) = FIRST_EXTENDED_XFEATURE; \ 33 for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) 34 35 /* 36 * Although we spell it out in here, the Processor Trace 37 * xfeature is completely unused. We use other mechanisms 38 * to save/restore PT state in Linux. 39 */ 40 static const char *xfeature_names[] = 41 { 42 "x87 floating point registers" , 43 "SSE registers" , 44 "AVX registers" , 45 "MPX bounds registers" , 46 "MPX CSR" , 47 "AVX-512 opmask" , 48 "AVX-512 Hi256" , 49 "AVX-512 ZMM_Hi256" , 50 "Processor Trace (unused)" , 51 "Protection Keys User registers", 52 "PASID state", 53 "unknown xstate feature" , 54 "unknown xstate feature" , 55 "unknown xstate feature" , 56 "unknown xstate feature" , 57 "unknown xstate feature" , 58 "unknown xstate feature" , 59 "AMX Tile config" , 60 "AMX Tile data" , 61 "unknown xstate feature" , 62 }; 63 64 static unsigned short xsave_cpuid_features[] __initdata = { 65 [XFEATURE_FP] = X86_FEATURE_FPU, 66 [XFEATURE_SSE] = X86_FEATURE_XMM, 67 [XFEATURE_YMM] = X86_FEATURE_AVX, 68 [XFEATURE_BNDREGS] = X86_FEATURE_MPX, 69 [XFEATURE_BNDCSR] = X86_FEATURE_MPX, 70 [XFEATURE_OPMASK] = X86_FEATURE_AVX512F, 71 [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, 72 [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, 73 [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, 74 [XFEATURE_PKRU] = X86_FEATURE_PKU, 75 [XFEATURE_PASID] = X86_FEATURE_ENQCMD, 76 [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, 77 [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, 78 }; 79 80 static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = 81 { [ 0 ... XFEATURE_MAX - 1] = -1}; 82 static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = 83 { [ 0 ... XFEATURE_MAX - 1] = -1}; 84 static unsigned int xstate_comp_offsets[XFEATURE_MAX] __ro_after_init = 85 { [ 0 ... XFEATURE_MAX - 1] = -1}; 86 static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] __ro_after_init = 87 { [ 0 ... XFEATURE_MAX - 1] = -1}; 88 89 /* 90 * Return whether the system supports a given xfeature. 91 * 92 * Also return the name of the (most advanced) feature that the caller requested: 93 */ 94 int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) 95 { 96 u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features; 97 98 if (unlikely(feature_name)) { 99 long xfeature_idx, max_idx; 100 u64 xfeatures_print; 101 /* 102 * So we use FLS here to be able to print the most advanced 103 * feature that was requested but is missing. So if a driver 104 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the 105 * missing AVX feature - this is the most informative message 106 * to users: 107 */ 108 if (xfeatures_missing) 109 xfeatures_print = xfeatures_missing; 110 else 111 xfeatures_print = xfeatures_needed; 112 113 xfeature_idx = fls64(xfeatures_print)-1; 114 max_idx = ARRAY_SIZE(xfeature_names)-1; 115 xfeature_idx = min(xfeature_idx, max_idx); 116 117 *feature_name = xfeature_names[xfeature_idx]; 118 } 119 120 if (xfeatures_missing) 121 return 0; 122 123 return 1; 124 } 125 EXPORT_SYMBOL_GPL(cpu_has_xfeatures); 126 127 static bool xfeature_is_supervisor(int xfeature_nr) 128 { 129 /* 130 * Extended State Enumeration Sub-leaves (EAX = 0DH, ECX = n, n > 1) 131 * returns ECX[0] set to (1) for a supervisor state, and cleared (0) 132 * for a user state. 133 */ 134 u32 eax, ebx, ecx, edx; 135 136 cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); 137 return ecx & 1; 138 } 139 140 /* 141 * Enable the extended processor state save/restore feature. 142 * Called once per CPU onlining. 143 */ 144 void fpu__init_cpu_xstate(void) 145 { 146 if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features) 147 return; 148 149 cr4_set_bits(X86_CR4_OSXSAVE); 150 151 /* 152 * Must happen after CR4 setup and before xsetbv() to allow KVM 153 * lazy passthrough. Write independent of the dynamic state static 154 * key as that does not work on the boot CPU. This also ensures 155 * that any stale state is wiped out from XFD. 156 */ 157 if (cpu_feature_enabled(X86_FEATURE_XFD)) 158 wrmsrl(MSR_IA32_XFD, init_fpstate.xfd); 159 160 /* 161 * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features 162 * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user 163 * states can be set here. 164 */ 165 xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); 166 167 /* 168 * MSR_IA32_XSS sets supervisor states managed by XSAVES. 169 */ 170 if (boot_cpu_has(X86_FEATURE_XSAVES)) { 171 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | 172 xfeatures_mask_independent()); 173 } 174 } 175 176 static bool xfeature_enabled(enum xfeature xfeature) 177 { 178 return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); 179 } 180 181 /* 182 * Record the offsets and sizes of various xstates contained 183 * in the XSAVE state memory layout. 184 */ 185 static void __init setup_xstate_features(void) 186 { 187 u32 eax, ebx, ecx, edx, i; 188 /* start at the beginning of the "extended state" */ 189 unsigned int last_good_offset = offsetof(struct xregs_state, 190 extended_state_area); 191 /* 192 * The FP xstates and SSE xstates are legacy states. They are always 193 * in the fixed offsets in the xsave area in either compacted form 194 * or standard form. 195 */ 196 xstate_offsets[XFEATURE_FP] = 0; 197 xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, 198 xmm_space); 199 200 xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; 201 xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, 202 xmm_space); 203 204 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { 205 cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); 206 207 xstate_sizes[i] = eax; 208 209 /* 210 * If an xfeature is supervisor state, the offset in EBX is 211 * invalid, leave it to -1. 212 */ 213 if (xfeature_is_supervisor(i)) 214 continue; 215 216 xstate_offsets[i] = ebx; 217 218 /* 219 * In our xstate size checks, we assume that the highest-numbered 220 * xstate feature has the highest offset in the buffer. Ensure 221 * it does. 222 */ 223 WARN_ONCE(last_good_offset > xstate_offsets[i], 224 "x86/fpu: misordered xstate at %d\n", last_good_offset); 225 226 last_good_offset = xstate_offsets[i]; 227 } 228 } 229 230 static void __init print_xstate_feature(u64 xstate_mask) 231 { 232 const char *feature_name; 233 234 if (cpu_has_xfeatures(xstate_mask, &feature_name)) 235 pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name); 236 } 237 238 /* 239 * Print out all the supported xstate features: 240 */ 241 static void __init print_xstate_features(void) 242 { 243 print_xstate_feature(XFEATURE_MASK_FP); 244 print_xstate_feature(XFEATURE_MASK_SSE); 245 print_xstate_feature(XFEATURE_MASK_YMM); 246 print_xstate_feature(XFEATURE_MASK_BNDREGS); 247 print_xstate_feature(XFEATURE_MASK_BNDCSR); 248 print_xstate_feature(XFEATURE_MASK_OPMASK); 249 print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); 250 print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); 251 print_xstate_feature(XFEATURE_MASK_PKRU); 252 print_xstate_feature(XFEATURE_MASK_PASID); 253 print_xstate_feature(XFEATURE_MASK_XTILE_CFG); 254 print_xstate_feature(XFEATURE_MASK_XTILE_DATA); 255 } 256 257 /* 258 * This check is important because it is easy to get XSTATE_* 259 * confused with XSTATE_BIT_*. 260 */ 261 #define CHECK_XFEATURE(nr) do { \ 262 WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ 263 WARN_ON(nr >= XFEATURE_MAX); \ 264 } while (0) 265 266 /* 267 * We could cache this like xstate_size[], but we only use 268 * it here, so it would be a waste of space. 269 */ 270 static int xfeature_is_aligned(int xfeature_nr) 271 { 272 u32 eax, ebx, ecx, edx; 273 274 CHECK_XFEATURE(xfeature_nr); 275 276 if (!xfeature_enabled(xfeature_nr)) { 277 WARN_ONCE(1, "Checking alignment of disabled xfeature %d\n", 278 xfeature_nr); 279 return 0; 280 } 281 282 cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); 283 /* 284 * The value returned by ECX[1] indicates the alignment 285 * of state component 'i' when the compacted format 286 * of the extended region of an XSAVE area is used: 287 */ 288 return !!(ecx & 2); 289 } 290 291 /* 292 * This function sets up offsets and sizes of all extended states in 293 * xsave area. This supports both standard format and compacted format 294 * of the xsave area. 295 */ 296 static void __init setup_xstate_comp_offsets(void) 297 { 298 unsigned int next_offset; 299 int i; 300 301 /* 302 * The FP xstates and SSE xstates are legacy states. They are always 303 * in the fixed offsets in the xsave area in either compacted form 304 * or standard form. 305 */ 306 xstate_comp_offsets[XFEATURE_FP] = 0; 307 xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state, 308 xmm_space); 309 310 if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) { 311 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) 312 xstate_comp_offsets[i] = xstate_offsets[i]; 313 return; 314 } 315 316 next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; 317 318 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { 319 if (xfeature_is_aligned(i)) 320 next_offset = ALIGN(next_offset, 64); 321 322 xstate_comp_offsets[i] = next_offset; 323 next_offset += xstate_sizes[i]; 324 } 325 } 326 327 /* 328 * Setup offsets of a supervisor-state-only XSAVES buffer: 329 * 330 * The offsets stored in xstate_comp_offsets[] only work for one specific 331 * value of the Requested Feature BitMap (RFBM). In cases where a different 332 * RFBM value is used, a different set of offsets is required. This set of 333 * offsets is for when RFBM=xfeatures_mask_supervisor(). 334 */ 335 static void __init setup_supervisor_only_offsets(void) 336 { 337 unsigned int next_offset; 338 int i; 339 340 next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; 341 342 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { 343 if (!xfeature_is_supervisor(i)) 344 continue; 345 346 if (xfeature_is_aligned(i)) 347 next_offset = ALIGN(next_offset, 64); 348 349 xstate_supervisor_only_offsets[i] = next_offset; 350 next_offset += xstate_sizes[i]; 351 } 352 } 353 354 /* 355 * Print out xstate component offsets and sizes 356 */ 357 static void __init print_xstate_offset_size(void) 358 { 359 int i; 360 361 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { 362 pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", 363 i, xstate_comp_offsets[i], i, xstate_sizes[i]); 364 } 365 } 366 367 /* 368 * This function is called only during boot time when x86 caps are not set 369 * up and alternative can not be used yet. 370 */ 371 static __init void os_xrstor_booting(struct xregs_state *xstate) 372 { 373 u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE; 374 u32 lmask = mask; 375 u32 hmask = mask >> 32; 376 int err; 377 378 if (cpu_feature_enabled(X86_FEATURE_XSAVES)) 379 XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); 380 else 381 XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); 382 383 /* 384 * We should never fault when copying from a kernel buffer, and the FPU 385 * state we set at boot time should be valid. 386 */ 387 WARN_ON_FPU(err); 388 } 389 390 /* 391 * All supported features have either init state all zeros or are 392 * handled in setup_init_fpu() individually. This is an explicit 393 * feature list and does not use XFEATURE_MASK*SUPPORTED to catch 394 * newly added supported features at build time and make people 395 * actually look at the init state for the new feature. 396 */ 397 #define XFEATURES_INIT_FPSTATE_HANDLED \ 398 (XFEATURE_MASK_FP | \ 399 XFEATURE_MASK_SSE | \ 400 XFEATURE_MASK_YMM | \ 401 XFEATURE_MASK_OPMASK | \ 402 XFEATURE_MASK_ZMM_Hi256 | \ 403 XFEATURE_MASK_Hi16_ZMM | \ 404 XFEATURE_MASK_PKRU | \ 405 XFEATURE_MASK_BNDREGS | \ 406 XFEATURE_MASK_BNDCSR | \ 407 XFEATURE_MASK_PASID | \ 408 XFEATURE_MASK_XTILE) 409 410 /* 411 * setup the xstate image representing the init state 412 */ 413 static void __init setup_init_fpu_buf(void) 414 { 415 BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | 416 XFEATURE_MASK_SUPERVISOR_SUPPORTED) != 417 XFEATURES_INIT_FPSTATE_HANDLED); 418 419 if (!boot_cpu_has(X86_FEATURE_XSAVE)) 420 return; 421 422 setup_xstate_features(); 423 print_xstate_features(); 424 425 xstate_init_xcomp_bv(&init_fpstate.regs.xsave, fpu_kernel_cfg.max_features); 426 427 /* 428 * Init all the features state with header.xfeatures being 0x0 429 */ 430 os_xrstor_booting(&init_fpstate.regs.xsave); 431 432 /* 433 * All components are now in init state. Read the state back so 434 * that init_fpstate contains all non-zero init state. This only 435 * works with XSAVE, but not with XSAVEOPT and XSAVES because 436 * those use the init optimization which skips writing data for 437 * components in init state. 438 * 439 * XSAVE could be used, but that would require to reshuffle the 440 * data when XSAVES is available because XSAVES uses xstate 441 * compaction. But doing so is a pointless exercise because most 442 * components have an all zeros init state except for the legacy 443 * ones (FP and SSE). Those can be saved with FXSAVE into the 444 * legacy area. Adding new features requires to ensure that init 445 * state is all zeroes or if not to add the necessary handling 446 * here. 447 */ 448 fxsave(&init_fpstate.regs.fxsave); 449 } 450 451 static int xfeature_uncompacted_offset(int xfeature_nr) 452 { 453 u32 eax, ebx, ecx, edx; 454 455 /* 456 * Only XSAVES supports supervisor states and it uses compacted 457 * format. Checking a supervisor state's uncompacted offset is 458 * an error. 459 */ 460 if (XFEATURE_MASK_SUPERVISOR_ALL & BIT_ULL(xfeature_nr)) { 461 WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr); 462 return -1; 463 } 464 465 CHECK_XFEATURE(xfeature_nr); 466 cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); 467 return ebx; 468 } 469 470 int xfeature_size(int xfeature_nr) 471 { 472 u32 eax, ebx, ecx, edx; 473 474 CHECK_XFEATURE(xfeature_nr); 475 cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); 476 return eax; 477 } 478 479 /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ 480 static int validate_user_xstate_header(const struct xstate_header *hdr, 481 struct fpstate *fpstate) 482 { 483 /* No unknown or supervisor features may be set */ 484 if (hdr->xfeatures & ~fpstate->user_xfeatures) 485 return -EINVAL; 486 487 /* Userspace must use the uncompacted format */ 488 if (hdr->xcomp_bv) 489 return -EINVAL; 490 491 /* 492 * If 'reserved' is shrunken to add a new field, make sure to validate 493 * that new field here! 494 */ 495 BUILD_BUG_ON(sizeof(hdr->reserved) != 48); 496 497 /* No reserved bits may be set */ 498 if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved))) 499 return -EINVAL; 500 501 return 0; 502 } 503 504 static void __init __xstate_dump_leaves(void) 505 { 506 int i; 507 u32 eax, ebx, ecx, edx; 508 static int should_dump = 1; 509 510 if (!should_dump) 511 return; 512 should_dump = 0; 513 /* 514 * Dump out a few leaves past the ones that we support 515 * just in case there are some goodies up there 516 */ 517 for (i = 0; i < XFEATURE_MAX + 10; i++) { 518 cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); 519 pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", 520 XSTATE_CPUID, i, eax, ebx, ecx, edx); 521 } 522 } 523 524 #define XSTATE_WARN_ON(x) do { \ 525 if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) { \ 526 __xstate_dump_leaves(); \ 527 } \ 528 } while (0) 529 530 #define XCHECK_SZ(sz, nr, nr_macro, __struct) do { \ 531 if ((nr == nr_macro) && \ 532 WARN_ONCE(sz != sizeof(__struct), \ 533 "%s: struct is %zu bytes, cpu state %d bytes\n", \ 534 __stringify(nr_macro), sizeof(__struct), sz)) { \ 535 __xstate_dump_leaves(); \ 536 } \ 537 } while (0) 538 539 /** 540 * check_xtile_data_against_struct - Check tile data state size. 541 * 542 * Calculate the state size by multiplying the single tile size which is 543 * recorded in a C struct, and the number of tiles that the CPU informs. 544 * Compare the provided size with the calculation. 545 * 546 * @size: The tile data state size 547 * 548 * Returns: 0 on success, -EINVAL on mismatch. 549 */ 550 static int __init check_xtile_data_against_struct(int size) 551 { 552 u32 max_palid, palid, state_size; 553 u32 eax, ebx, ecx, edx; 554 u16 max_tile; 555 556 /* 557 * Check the maximum palette id: 558 * eax: the highest numbered palette subleaf. 559 */ 560 cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx); 561 562 /* 563 * Cross-check each tile size and find the maximum number of 564 * supported tiles. 565 */ 566 for (palid = 1, max_tile = 0; palid <= max_palid; palid++) { 567 u16 tile_size, max; 568 569 /* 570 * Check the tile size info: 571 * eax[31:16]: bytes per title 572 * ebx[31:16]: the max names (or max number of tiles) 573 */ 574 cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx); 575 tile_size = eax >> 16; 576 max = ebx >> 16; 577 578 if (tile_size != sizeof(struct xtile_data)) { 579 pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n", 580 __stringify(XFEATURE_XTILE_DATA), 581 sizeof(struct xtile_data), tile_size); 582 __xstate_dump_leaves(); 583 return -EINVAL; 584 } 585 586 if (max > max_tile) 587 max_tile = max; 588 } 589 590 state_size = sizeof(struct xtile_data) * max_tile; 591 if (size != state_size) { 592 pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n", 593 __stringify(XFEATURE_XTILE_DATA), state_size, size); 594 __xstate_dump_leaves(); 595 return -EINVAL; 596 } 597 return 0; 598 } 599 600 /* 601 * We have a C struct for each 'xstate'. We need to ensure 602 * that our software representation matches what the CPU 603 * tells us about the state's size. 604 */ 605 static bool __init check_xstate_against_struct(int nr) 606 { 607 /* 608 * Ask the CPU for the size of the state. 609 */ 610 int sz = xfeature_size(nr); 611 /* 612 * Match each CPU state with the corresponding software 613 * structure. 614 */ 615 XCHECK_SZ(sz, nr, XFEATURE_YMM, struct ymmh_struct); 616 XCHECK_SZ(sz, nr, XFEATURE_BNDREGS, struct mpx_bndreg_state); 617 XCHECK_SZ(sz, nr, XFEATURE_BNDCSR, struct mpx_bndcsr_state); 618 XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state); 619 XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state); 620 XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); 621 XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state); 622 XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state); 623 XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg); 624 625 /* The tile data size varies between implementations. */ 626 if (nr == XFEATURE_XTILE_DATA) 627 check_xtile_data_against_struct(sz); 628 629 /* 630 * Make *SURE* to add any feature numbers in below if 631 * there are "holes" in the xsave state component 632 * numbers. 633 */ 634 if ((nr < XFEATURE_YMM) || 635 (nr >= XFEATURE_MAX) || 636 (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || 637 ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) { 638 WARN_ONCE(1, "no structure for xstate: %d\n", nr); 639 XSTATE_WARN_ON(1); 640 return false; 641 } 642 return true; 643 } 644 645 static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) 646 { 647 unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; 648 int i; 649 650 for_each_extended_xfeature(i, xfeatures) { 651 /* Align from the end of the previous feature */ 652 if (xfeature_is_aligned(i)) 653 size = ALIGN(size, 64); 654 /* 655 * In compacted format the enabled features are packed, 656 * i.e. disabled features do not occupy space. 657 * 658 * In non-compacted format the offsets are fixed and 659 * disabled states still occupy space in the memory buffer. 660 */ 661 if (!compacted) 662 size = xfeature_uncompacted_offset(i); 663 /* 664 * Add the feature size even for non-compacted format 665 * to make the end result correct 666 */ 667 size += xfeature_size(i); 668 } 669 return size; 670 } 671 672 /* 673 * This essentially double-checks what the cpu told us about 674 * how large the XSAVE buffer needs to be. We are recalculating 675 * it to be safe. 676 * 677 * Independent XSAVE features allocate their own buffers and are not 678 * covered by these checks. Only the size of the buffer for task->fpu 679 * is checked here. 680 */ 681 static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) 682 { 683 bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); 684 unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; 685 int i; 686 687 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { 688 if (!check_xstate_against_struct(i)) 689 return false; 690 /* 691 * Supervisor state components can be managed only by 692 * XSAVES. 693 */ 694 if (!compacted && xfeature_is_supervisor(i)) { 695 XSTATE_WARN_ON(1); 696 return false; 697 } 698 } 699 size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); 700 XSTATE_WARN_ON(size != kernel_size); 701 return size == kernel_size; 702 } 703 704 /* 705 * Get total size of enabled xstates in XCR0 | IA32_XSS. 706 * 707 * Note the SDM's wording here. "sub-function 0" only enumerates 708 * the size of the *user* states. If we use it to size a buffer 709 * that we use 'XSAVES' on, we could potentially overflow the 710 * buffer because 'XSAVES' saves system states too. 711 */ 712 static unsigned int __init get_xsaves_size(void) 713 { 714 unsigned int eax, ebx, ecx, edx; 715 /* 716 * - CPUID function 0DH, sub-function 1: 717 * EBX enumerates the size (in bytes) required by 718 * the XSAVES instruction for an XSAVE area 719 * containing all the state components 720 * corresponding to bits currently set in 721 * XCR0 | IA32_XSS. 722 */ 723 cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); 724 return ebx; 725 } 726 727 /* 728 * Get the total size of the enabled xstates without the independent supervisor 729 * features. 730 */ 731 static unsigned int __init get_xsaves_size_no_independent(void) 732 { 733 u64 mask = xfeatures_mask_independent(); 734 unsigned int size; 735 736 if (!mask) 737 return get_xsaves_size(); 738 739 /* Disable independent features. */ 740 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); 741 742 /* 743 * Ask the hardware what size is required of the buffer. 744 * This is the size required for the task->fpu buffer. 745 */ 746 size = get_xsaves_size(); 747 748 /* Re-enable independent features so XSAVES will work on them again. */ 749 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); 750 751 return size; 752 } 753 754 static unsigned int __init get_xsave_size_user(void) 755 { 756 unsigned int eax, ebx, ecx, edx; 757 /* 758 * - CPUID function 0DH, sub-function 0: 759 * EBX enumerates the size (in bytes) required by 760 * the XSAVE instruction for an XSAVE area 761 * containing all the *user* state components 762 * corresponding to bits currently set in XCR0. 763 */ 764 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); 765 return ebx; 766 } 767 768 /* 769 * Will the runtime-enumerated 'xstate_size' fit in the init 770 * task's statically-allocated buffer? 771 */ 772 static bool __init is_supported_xstate_size(unsigned int test_xstate_size) 773 { 774 if (test_xstate_size <= sizeof(init_fpstate.regs)) 775 return true; 776 777 pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n", 778 sizeof(init_fpstate.regs), test_xstate_size); 779 return false; 780 } 781 782 static int __init init_xstate_size(void) 783 { 784 /* Recompute the context size for enabled features: */ 785 unsigned int user_size, kernel_size, kernel_default_size; 786 bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); 787 788 /* Uncompacted user space size */ 789 user_size = get_xsave_size_user(); 790 791 /* 792 * XSAVES kernel size includes supervisor states and 793 * uses compacted format when available. 794 * 795 * XSAVE does not support supervisor states so 796 * kernel and user size is identical. 797 */ 798 if (compacted) 799 kernel_size = get_xsaves_size_no_independent(); 800 else 801 kernel_size = user_size; 802 803 kernel_default_size = 804 xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); 805 806 /* Ensure we have the space to store all default enabled features. */ 807 if (!is_supported_xstate_size(kernel_default_size)) 808 return -EINVAL; 809 810 if (!paranoid_xstate_size_valid(kernel_size)) 811 return -EINVAL; 812 813 fpu_kernel_cfg.max_size = kernel_size; 814 fpu_user_cfg.max_size = user_size; 815 816 fpu_kernel_cfg.default_size = kernel_default_size; 817 fpu_user_cfg.default_size = 818 xstate_calculate_size(fpu_user_cfg.default_features, false); 819 820 return 0; 821 } 822 823 /* 824 * We enabled the XSAVE hardware, but something went wrong and 825 * we can not use it. Disable it. 826 */ 827 static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) 828 { 829 fpu_kernel_cfg.max_features = 0; 830 cr4_clear_bits(X86_CR4_OSXSAVE); 831 setup_clear_cpu_cap(X86_FEATURE_XSAVE); 832 833 /* Restore the legacy size.*/ 834 fpu_kernel_cfg.max_size = legacy_size; 835 fpu_kernel_cfg.default_size = legacy_size; 836 fpu_user_cfg.max_size = legacy_size; 837 fpu_user_cfg.default_size = legacy_size; 838 839 /* 840 * Prevent enabling the static branch which enables writes to the 841 * XFD MSR. 842 */ 843 init_fpstate.xfd = 0; 844 845 fpstate_reset(¤t->thread.fpu); 846 } 847 848 /* 849 * Enable and initialize the xsave feature. 850 * Called once per system bootup. 851 */ 852 void __init fpu__init_system_xstate(unsigned int legacy_size) 853 { 854 unsigned int eax, ebx, ecx, edx; 855 u64 xfeatures; 856 int err; 857 int i; 858 859 if (!boot_cpu_has(X86_FEATURE_FPU)) { 860 pr_info("x86/fpu: No FPU detected\n"); 861 return; 862 } 863 864 if (!boot_cpu_has(X86_FEATURE_XSAVE)) { 865 pr_info("x86/fpu: x87 FPU will use %s\n", 866 boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE"); 867 return; 868 } 869 870 if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { 871 WARN_ON_FPU(1); 872 return; 873 } 874 875 /* 876 * Find user xstates supported by the processor. 877 */ 878 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); 879 fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); 880 881 /* 882 * Find supervisor xstates supported by the processor. 883 */ 884 cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); 885 fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); 886 887 if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { 888 /* 889 * This indicates that something really unexpected happened 890 * with the enumeration. Disable XSAVE and try to continue 891 * booting without it. This is too early to BUG(). 892 */ 893 pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", 894 fpu_kernel_cfg.max_features); 895 goto out_disable; 896 } 897 898 /* 899 * Clear XSAVE features that are disabled in the normal CPUID. 900 */ 901 for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { 902 unsigned short cid = xsave_cpuid_features[i]; 903 904 /* Careful: X86_FEATURE_FPU is 0! */ 905 if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid)) 906 fpu_kernel_cfg.max_features &= ~BIT_ULL(i); 907 } 908 909 if (!cpu_feature_enabled(X86_FEATURE_XFD)) 910 fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; 911 912 fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | 913 XFEATURE_MASK_SUPERVISOR_SUPPORTED; 914 915 fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; 916 fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; 917 918 /* Clean out dynamic features from default */ 919 fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features; 920 fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; 921 922 fpu_user_cfg.default_features = fpu_user_cfg.max_features; 923 fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; 924 925 /* Store it for paranoia check at the end */ 926 xfeatures = fpu_kernel_cfg.max_features; 927 928 /* 929 * Initialize the default XFD state in initfp_state and enable the 930 * dynamic sizing mechanism if dynamic states are available. The 931 * static key cannot be enabled here because this runs before 932 * jump_label_init(). This is delayed to an initcall. 933 */ 934 init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; 935 936 /* Enable xstate instructions to be able to continue with initialization: */ 937 fpu__init_cpu_xstate(); 938 err = init_xstate_size(); 939 if (err) 940 goto out_disable; 941 942 /* Reset the state for the current task */ 943 fpstate_reset(¤t->thread.fpu); 944 945 /* 946 * Update info used for ptrace frames; use standard-format size and no 947 * supervisor xstates: 948 */ 949 update_regset_xstate_info(fpu_user_cfg.max_size, 950 fpu_user_cfg.max_features); 951 952 setup_init_fpu_buf(); 953 setup_xstate_comp_offsets(); 954 setup_supervisor_only_offsets(); 955 956 /* 957 * Paranoia check whether something in the setup modified the 958 * xfeatures mask. 959 */ 960 if (xfeatures != fpu_kernel_cfg.max_features) { 961 pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n", 962 xfeatures, fpu_kernel_cfg.max_features); 963 goto out_disable; 964 } 965 966 print_xstate_offset_size(); 967 pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", 968 fpu_kernel_cfg.max_features, 969 fpu_kernel_cfg.max_size, 970 boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); 971 return; 972 973 out_disable: 974 /* something went wrong, try to boot without any XSAVE support */ 975 fpu__init_disable_system_xstate(legacy_size); 976 } 977 978 /* 979 * Restore minimal FPU state after suspend: 980 */ 981 void fpu__resume_cpu(void) 982 { 983 /* 984 * Restore XCR0 on xsave capable CPUs: 985 */ 986 if (cpu_feature_enabled(X86_FEATURE_XSAVE)) 987 xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); 988 989 /* 990 * Restore IA32_XSS. The same CPUID bit enumerates support 991 * of XSAVES and MSR_IA32_XSS. 992 */ 993 if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { 994 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | 995 xfeatures_mask_independent()); 996 } 997 998 if (fpu_state_size_dynamic()) 999 wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd); 1000 } 1001 1002 /* 1003 * Given an xstate feature nr, calculate where in the xsave 1004 * buffer the state is. Callers should ensure that the buffer 1005 * is valid. 1006 */ 1007 static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) 1008 { 1009 if (!xfeature_enabled(xfeature_nr)) { 1010 WARN_ON_FPU(1); 1011 return NULL; 1012 } 1013 1014 return (void *)xsave + xstate_comp_offsets[xfeature_nr]; 1015 } 1016 /* 1017 * Given the xsave area and a state inside, this function returns the 1018 * address of the state. 1019 * 1020 * This is the API that is called to get xstate address in either 1021 * standard format or compacted format of xsave area. 1022 * 1023 * Note that if there is no data for the field in the xsave buffer 1024 * this will return NULL. 1025 * 1026 * Inputs: 1027 * xstate: the thread's storage area for all FPU data 1028 * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, 1029 * XFEATURE_SSE, etc...) 1030 * Output: 1031 * address of the state in the xsave area, or NULL if the 1032 * field is not present in the xsave buffer. 1033 */ 1034 void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) 1035 { 1036 /* 1037 * Do we even *have* xsave state? 1038 */ 1039 if (!boot_cpu_has(X86_FEATURE_XSAVE)) 1040 return NULL; 1041 1042 /* 1043 * We should not ever be requesting features that we 1044 * have not enabled. 1045 */ 1046 WARN_ONCE(!(fpu_kernel_cfg.max_features & BIT_ULL(xfeature_nr)), 1047 "get of unsupported state"); 1048 /* 1049 * This assumes the last 'xsave*' instruction to 1050 * have requested that 'xfeature_nr' be saved. 1051 * If it did not, we might be seeing and old value 1052 * of the field in the buffer. 1053 * 1054 * This can happen because the last 'xsave' did not 1055 * request that this feature be saved (unlikely) 1056 * or because the "init optimization" caused it 1057 * to not be saved. 1058 */ 1059 if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr))) 1060 return NULL; 1061 1062 return __raw_xsave_addr(xsave, xfeature_nr); 1063 } 1064 1065 #ifdef CONFIG_ARCH_HAS_PKEYS 1066 1067 /* 1068 * This will go out and modify PKRU register to set the access 1069 * rights for @pkey to @init_val. 1070 */ 1071 int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, 1072 unsigned long init_val) 1073 { 1074 u32 old_pkru, new_pkru_bits = 0; 1075 int pkey_shift; 1076 1077 /* 1078 * This check implies XSAVE support. OSPKE only gets 1079 * set if we enable XSAVE and we enable PKU in XCR0. 1080 */ 1081 if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) 1082 return -EINVAL; 1083 1084 /* 1085 * This code should only be called with valid 'pkey' 1086 * values originating from in-kernel users. Complain 1087 * if a bad value is observed. 1088 */ 1089 if (WARN_ON_ONCE(pkey >= arch_max_pkey())) 1090 return -EINVAL; 1091 1092 /* Set the bits we need in PKRU: */ 1093 if (init_val & PKEY_DISABLE_ACCESS) 1094 new_pkru_bits |= PKRU_AD_BIT; 1095 if (init_val & PKEY_DISABLE_WRITE) 1096 new_pkru_bits |= PKRU_WD_BIT; 1097 1098 /* Shift the bits in to the correct place in PKRU for pkey: */ 1099 pkey_shift = pkey * PKRU_BITS_PER_PKEY; 1100 new_pkru_bits <<= pkey_shift; 1101 1102 /* Get old PKRU and mask off any old bits in place: */ 1103 old_pkru = read_pkru(); 1104 old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); 1105 1106 /* Write old part along with new part: */ 1107 write_pkru(old_pkru | new_pkru_bits); 1108 1109 return 0; 1110 } 1111 #endif /* ! CONFIG_ARCH_HAS_PKEYS */ 1112 1113 static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, 1114 void *init_xstate, unsigned int size) 1115 { 1116 membuf_write(to, from_xstate ? xstate : init_xstate, size); 1117 } 1118 1119 /** 1120 * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer 1121 * @to: membuf descriptor 1122 * @fpstate: The fpstate buffer from which to copy 1123 * @pkru_val: The PKRU value to store in the PKRU component 1124 * @copy_mode: The requested copy mode 1125 * 1126 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming 1127 * format, i.e. from the kernel internal hardware dependent storage format 1128 * to the requested @mode. UABI XSTATE is always uncompacted! 1129 * 1130 * It supports partial copy but @to.pos always starts from zero. 1131 */ 1132 void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, 1133 u32 pkru_val, enum xstate_copy_mode copy_mode) 1134 { 1135 const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); 1136 struct xregs_state *xinit = &init_fpstate.regs.xsave; 1137 struct xregs_state *xsave = &fpstate->regs.xsave; 1138 struct xstate_header header; 1139 unsigned int zerofrom; 1140 u64 mask; 1141 int i; 1142 1143 memset(&header, 0, sizeof(header)); 1144 header.xfeatures = xsave->header.xfeatures; 1145 1146 /* Mask out the feature bits depending on copy mode */ 1147 switch (copy_mode) { 1148 case XSTATE_COPY_FP: 1149 header.xfeatures &= XFEATURE_MASK_FP; 1150 break; 1151 1152 case XSTATE_COPY_FX: 1153 header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE; 1154 break; 1155 1156 case XSTATE_COPY_XSAVE: 1157 header.xfeatures &= fpstate->user_xfeatures; 1158 break; 1159 } 1160 1161 /* Copy FP state up to MXCSR */ 1162 copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387, 1163 &xinit->i387, off_mxcsr); 1164 1165 /* Copy MXCSR when SSE or YMM are set in the feature mask */ 1166 copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM), 1167 &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr, 1168 MXCSR_AND_FLAGS_SIZE); 1169 1170 /* Copy the remaining FP state */ 1171 copy_feature(header.xfeatures & XFEATURE_MASK_FP, 1172 &to, &xsave->i387.st_space, &xinit->i387.st_space, 1173 sizeof(xsave->i387.st_space)); 1174 1175 /* Copy the SSE state - shared with YMM, but independently managed */ 1176 copy_feature(header.xfeatures & XFEATURE_MASK_SSE, 1177 &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space, 1178 sizeof(xsave->i387.xmm_space)); 1179 1180 if (copy_mode != XSTATE_COPY_XSAVE) 1181 goto out; 1182 1183 /* Zero the padding area */ 1184 membuf_zero(&to, sizeof(xsave->i387.padding)); 1185 1186 /* Copy xsave->i387.sw_reserved */ 1187 membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved)); 1188 1189 /* Copy the user space relevant state of @xsave->header */ 1190 membuf_write(&to, &header, sizeof(header)); 1191 1192 zerofrom = offsetof(struct xregs_state, extended_state_area); 1193 1194 /* 1195 * The ptrace buffer is in non-compacted XSAVE format. In 1196 * non-compacted format disabled features still occupy state space, 1197 * but there is no state to copy from in the compacted 1198 * init_fpstate. The gap tracking will zero these states. 1199 */ 1200 mask = fpstate->user_xfeatures; 1201 1202 for_each_extended_xfeature(i, mask) { 1203 /* 1204 * If there was a feature or alignment gap, zero the space 1205 * in the destination buffer. 1206 */ 1207 if (zerofrom < xstate_offsets[i]) 1208 membuf_zero(&to, xstate_offsets[i] - zerofrom); 1209 1210 if (i == XFEATURE_PKRU) { 1211 struct pkru_state pkru = {0}; 1212 /* 1213 * PKRU is not necessarily up to date in the 1214 * XSAVE buffer. Use the provided value. 1215 */ 1216 pkru.pkru = pkru_val; 1217 membuf_write(&to, &pkru, sizeof(pkru)); 1218 } else { 1219 copy_feature(header.xfeatures & BIT_ULL(i), &to, 1220 __raw_xsave_addr(xsave, i), 1221 __raw_xsave_addr(xinit, i), 1222 xstate_sizes[i]); 1223 } 1224 /* 1225 * Keep track of the last copied state in the non-compacted 1226 * target buffer for gap zeroing. 1227 */ 1228 zerofrom = xstate_offsets[i] + xstate_sizes[i]; 1229 } 1230 1231 out: 1232 if (to.left) 1233 membuf_zero(&to, to.left); 1234 } 1235 1236 /** 1237 * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer 1238 * @to: membuf descriptor 1239 * @tsk: The task from which to copy the saved xstate 1240 * @copy_mode: The requested copy mode 1241 * 1242 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming 1243 * format, i.e. from the kernel internal hardware dependent storage format 1244 * to the requested @mode. UABI XSTATE is always uncompacted! 1245 * 1246 * It supports partial copy but @to.pos always starts from zero. 1247 */ 1248 void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, 1249 enum xstate_copy_mode copy_mode) 1250 { 1251 __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, 1252 tsk->thread.pkru, copy_mode); 1253 } 1254 1255 static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, 1256 const void *kbuf, const void __user *ubuf) 1257 { 1258 if (kbuf) { 1259 memcpy(dst, kbuf + offset, size); 1260 } else { 1261 if (copy_from_user(dst, ubuf + offset, size)) 1262 return -EFAULT; 1263 } 1264 return 0; 1265 } 1266 1267 1268 static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, 1269 const void __user *ubuf) 1270 { 1271 struct xregs_state *xsave = &fpstate->regs.xsave; 1272 unsigned int offset, size; 1273 struct xstate_header hdr; 1274 u64 mask; 1275 int i; 1276 1277 offset = offsetof(struct xregs_state, header); 1278 if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf)) 1279 return -EFAULT; 1280 1281 if (validate_user_xstate_header(&hdr, fpstate)) 1282 return -EINVAL; 1283 1284 /* Validate MXCSR when any of the related features is in use */ 1285 mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM; 1286 if (hdr.xfeatures & mask) { 1287 u32 mxcsr[2]; 1288 1289 offset = offsetof(struct fxregs_state, mxcsr); 1290 if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf)) 1291 return -EFAULT; 1292 1293 /* Reserved bits in MXCSR must be zero. */ 1294 if (mxcsr[0] & ~mxcsr_feature_mask) 1295 return -EINVAL; 1296 1297 /* SSE and YMM require MXCSR even when FP is not in use. */ 1298 if (!(hdr.xfeatures & XFEATURE_MASK_FP)) { 1299 xsave->i387.mxcsr = mxcsr[0]; 1300 xsave->i387.mxcsr_mask = mxcsr[1]; 1301 } 1302 } 1303 1304 for (i = 0; i < XFEATURE_MAX; i++) { 1305 u64 mask = ((u64)1 << i); 1306 1307 if (hdr.xfeatures & mask) { 1308 void *dst = __raw_xsave_addr(xsave, i); 1309 1310 offset = xstate_offsets[i]; 1311 size = xstate_sizes[i]; 1312 1313 if (copy_from_buffer(dst, offset, size, kbuf, ubuf)) 1314 return -EFAULT; 1315 } 1316 } 1317 1318 /* 1319 * The state that came in from userspace was user-state only. 1320 * Mask all the user states out of 'xfeatures': 1321 */ 1322 xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; 1323 1324 /* 1325 * Add back in the features that came in from userspace: 1326 */ 1327 xsave->header.xfeatures |= hdr.xfeatures; 1328 1329 return 0; 1330 } 1331 1332 /* 1333 * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] 1334 * format and copy to the target thread. Used by ptrace and KVM. 1335 */ 1336 int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf) 1337 { 1338 return copy_uabi_to_xstate(fpstate, kbuf, NULL); 1339 } 1340 1341 /* 1342 * Convert from a sigreturn standard-format user-space buffer to kernel 1343 * XSAVE[S] format and copy to the target thread. This is called from the 1344 * sigreturn() and rt_sigreturn() system calls. 1345 */ 1346 int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, 1347 const void __user *ubuf) 1348 { 1349 return copy_uabi_to_xstate(fpstate, NULL, ubuf); 1350 } 1351 1352 static bool validate_independent_components(u64 mask) 1353 { 1354 u64 xchk; 1355 1356 if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES))) 1357 return false; 1358 1359 xchk = ~xfeatures_mask_independent(); 1360 1361 if (WARN_ON_ONCE(!mask || mask & xchk)) 1362 return false; 1363 1364 return true; 1365 } 1366 1367 /** 1368 * xsaves - Save selected components to a kernel xstate buffer 1369 * @xstate: Pointer to the buffer 1370 * @mask: Feature mask to select the components to save 1371 * 1372 * The @xstate buffer must be 64 byte aligned and correctly initialized as 1373 * XSAVES does not write the full xstate header. Before first use the 1374 * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer 1375 * can #GP. 1376 * 1377 * The feature mask must be a subset of the independent features. 1378 */ 1379 void xsaves(struct xregs_state *xstate, u64 mask) 1380 { 1381 int err; 1382 1383 if (!validate_independent_components(mask)) 1384 return; 1385 1386 XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err); 1387 WARN_ON_ONCE(err); 1388 } 1389 1390 /** 1391 * xrstors - Restore selected components from a kernel xstate buffer 1392 * @xstate: Pointer to the buffer 1393 * @mask: Feature mask to select the components to restore 1394 * 1395 * The @xstate buffer must be 64 byte aligned and correctly initialized 1396 * otherwise XRSTORS from that buffer can #GP. 1397 * 1398 * Proper usage is to restore the state which was saved with 1399 * xsaves() into @xstate. 1400 * 1401 * The feature mask must be a subset of the independent features. 1402 */ 1403 void xrstors(struct xregs_state *xstate, u64 mask) 1404 { 1405 int err; 1406 1407 if (!validate_independent_components(mask)) 1408 return; 1409 1410 XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); 1411 WARN_ON_ONCE(err); 1412 } 1413 1414 #if IS_ENABLED(CONFIG_KVM) 1415 void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) 1416 { 1417 void *addr = get_xsave_addr(&fps->regs.xsave, xfeature); 1418 1419 if (addr) 1420 memset(addr, 0, xstate_sizes[xfeature]); 1421 } 1422 EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); 1423 #endif 1424 1425 #ifdef CONFIG_X86_64 1426 1427 #ifdef CONFIG_X86_DEBUG_FPU 1428 /* 1429 * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask 1430 * can safely operate on the @fpstate buffer. 1431 */ 1432 static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) 1433 { 1434 u64 xfd = __this_cpu_read(xfd_state); 1435 1436 if (fpstate->xfd == xfd) 1437 return true; 1438 1439 /* 1440 * The XFD MSR does not match fpstate->xfd. That's invalid when 1441 * the passed in fpstate is current's fpstate. 1442 */ 1443 if (fpstate->xfd == current->thread.fpu.fpstate->xfd) 1444 return false; 1445 1446 /* 1447 * XRSTOR(S) from init_fpstate are always correct as it will just 1448 * bring all components into init state and not read from the 1449 * buffer. XSAVE(S) raises #PF after init. 1450 */ 1451 if (fpstate == &init_fpstate) 1452 return rstor; 1453 1454 /* 1455 * XSAVE(S): clone(), fpu_swap_kvm_fpu() 1456 * XRSTORS(S): fpu_swap_kvm_fpu() 1457 */ 1458 1459 /* 1460 * No XSAVE/XRSTOR instructions (except XSAVE itself) touch 1461 * the buffer area for XFD-disabled state components. 1462 */ 1463 mask &= ~xfd; 1464 1465 /* 1466 * Remove features which are valid in fpstate. They 1467 * have space allocated in fpstate. 1468 */ 1469 mask &= ~fpstate->xfeatures; 1470 1471 /* 1472 * Any remaining state components in 'mask' might be written 1473 * by XSAVE/XRSTOR. Fail validation it found. 1474 */ 1475 return !mask; 1476 } 1477 1478 void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) 1479 { 1480 WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor)); 1481 } 1482 #endif /* CONFIG_X86_DEBUG_FPU */ 1483 1484 static int __init xfd_update_static_branch(void) 1485 { 1486 /* 1487 * If init_fpstate.xfd has bits set then dynamic features are 1488 * available and the dynamic sizing must be enabled. 1489 */ 1490 if (init_fpstate.xfd) 1491 static_branch_enable(&__fpu_state_size_dynamic); 1492 return 0; 1493 } 1494 arch_initcall(xfd_update_static_branch) 1495 1496 void fpstate_free(struct fpu *fpu) 1497 { 1498 if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate) 1499 vfree(fpu->fpstate); 1500 } 1501 1502 /** 1503 * fpstate_realloc - Reallocate struct fpstate for the requested new features 1504 * 1505 * @xfeatures: A bitmap of xstate features which extend the enabled features 1506 * of that task 1507 * @ksize: The required size for the kernel buffer 1508 * @usize: The required size for user space buffers 1509 * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations 1510 * 1511 * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer 1512 * terminates quickly, vfree()-induced IPIs may be a concern, but tasks 1513 * with large states are likely to live longer. 1514 * 1515 * Returns: 0 on success, -ENOMEM on allocation error. 1516 */ 1517 static int fpstate_realloc(u64 xfeatures, unsigned int ksize, 1518 unsigned int usize, struct fpu_guest *guest_fpu) 1519 { 1520 struct fpu *fpu = ¤t->thread.fpu; 1521 struct fpstate *curfps, *newfps = NULL; 1522 unsigned int fpsize; 1523 bool in_use; 1524 1525 fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64); 1526 1527 newfps = vzalloc(fpsize); 1528 if (!newfps) 1529 return -ENOMEM; 1530 newfps->size = ksize; 1531 newfps->user_size = usize; 1532 newfps->is_valloc = true; 1533 1534 /* 1535 * When a guest FPU is supplied, use @guest_fpu->fpstate 1536 * as reference independent whether it is in use or not. 1537 */ 1538 curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate; 1539 1540 /* Determine whether @curfps is the active fpstate */ 1541 in_use = fpu->fpstate == curfps; 1542 1543 if (guest_fpu) { 1544 newfps->is_guest = true; 1545 newfps->is_confidential = curfps->is_confidential; 1546 newfps->in_use = curfps->in_use; 1547 guest_fpu->xfeatures |= xfeatures; 1548 guest_fpu->uabi_size = usize; 1549 } 1550 1551 fpregs_lock(); 1552 /* 1553 * If @curfps is in use, ensure that the current state is in the 1554 * registers before swapping fpstate as that might invalidate it 1555 * due to layout changes. 1556 */ 1557 if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD)) 1558 fpregs_restore_userregs(); 1559 1560 newfps->xfeatures = curfps->xfeatures | xfeatures; 1561 newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; 1562 newfps->xfd = curfps->xfd & ~xfeatures; 1563 1564 /* Do the final updates within the locked region */ 1565 xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures); 1566 1567 if (guest_fpu) { 1568 guest_fpu->fpstate = newfps; 1569 /* If curfps is active, update the FPU fpstate pointer */ 1570 if (in_use) 1571 fpu->fpstate = newfps; 1572 } else { 1573 fpu->fpstate = newfps; 1574 } 1575 1576 if (in_use) 1577 xfd_update_state(fpu->fpstate); 1578 fpregs_unlock(); 1579 1580 /* Only free valloc'ed state */ 1581 if (curfps && curfps->is_valloc) 1582 vfree(curfps); 1583 1584 return 0; 1585 } 1586 1587 static int validate_sigaltstack(unsigned int usize) 1588 { 1589 struct task_struct *thread, *leader = current->group_leader; 1590 unsigned long framesize = get_sigframe_size(); 1591 1592 lockdep_assert_held(¤t->sighand->siglock); 1593 1594 /* get_sigframe_size() is based on fpu_user_cfg.max_size */ 1595 framesize -= fpu_user_cfg.max_size; 1596 framesize += usize; 1597 for_each_thread(leader, thread) { 1598 if (thread->sas_ss_size && thread->sas_ss_size < framesize) 1599 return -ENOSPC; 1600 } 1601 return 0; 1602 } 1603 1604 static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) 1605 { 1606 /* 1607 * This deliberately does not exclude !XSAVES as we still might 1608 * decide to optionally context switch XCR0 or talk the silicon 1609 * vendors into extending XFD for the pre AMX states, especially 1610 * AVX512. 1611 */ 1612 bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); 1613 struct fpu *fpu = ¤t->group_leader->thread.fpu; 1614 struct fpu_state_perm *perm; 1615 unsigned int ksize, usize; 1616 u64 mask; 1617 int ret = 0; 1618 1619 /* Check whether fully enabled */ 1620 if ((permitted & requested) == requested) 1621 return 0; 1622 1623 /* Calculate the resulting kernel state size */ 1624 mask = permitted | requested; 1625 ksize = xstate_calculate_size(mask, compacted); 1626 1627 /* Calculate the resulting user state size */ 1628 mask &= XFEATURE_MASK_USER_SUPPORTED; 1629 usize = xstate_calculate_size(mask, false); 1630 1631 if (!guest) { 1632 ret = validate_sigaltstack(usize); 1633 if (ret) 1634 return ret; 1635 } 1636 1637 perm = guest ? &fpu->guest_perm : &fpu->perm; 1638 /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ 1639 WRITE_ONCE(perm->__state_perm, requested); 1640 /* Protected by sighand lock */ 1641 perm->__state_size = ksize; 1642 perm->__user_state_size = usize; 1643 return ret; 1644 } 1645 1646 /* 1647 * Permissions array to map facilities with more than one component 1648 */ 1649 static const u64 xstate_prctl_req[XFEATURE_MAX] = { 1650 [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA, 1651 }; 1652 1653 static int xstate_request_perm(unsigned long idx, bool guest) 1654 { 1655 u64 permitted, requested; 1656 int ret; 1657 1658 if (idx >= XFEATURE_MAX) 1659 return -EINVAL; 1660 1661 /* 1662 * Look up the facility mask which can require more than 1663 * one xstate component. 1664 */ 1665 idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req)); 1666 requested = xstate_prctl_req[idx]; 1667 if (!requested) 1668 return -EOPNOTSUPP; 1669 1670 if ((fpu_user_cfg.max_features & requested) != requested) 1671 return -EOPNOTSUPP; 1672 1673 /* Lockless quick check */ 1674 permitted = xstate_get_group_perm(guest); 1675 if ((permitted & requested) == requested) 1676 return 0; 1677 1678 /* Protect against concurrent modifications */ 1679 spin_lock_irq(¤t->sighand->siglock); 1680 permitted = xstate_get_group_perm(guest); 1681 1682 /* First vCPU allocation locks the permissions. */ 1683 if (guest && (permitted & FPU_GUEST_PERM_LOCKED)) 1684 ret = -EBUSY; 1685 else 1686 ret = __xstate_request_perm(permitted, requested, guest); 1687 spin_unlock_irq(¤t->sighand->siglock); 1688 return ret; 1689 } 1690 1691 int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) 1692 { 1693 u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; 1694 struct fpu_state_perm *perm; 1695 unsigned int ksize, usize; 1696 struct fpu *fpu; 1697 1698 if (!xfd_event) { 1699 if (!guest_fpu) 1700 pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); 1701 return 0; 1702 } 1703 1704 /* Protect against concurrent modifications */ 1705 spin_lock_irq(¤t->sighand->siglock); 1706 1707 /* If not permitted let it die */ 1708 if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) { 1709 spin_unlock_irq(¤t->sighand->siglock); 1710 return -EPERM; 1711 } 1712 1713 fpu = ¤t->group_leader->thread.fpu; 1714 perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; 1715 ksize = perm->__state_size; 1716 usize = perm->__user_state_size; 1717 1718 /* 1719 * The feature is permitted. State size is sufficient. Dropping 1720 * the lock is safe here even if more features are added from 1721 * another task, the retrieved buffer sizes are valid for the 1722 * currently requested feature(s). 1723 */ 1724 spin_unlock_irq(¤t->sighand->siglock); 1725 1726 /* 1727 * Try to allocate a new fpstate. If that fails there is no way 1728 * out. 1729 */ 1730 if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu)) 1731 return -EFAULT; 1732 return 0; 1733 } 1734 1735 int xfd_enable_feature(u64 xfd_err) 1736 { 1737 return __xfd_enable_feature(xfd_err, NULL); 1738 } 1739 1740 #else /* CONFIG_X86_64 */ 1741 static inline int xstate_request_perm(unsigned long idx, bool guest) 1742 { 1743 return -EPERM; 1744 } 1745 #endif /* !CONFIG_X86_64 */ 1746 1747 u64 xstate_get_guest_group_perm(void) 1748 { 1749 return xstate_get_group_perm(true); 1750 } 1751 EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); 1752 1753 /** 1754 * fpu_xstate_prctl - xstate permission operations 1755 * @tsk: Redundant pointer to current 1756 * @option: A subfunction of arch_prctl() 1757 * @arg2: option argument 1758 * Return: 0 if successful; otherwise, an error code 1759 * 1760 * Option arguments: 1761 * 1762 * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info 1763 * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info 1764 * ARCH_REQ_XCOMP_PERM: Facility number requested 1765 * 1766 * For facilities which require more than one XSTATE component, the request 1767 * must be the highest state component number related to that facility, 1768 * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and 1769 * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). 1770 */ 1771 long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2) 1772 { 1773 u64 __user *uptr = (u64 __user *)arg2; 1774 u64 permitted, supported; 1775 unsigned long idx = arg2; 1776 bool guest = false; 1777 1778 if (tsk != current) 1779 return -EPERM; 1780 1781 switch (option) { 1782 case ARCH_GET_XCOMP_SUPP: 1783 supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; 1784 return put_user(supported, uptr); 1785 1786 case ARCH_GET_XCOMP_PERM: 1787 /* 1788 * Lockless snapshot as it can also change right after the 1789 * dropping the lock. 1790 */ 1791 permitted = xstate_get_host_group_perm(); 1792 permitted &= XFEATURE_MASK_USER_SUPPORTED; 1793 return put_user(permitted, uptr); 1794 1795 case ARCH_GET_XCOMP_GUEST_PERM: 1796 permitted = xstate_get_guest_group_perm(); 1797 permitted &= XFEATURE_MASK_USER_SUPPORTED; 1798 return put_user(permitted, uptr); 1799 1800 case ARCH_REQ_XCOMP_GUEST_PERM: 1801 guest = true; 1802 fallthrough; 1803 1804 case ARCH_REQ_XCOMP_PERM: 1805 if (!IS_ENABLED(CONFIG_X86_64)) 1806 return -EOPNOTSUPP; 1807 1808 return xstate_request_perm(idx, guest); 1809 1810 default: 1811 return -EINVAL; 1812 } 1813 } 1814 1815 #ifdef CONFIG_PROC_PID_ARCH_STATUS 1816 /* 1817 * Report the amount of time elapsed in millisecond since last AVX512 1818 * use in the task. 1819 */ 1820 static void avx512_status(struct seq_file *m, struct task_struct *task) 1821 { 1822 unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp); 1823 long delta; 1824 1825 if (!timestamp) { 1826 /* 1827 * Report -1 if no AVX512 usage 1828 */ 1829 delta = -1; 1830 } else { 1831 delta = (long)(jiffies - timestamp); 1832 /* 1833 * Cap to LONG_MAX if time difference > LONG_MAX 1834 */ 1835 if (delta < 0) 1836 delta = LONG_MAX; 1837 delta = jiffies_to_msecs(delta); 1838 } 1839 1840 seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta); 1841 seq_putc(m, '\n'); 1842 } 1843 1844 /* 1845 * Report architecture specific information 1846 */ 1847 int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, 1848 struct pid *pid, struct task_struct *task) 1849 { 1850 /* 1851 * Report AVX512 state if the processor and build option supported. 1852 */ 1853 if (cpu_feature_enabled(X86_FEATURE_AVX512F)) 1854 avx512_status(m, task); 1855 1856 return 0; 1857 } 1858 #endif /* CONFIG_PROC_PID_ARCH_STATUS */ 1859