1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * xsave/xrstor support. 4 * 5 * Author: Suresh Siddha <suresh.b.siddha@intel.com> 6 */ 7 #include <linux/bitops.h> 8 #include <linux/compat.h> 9 #include <linux/cpu.h> 10 #include <linux/mman.h> 11 #include <linux/nospec.h> 12 #include <linux/pkeys.h> 13 #include <linux/seq_file.h> 14 #include <linux/proc_fs.h> 15 #include <linux/vmalloc.h> 16 17 #include <asm/fpu/api.h> 18 #include <asm/fpu/regset.h> 19 #include <asm/fpu/signal.h> 20 #include <asm/fpu/xcr.h> 21 22 #include <asm/tlbflush.h> 23 #include <asm/prctl.h> 24 #include <asm/elf.h> 25 26 #include "context.h" 27 #include "internal.h" 28 #include "legacy.h" 29 #include "xstate.h" 30 31 #define for_each_extended_xfeature(bit, mask) \ 32 (bit) = FIRST_EXTENDED_XFEATURE; \ 33 for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) 34 35 /* 36 * Although we spell it out in here, the Processor Trace 37 * xfeature is completely unused. We use other mechanisms 38 * to save/restore PT state in Linux. 39 */ 40 static const char *xfeature_names[] = 41 { 42 "x87 floating point registers", 43 "SSE registers", 44 "AVX registers", 45 "MPX bounds registers", 46 "MPX CSR", 47 "AVX-512 opmask", 48 "AVX-512 Hi256", 49 "AVX-512 ZMM_Hi256", 50 "Processor Trace (unused)", 51 "Protection Keys User registers", 52 "PASID state", 53 "Control-flow User registers", 54 "Control-flow Kernel registers (unused)", 55 "unknown xstate feature", 56 "unknown xstate feature", 57 "unknown xstate feature", 58 "unknown xstate feature", 59 "AMX Tile config", 60 "AMX Tile data", 61 "unknown xstate feature", 62 }; 63 64 static unsigned short xsave_cpuid_features[] __initdata = { 65 [XFEATURE_FP] = X86_FEATURE_FPU, 66 [XFEATURE_SSE] = X86_FEATURE_XMM, 67 [XFEATURE_YMM] = X86_FEATURE_AVX, 68 [XFEATURE_BNDREGS] = X86_FEATURE_MPX, 69 [XFEATURE_BNDCSR] = X86_FEATURE_MPX, 70 [XFEATURE_OPMASK] = X86_FEATURE_AVX512F, 71 [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, 72 [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, 73 [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, 74 [XFEATURE_PKRU] = X86_FEATURE_OSPKE, 75 [XFEATURE_PASID] = X86_FEATURE_ENQCMD, 76 [XFEATURE_CET_USER] = X86_FEATURE_SHSTK, 77 [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, 78 [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, 79 }; 80 81 static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = 82 { [ 0 ... XFEATURE_MAX - 1] = -1}; 83 static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = 84 { [ 0 ... XFEATURE_MAX - 1] = -1}; 85 static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; 86 87 #define XSTATE_FLAG_SUPERVISOR BIT(0) 88 #define XSTATE_FLAG_ALIGNED64 BIT(1) 89 90 /* 91 * Return whether the system supports a given xfeature. 92 * 93 * Also return the name of the (most advanced) feature that the caller requested: 94 */ 95 int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) 96 { 97 u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features; 98 99 if (unlikely(feature_name)) { 100 long xfeature_idx, max_idx; 101 u64 xfeatures_print; 102 /* 103 * So we use FLS here to be able to print the most advanced 104 * feature that was requested but is missing. So if a driver 105 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the 106 * missing AVX feature - this is the most informative message 107 * to users: 108 */ 109 if (xfeatures_missing) 110 xfeatures_print = xfeatures_missing; 111 else 112 xfeatures_print = xfeatures_needed; 113 114 xfeature_idx = fls64(xfeatures_print)-1; 115 max_idx = ARRAY_SIZE(xfeature_names)-1; 116 xfeature_idx = min(xfeature_idx, max_idx); 117 118 *feature_name = xfeature_names[xfeature_idx]; 119 } 120 121 if (xfeatures_missing) 122 return 0; 123 124 return 1; 125 } 126 EXPORT_SYMBOL_GPL(cpu_has_xfeatures); 127 128 static bool xfeature_is_aligned64(int xfeature_nr) 129 { 130 return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64; 131 } 132 133 static bool xfeature_is_supervisor(int xfeature_nr) 134 { 135 return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR; 136 } 137 138 static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) 139 { 140 unsigned int offs, i; 141 142 /* 143 * Non-compacted format and legacy features use the cached fixed 144 * offsets. 145 */ 146 if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) || 147 xfeature <= XFEATURE_SSE) 148 return xstate_offsets[xfeature]; 149 150 /* 151 * Compacted format offsets depend on the actual content of the 152 * compacted xsave area which is determined by the xcomp_bv header 153 * field. 154 */ 155 offs = FXSAVE_SIZE + XSAVE_HDR_SIZE; 156 for_each_extended_xfeature(i, xcomp_bv) { 157 if (xfeature_is_aligned64(i)) 158 offs = ALIGN(offs, 64); 159 if (i == xfeature) 160 break; 161 offs += xstate_sizes[i]; 162 } 163 return offs; 164 } 165 166 /* 167 * Enable the extended processor state save/restore feature. 168 * Called once per CPU onlining. 169 */ 170 void fpu__init_cpu_xstate(void) 171 { 172 if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features) 173 return; 174 175 cr4_set_bits(X86_CR4_OSXSAVE); 176 177 /* 178 * Must happen after CR4 setup and before xsetbv() to allow KVM 179 * lazy passthrough. Write independent of the dynamic state static 180 * key as that does not work on the boot CPU. This also ensures 181 * that any stale state is wiped out from XFD. 182 */ 183 if (cpu_feature_enabled(X86_FEATURE_XFD)) 184 wrmsrl(MSR_IA32_XFD, init_fpstate.xfd); 185 186 /* 187 * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features 188 * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user 189 * states can be set here. 190 */ 191 xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); 192 193 /* 194 * MSR_IA32_XSS sets supervisor states managed by XSAVES. 195 */ 196 if (boot_cpu_has(X86_FEATURE_XSAVES)) { 197 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | 198 xfeatures_mask_independent()); 199 } 200 } 201 202 static bool xfeature_enabled(enum xfeature xfeature) 203 { 204 return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); 205 } 206 207 /* 208 * Record the offsets and sizes of various xstates contained 209 * in the XSAVE state memory layout. 210 */ 211 static void __init setup_xstate_cache(void) 212 { 213 u32 eax, ebx, ecx, edx, i; 214 /* start at the beginning of the "extended state" */ 215 unsigned int last_good_offset = offsetof(struct xregs_state, 216 extended_state_area); 217 /* 218 * The FP xstates and SSE xstates are legacy states. They are always 219 * in the fixed offsets in the xsave area in either compacted form 220 * or standard form. 221 */ 222 xstate_offsets[XFEATURE_FP] = 0; 223 xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, 224 xmm_space); 225 226 xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; 227 xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, 228 xmm_space); 229 230 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { 231 cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); 232 233 xstate_sizes[i] = eax; 234 xstate_flags[i] = ecx; 235 236 /* 237 * If an xfeature is supervisor state, the offset in EBX is 238 * invalid, leave it to -1. 239 */ 240 if (xfeature_is_supervisor(i)) 241 continue; 242 243 xstate_offsets[i] = ebx; 244 245 /* 246 * In our xstate size checks, we assume that the highest-numbered 247 * xstate feature has the highest offset in the buffer. Ensure 248 * it does. 249 */ 250 WARN_ONCE(last_good_offset > xstate_offsets[i], 251 "x86/fpu: misordered xstate at %d\n", last_good_offset); 252 253 last_good_offset = xstate_offsets[i]; 254 } 255 } 256 257 static void __init print_xstate_feature(u64 xstate_mask) 258 { 259 const char *feature_name; 260 261 if (cpu_has_xfeatures(xstate_mask, &feature_name)) 262 pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name); 263 } 264 265 /* 266 * Print out all the supported xstate features: 267 */ 268 static void __init print_xstate_features(void) 269 { 270 print_xstate_feature(XFEATURE_MASK_FP); 271 print_xstate_feature(XFEATURE_MASK_SSE); 272 print_xstate_feature(XFEATURE_MASK_YMM); 273 print_xstate_feature(XFEATURE_MASK_BNDREGS); 274 print_xstate_feature(XFEATURE_MASK_BNDCSR); 275 print_xstate_feature(XFEATURE_MASK_OPMASK); 276 print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); 277 print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); 278 print_xstate_feature(XFEATURE_MASK_PKRU); 279 print_xstate_feature(XFEATURE_MASK_PASID); 280 print_xstate_feature(XFEATURE_MASK_CET_USER); 281 print_xstate_feature(XFEATURE_MASK_XTILE_CFG); 282 print_xstate_feature(XFEATURE_MASK_XTILE_DATA); 283 } 284 285 /* 286 * This check is important because it is easy to get XSTATE_* 287 * confused with XSTATE_BIT_*. 288 */ 289 #define CHECK_XFEATURE(nr) do { \ 290 WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ 291 WARN_ON(nr >= XFEATURE_MAX); \ 292 } while (0) 293 294 /* 295 * Print out xstate component offsets and sizes 296 */ 297 static void __init print_xstate_offset_size(void) 298 { 299 int i; 300 301 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { 302 pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", 303 i, xfeature_get_offset(fpu_kernel_cfg.max_features, i), 304 i, xstate_sizes[i]); 305 } 306 } 307 308 /* 309 * This function is called only during boot time when x86 caps are not set 310 * up and alternative can not be used yet. 311 */ 312 static __init void os_xrstor_booting(struct xregs_state *xstate) 313 { 314 u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE; 315 u32 lmask = mask; 316 u32 hmask = mask >> 32; 317 int err; 318 319 if (cpu_feature_enabled(X86_FEATURE_XSAVES)) 320 XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); 321 else 322 XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); 323 324 /* 325 * We should never fault when copying from a kernel buffer, and the FPU 326 * state we set at boot time should be valid. 327 */ 328 WARN_ON_FPU(err); 329 } 330 331 /* 332 * All supported features have either init state all zeros or are 333 * handled in setup_init_fpu() individually. This is an explicit 334 * feature list and does not use XFEATURE_MASK*SUPPORTED to catch 335 * newly added supported features at build time and make people 336 * actually look at the init state for the new feature. 337 */ 338 #define XFEATURES_INIT_FPSTATE_HANDLED \ 339 (XFEATURE_MASK_FP | \ 340 XFEATURE_MASK_SSE | \ 341 XFEATURE_MASK_YMM | \ 342 XFEATURE_MASK_OPMASK | \ 343 XFEATURE_MASK_ZMM_Hi256 | \ 344 XFEATURE_MASK_Hi16_ZMM | \ 345 XFEATURE_MASK_PKRU | \ 346 XFEATURE_MASK_BNDREGS | \ 347 XFEATURE_MASK_BNDCSR | \ 348 XFEATURE_MASK_PASID | \ 349 XFEATURE_MASK_CET_USER | \ 350 XFEATURE_MASK_XTILE) 351 352 /* 353 * setup the xstate image representing the init state 354 */ 355 static void __init setup_init_fpu_buf(void) 356 { 357 BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | 358 XFEATURE_MASK_SUPERVISOR_SUPPORTED) != 359 XFEATURES_INIT_FPSTATE_HANDLED); 360 361 if (!boot_cpu_has(X86_FEATURE_XSAVE)) 362 return; 363 364 print_xstate_features(); 365 366 xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures); 367 368 /* 369 * Init all the features state with header.xfeatures being 0x0 370 */ 371 os_xrstor_booting(&init_fpstate.regs.xsave); 372 373 /* 374 * All components are now in init state. Read the state back so 375 * that init_fpstate contains all non-zero init state. This only 376 * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because 377 * those use the init optimization which skips writing data for 378 * components in init state. 379 * 380 * XSAVE could be used, but that would require to reshuffle the 381 * data when XSAVEC/S is available because XSAVEC/S uses xstate 382 * compaction. But doing so is a pointless exercise because most 383 * components have an all zeros init state except for the legacy 384 * ones (FP and SSE). Those can be saved with FXSAVE into the 385 * legacy area. Adding new features requires to ensure that init 386 * state is all zeroes or if not to add the necessary handling 387 * here. 388 */ 389 fxsave(&init_fpstate.regs.fxsave); 390 } 391 392 int xfeature_size(int xfeature_nr) 393 { 394 u32 eax, ebx, ecx, edx; 395 396 CHECK_XFEATURE(xfeature_nr); 397 cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); 398 return eax; 399 } 400 401 /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ 402 static int validate_user_xstate_header(const struct xstate_header *hdr, 403 struct fpstate *fpstate) 404 { 405 /* No unknown or supervisor features may be set */ 406 if (hdr->xfeatures & ~fpstate->user_xfeatures) 407 return -EINVAL; 408 409 /* Userspace must use the uncompacted format */ 410 if (hdr->xcomp_bv) 411 return -EINVAL; 412 413 /* 414 * If 'reserved' is shrunken to add a new field, make sure to validate 415 * that new field here! 416 */ 417 BUILD_BUG_ON(sizeof(hdr->reserved) != 48); 418 419 /* No reserved bits may be set */ 420 if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved))) 421 return -EINVAL; 422 423 return 0; 424 } 425 426 static void __init __xstate_dump_leaves(void) 427 { 428 int i; 429 u32 eax, ebx, ecx, edx; 430 static int should_dump = 1; 431 432 if (!should_dump) 433 return; 434 should_dump = 0; 435 /* 436 * Dump out a few leaves past the ones that we support 437 * just in case there are some goodies up there 438 */ 439 for (i = 0; i < XFEATURE_MAX + 10; i++) { 440 cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); 441 pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", 442 XSTATE_CPUID, i, eax, ebx, ecx, edx); 443 } 444 } 445 446 #define XSTATE_WARN_ON(x, fmt, ...) do { \ 447 if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) { \ 448 __xstate_dump_leaves(); \ 449 } \ 450 } while (0) 451 452 #define XCHECK_SZ(sz, nr, __struct) ({ \ 453 if (WARN_ONCE(sz != sizeof(__struct), \ 454 "[%s]: struct is %zu bytes, cpu state %d bytes\n", \ 455 xfeature_names[nr], sizeof(__struct), sz)) { \ 456 __xstate_dump_leaves(); \ 457 } \ 458 true; \ 459 }) 460 461 462 /** 463 * check_xtile_data_against_struct - Check tile data state size. 464 * 465 * Calculate the state size by multiplying the single tile size which is 466 * recorded in a C struct, and the number of tiles that the CPU informs. 467 * Compare the provided size with the calculation. 468 * 469 * @size: The tile data state size 470 * 471 * Returns: 0 on success, -EINVAL on mismatch. 472 */ 473 static int __init check_xtile_data_against_struct(int size) 474 { 475 u32 max_palid, palid, state_size; 476 u32 eax, ebx, ecx, edx; 477 u16 max_tile; 478 479 /* 480 * Check the maximum palette id: 481 * eax: the highest numbered palette subleaf. 482 */ 483 cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx); 484 485 /* 486 * Cross-check each tile size and find the maximum number of 487 * supported tiles. 488 */ 489 for (palid = 1, max_tile = 0; palid <= max_palid; palid++) { 490 u16 tile_size, max; 491 492 /* 493 * Check the tile size info: 494 * eax[31:16]: bytes per title 495 * ebx[31:16]: the max names (or max number of tiles) 496 */ 497 cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx); 498 tile_size = eax >> 16; 499 max = ebx >> 16; 500 501 if (tile_size != sizeof(struct xtile_data)) { 502 pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n", 503 __stringify(XFEATURE_XTILE_DATA), 504 sizeof(struct xtile_data), tile_size); 505 __xstate_dump_leaves(); 506 return -EINVAL; 507 } 508 509 if (max > max_tile) 510 max_tile = max; 511 } 512 513 state_size = sizeof(struct xtile_data) * max_tile; 514 if (size != state_size) { 515 pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n", 516 __stringify(XFEATURE_XTILE_DATA), state_size, size); 517 __xstate_dump_leaves(); 518 return -EINVAL; 519 } 520 return 0; 521 } 522 523 /* 524 * We have a C struct for each 'xstate'. We need to ensure 525 * that our software representation matches what the CPU 526 * tells us about the state's size. 527 */ 528 static bool __init check_xstate_against_struct(int nr) 529 { 530 /* 531 * Ask the CPU for the size of the state. 532 */ 533 int sz = xfeature_size(nr); 534 535 /* 536 * Match each CPU state with the corresponding software 537 * structure. 538 */ 539 switch (nr) { 540 case XFEATURE_YMM: return XCHECK_SZ(sz, nr, struct ymmh_struct); 541 case XFEATURE_BNDREGS: return XCHECK_SZ(sz, nr, struct mpx_bndreg_state); 542 case XFEATURE_BNDCSR: return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state); 543 case XFEATURE_OPMASK: return XCHECK_SZ(sz, nr, struct avx_512_opmask_state); 544 case XFEATURE_ZMM_Hi256: return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state); 545 case XFEATURE_Hi16_ZMM: return XCHECK_SZ(sz, nr, struct avx_512_hi16_state); 546 case XFEATURE_PKRU: return XCHECK_SZ(sz, nr, struct pkru_state); 547 case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state); 548 case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg); 549 case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state); 550 case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true; 551 default: 552 XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); 553 return false; 554 } 555 556 return true; 557 } 558 559 static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) 560 { 561 unsigned int topmost = fls64(xfeatures) - 1; 562 unsigned int offset = xstate_offsets[topmost]; 563 564 if (topmost <= XFEATURE_SSE) 565 return sizeof(struct xregs_state); 566 567 if (compacted) 568 offset = xfeature_get_offset(xfeatures, topmost); 569 return offset + xstate_sizes[topmost]; 570 } 571 572 /* 573 * This essentially double-checks what the cpu told us about 574 * how large the XSAVE buffer needs to be. We are recalculating 575 * it to be safe. 576 * 577 * Independent XSAVE features allocate their own buffers and are not 578 * covered by these checks. Only the size of the buffer for task->fpu 579 * is checked here. 580 */ 581 static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) 582 { 583 bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); 584 bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES); 585 unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; 586 int i; 587 588 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { 589 if (!check_xstate_against_struct(i)) 590 return false; 591 /* 592 * Supervisor state components can be managed only by 593 * XSAVES. 594 */ 595 if (!xsaves && xfeature_is_supervisor(i)) { 596 XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i); 597 return false; 598 } 599 } 600 size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); 601 XSTATE_WARN_ON(size != kernel_size, 602 "size %u != kernel_size %u\n", size, kernel_size); 603 return size == kernel_size; 604 } 605 606 /* 607 * Get total size of enabled xstates in XCR0 | IA32_XSS. 608 * 609 * Note the SDM's wording here. "sub-function 0" only enumerates 610 * the size of the *user* states. If we use it to size a buffer 611 * that we use 'XSAVES' on, we could potentially overflow the 612 * buffer because 'XSAVES' saves system states too. 613 * 614 * This also takes compaction into account. So this works for 615 * XSAVEC as well. 616 */ 617 static unsigned int __init get_compacted_size(void) 618 { 619 unsigned int eax, ebx, ecx, edx; 620 /* 621 * - CPUID function 0DH, sub-function 1: 622 * EBX enumerates the size (in bytes) required by 623 * the XSAVES instruction for an XSAVE area 624 * containing all the state components 625 * corresponding to bits currently set in 626 * XCR0 | IA32_XSS. 627 * 628 * When XSAVES is not available but XSAVEC is (virt), then there 629 * are no supervisor states, but XSAVEC still uses compacted 630 * format. 631 */ 632 cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); 633 return ebx; 634 } 635 636 /* 637 * Get the total size of the enabled xstates without the independent supervisor 638 * features. 639 */ 640 static unsigned int __init get_xsave_compacted_size(void) 641 { 642 u64 mask = xfeatures_mask_independent(); 643 unsigned int size; 644 645 if (!mask) 646 return get_compacted_size(); 647 648 /* Disable independent features. */ 649 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); 650 651 /* 652 * Ask the hardware what size is required of the buffer. 653 * This is the size required for the task->fpu buffer. 654 */ 655 size = get_compacted_size(); 656 657 /* Re-enable independent features so XSAVES will work on them again. */ 658 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); 659 660 return size; 661 } 662 663 static unsigned int __init get_xsave_size_user(void) 664 { 665 unsigned int eax, ebx, ecx, edx; 666 /* 667 * - CPUID function 0DH, sub-function 0: 668 * EBX enumerates the size (in bytes) required by 669 * the XSAVE instruction for an XSAVE area 670 * containing all the *user* state components 671 * corresponding to bits currently set in XCR0. 672 */ 673 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); 674 return ebx; 675 } 676 677 static int __init init_xstate_size(void) 678 { 679 /* Recompute the context size for enabled features: */ 680 unsigned int user_size, kernel_size, kernel_default_size; 681 bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); 682 683 /* Uncompacted user space size */ 684 user_size = get_xsave_size_user(); 685 686 /* 687 * XSAVES kernel size includes supervisor states and uses compacted 688 * format. XSAVEC uses compacted format, but does not save 689 * supervisor states. 690 * 691 * XSAVE[OPT] do not support supervisor states so kernel and user 692 * size is identical. 693 */ 694 if (compacted) 695 kernel_size = get_xsave_compacted_size(); 696 else 697 kernel_size = user_size; 698 699 kernel_default_size = 700 xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); 701 702 if (!paranoid_xstate_size_valid(kernel_size)) 703 return -EINVAL; 704 705 fpu_kernel_cfg.max_size = kernel_size; 706 fpu_user_cfg.max_size = user_size; 707 708 fpu_kernel_cfg.default_size = kernel_default_size; 709 fpu_user_cfg.default_size = 710 xstate_calculate_size(fpu_user_cfg.default_features, false); 711 712 return 0; 713 } 714 715 /* 716 * We enabled the XSAVE hardware, but something went wrong and 717 * we can not use it. Disable it. 718 */ 719 static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) 720 { 721 fpu_kernel_cfg.max_features = 0; 722 cr4_clear_bits(X86_CR4_OSXSAVE); 723 setup_clear_cpu_cap(X86_FEATURE_XSAVE); 724 725 /* Restore the legacy size.*/ 726 fpu_kernel_cfg.max_size = legacy_size; 727 fpu_kernel_cfg.default_size = legacy_size; 728 fpu_user_cfg.max_size = legacy_size; 729 fpu_user_cfg.default_size = legacy_size; 730 731 /* 732 * Prevent enabling the static branch which enables writes to the 733 * XFD MSR. 734 */ 735 init_fpstate.xfd = 0; 736 737 fpstate_reset(¤t->thread.fpu); 738 } 739 740 /* 741 * Enable and initialize the xsave feature. 742 * Called once per system bootup. 743 */ 744 void __init fpu__init_system_xstate(unsigned int legacy_size) 745 { 746 unsigned int eax, ebx, ecx, edx; 747 u64 xfeatures; 748 int err; 749 int i; 750 751 if (!boot_cpu_has(X86_FEATURE_FPU)) { 752 pr_info("x86/fpu: No FPU detected\n"); 753 return; 754 } 755 756 if (!boot_cpu_has(X86_FEATURE_XSAVE)) { 757 pr_info("x86/fpu: x87 FPU will use %s\n", 758 boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE"); 759 return; 760 } 761 762 if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { 763 WARN_ON_FPU(1); 764 return; 765 } 766 767 /* 768 * Find user xstates supported by the processor. 769 */ 770 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); 771 fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); 772 773 /* 774 * Find supervisor xstates supported by the processor. 775 */ 776 cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); 777 fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); 778 779 if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { 780 /* 781 * This indicates that something really unexpected happened 782 * with the enumeration. Disable XSAVE and try to continue 783 * booting without it. This is too early to BUG(). 784 */ 785 pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", 786 fpu_kernel_cfg.max_features); 787 goto out_disable; 788 } 789 790 /* 791 * Clear XSAVE features that are disabled in the normal CPUID. 792 */ 793 for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { 794 unsigned short cid = xsave_cpuid_features[i]; 795 796 /* Careful: X86_FEATURE_FPU is 0! */ 797 if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid)) 798 fpu_kernel_cfg.max_features &= ~BIT_ULL(i); 799 } 800 801 if (!cpu_feature_enabled(X86_FEATURE_XFD)) 802 fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; 803 804 if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) 805 fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; 806 else 807 fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | 808 XFEATURE_MASK_SUPERVISOR_SUPPORTED; 809 810 fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; 811 fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; 812 813 /* Clean out dynamic features from default */ 814 fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features; 815 fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; 816 817 fpu_user_cfg.default_features = fpu_user_cfg.max_features; 818 fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; 819 820 /* Store it for paranoia check at the end */ 821 xfeatures = fpu_kernel_cfg.max_features; 822 823 /* 824 * Initialize the default XFD state in initfp_state and enable the 825 * dynamic sizing mechanism if dynamic states are available. The 826 * static key cannot be enabled here because this runs before 827 * jump_label_init(). This is delayed to an initcall. 828 */ 829 init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; 830 831 /* Set up compaction feature bit */ 832 if (cpu_feature_enabled(X86_FEATURE_XSAVEC) || 833 cpu_feature_enabled(X86_FEATURE_XSAVES)) 834 setup_force_cpu_cap(X86_FEATURE_XCOMPACTED); 835 836 /* Enable xstate instructions to be able to continue with initialization: */ 837 fpu__init_cpu_xstate(); 838 839 /* Cache size, offset and flags for initialization */ 840 setup_xstate_cache(); 841 842 err = init_xstate_size(); 843 if (err) 844 goto out_disable; 845 846 /* Reset the state for the current task */ 847 fpstate_reset(¤t->thread.fpu); 848 849 /* 850 * Update info used for ptrace frames; use standard-format size and no 851 * supervisor xstates: 852 */ 853 update_regset_xstate_info(fpu_user_cfg.max_size, 854 fpu_user_cfg.max_features); 855 856 /* 857 * init_fpstate excludes dynamic states as they are large but init 858 * state is zero. 859 */ 860 init_fpstate.size = fpu_kernel_cfg.default_size; 861 init_fpstate.xfeatures = fpu_kernel_cfg.default_features; 862 863 if (init_fpstate.size > sizeof(init_fpstate.regs)) { 864 pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n", 865 sizeof(init_fpstate.regs), init_fpstate.size); 866 goto out_disable; 867 } 868 869 setup_init_fpu_buf(); 870 871 /* 872 * Paranoia check whether something in the setup modified the 873 * xfeatures mask. 874 */ 875 if (xfeatures != fpu_kernel_cfg.max_features) { 876 pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n", 877 xfeatures, fpu_kernel_cfg.max_features); 878 goto out_disable; 879 } 880 881 /* 882 * CPU capabilities initialization runs before FPU init. So 883 * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely 884 * functional, set the feature bit so depending code works. 885 */ 886 setup_force_cpu_cap(X86_FEATURE_OSXSAVE); 887 888 print_xstate_offset_size(); 889 pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", 890 fpu_kernel_cfg.max_features, 891 fpu_kernel_cfg.max_size, 892 boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard"); 893 return; 894 895 out_disable: 896 /* something went wrong, try to boot without any XSAVE support */ 897 fpu__init_disable_system_xstate(legacy_size); 898 } 899 900 /* 901 * Restore minimal FPU state after suspend: 902 */ 903 void fpu__resume_cpu(void) 904 { 905 /* 906 * Restore XCR0 on xsave capable CPUs: 907 */ 908 if (cpu_feature_enabled(X86_FEATURE_XSAVE)) 909 xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); 910 911 /* 912 * Restore IA32_XSS. The same CPUID bit enumerates support 913 * of XSAVES and MSR_IA32_XSS. 914 */ 915 if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { 916 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | 917 xfeatures_mask_independent()); 918 } 919 920 if (fpu_state_size_dynamic()) 921 wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd); 922 } 923 924 /* 925 * Given an xstate feature nr, calculate where in the xsave 926 * buffer the state is. Callers should ensure that the buffer 927 * is valid. 928 */ 929 static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) 930 { 931 u64 xcomp_bv = xsave->header.xcomp_bv; 932 933 if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) 934 return NULL; 935 936 if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) { 937 if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr)))) 938 return NULL; 939 } 940 941 return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr); 942 } 943 944 /* 945 * Given the xsave area and a state inside, this function returns the 946 * address of the state. 947 * 948 * This is the API that is called to get xstate address in either 949 * standard format or compacted format of xsave area. 950 * 951 * Note that if there is no data for the field in the xsave buffer 952 * this will return NULL. 953 * 954 * Inputs: 955 * xstate: the thread's storage area for all FPU data 956 * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, 957 * XFEATURE_SSE, etc...) 958 * Output: 959 * address of the state in the xsave area, or NULL if the 960 * field is not present in the xsave buffer. 961 */ 962 void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) 963 { 964 /* 965 * Do we even *have* xsave state? 966 */ 967 if (!boot_cpu_has(X86_FEATURE_XSAVE)) 968 return NULL; 969 970 /* 971 * We should not ever be requesting features that we 972 * have not enabled. 973 */ 974 if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) 975 return NULL; 976 977 /* 978 * This assumes the last 'xsave*' instruction to 979 * have requested that 'xfeature_nr' be saved. 980 * If it did not, we might be seeing and old value 981 * of the field in the buffer. 982 * 983 * This can happen because the last 'xsave' did not 984 * request that this feature be saved (unlikely) 985 * or because the "init optimization" caused it 986 * to not be saved. 987 */ 988 if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr))) 989 return NULL; 990 991 return __raw_xsave_addr(xsave, xfeature_nr); 992 } 993 994 #ifdef CONFIG_ARCH_HAS_PKEYS 995 996 /* 997 * This will go out and modify PKRU register to set the access 998 * rights for @pkey to @init_val. 999 */ 1000 int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, 1001 unsigned long init_val) 1002 { 1003 u32 old_pkru, new_pkru_bits = 0; 1004 int pkey_shift; 1005 1006 /* 1007 * This check implies XSAVE support. OSPKE only gets 1008 * set if we enable XSAVE and we enable PKU in XCR0. 1009 */ 1010 if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) 1011 return -EINVAL; 1012 1013 /* 1014 * This code should only be called with valid 'pkey' 1015 * values originating from in-kernel users. Complain 1016 * if a bad value is observed. 1017 */ 1018 if (WARN_ON_ONCE(pkey >= arch_max_pkey())) 1019 return -EINVAL; 1020 1021 /* Set the bits we need in PKRU: */ 1022 if (init_val & PKEY_DISABLE_ACCESS) 1023 new_pkru_bits |= PKRU_AD_BIT; 1024 if (init_val & PKEY_DISABLE_WRITE) 1025 new_pkru_bits |= PKRU_WD_BIT; 1026 1027 /* Shift the bits in to the correct place in PKRU for pkey: */ 1028 pkey_shift = pkey * PKRU_BITS_PER_PKEY; 1029 new_pkru_bits <<= pkey_shift; 1030 1031 /* Get old PKRU and mask off any old bits in place: */ 1032 old_pkru = read_pkru(); 1033 old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); 1034 1035 /* Write old part along with new part: */ 1036 write_pkru(old_pkru | new_pkru_bits); 1037 1038 return 0; 1039 } 1040 #endif /* ! CONFIG_ARCH_HAS_PKEYS */ 1041 1042 static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, 1043 void *init_xstate, unsigned int size) 1044 { 1045 membuf_write(to, from_xstate ? xstate : init_xstate, size); 1046 } 1047 1048 /** 1049 * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer 1050 * @to: membuf descriptor 1051 * @fpstate: The fpstate buffer from which to copy 1052 * @xfeatures: The mask of xfeatures to save (XSAVE mode only) 1053 * @pkru_val: The PKRU value to store in the PKRU component 1054 * @copy_mode: The requested copy mode 1055 * 1056 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming 1057 * format, i.e. from the kernel internal hardware dependent storage format 1058 * to the requested @mode. UABI XSTATE is always uncompacted! 1059 * 1060 * It supports partial copy but @to.pos always starts from zero. 1061 */ 1062 void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, 1063 u64 xfeatures, u32 pkru_val, 1064 enum xstate_copy_mode copy_mode) 1065 { 1066 const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); 1067 struct xregs_state *xinit = &init_fpstate.regs.xsave; 1068 struct xregs_state *xsave = &fpstate->regs.xsave; 1069 struct xstate_header header; 1070 unsigned int zerofrom; 1071 u64 mask; 1072 int i; 1073 1074 memset(&header, 0, sizeof(header)); 1075 header.xfeatures = xsave->header.xfeatures; 1076 1077 /* Mask out the feature bits depending on copy mode */ 1078 switch (copy_mode) { 1079 case XSTATE_COPY_FP: 1080 header.xfeatures &= XFEATURE_MASK_FP; 1081 break; 1082 1083 case XSTATE_COPY_FX: 1084 header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE; 1085 break; 1086 1087 case XSTATE_COPY_XSAVE: 1088 header.xfeatures &= fpstate->user_xfeatures & xfeatures; 1089 break; 1090 } 1091 1092 /* Copy FP state up to MXCSR */ 1093 copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387, 1094 &xinit->i387, off_mxcsr); 1095 1096 /* Copy MXCSR when SSE or YMM are set in the feature mask */ 1097 copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM), 1098 &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr, 1099 MXCSR_AND_FLAGS_SIZE); 1100 1101 /* Copy the remaining FP state */ 1102 copy_feature(header.xfeatures & XFEATURE_MASK_FP, 1103 &to, &xsave->i387.st_space, &xinit->i387.st_space, 1104 sizeof(xsave->i387.st_space)); 1105 1106 /* Copy the SSE state - shared with YMM, but independently managed */ 1107 copy_feature(header.xfeatures & XFEATURE_MASK_SSE, 1108 &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space, 1109 sizeof(xsave->i387.xmm_space)); 1110 1111 if (copy_mode != XSTATE_COPY_XSAVE) 1112 goto out; 1113 1114 /* Zero the padding area */ 1115 membuf_zero(&to, sizeof(xsave->i387.padding)); 1116 1117 /* Copy xsave->i387.sw_reserved */ 1118 membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved)); 1119 1120 /* Copy the user space relevant state of @xsave->header */ 1121 membuf_write(&to, &header, sizeof(header)); 1122 1123 zerofrom = offsetof(struct xregs_state, extended_state_area); 1124 1125 /* 1126 * This 'mask' indicates which states to copy from fpstate. 1127 * Those extended states that are not present in fpstate are 1128 * either disabled or initialized: 1129 * 1130 * In non-compacted format, disabled features still occupy 1131 * state space but there is no state to copy from in the 1132 * compacted init_fpstate. The gap tracking will zero these 1133 * states. 1134 * 1135 * The extended features have an all zeroes init state. Thus, 1136 * remove them from 'mask' to zero those features in the user 1137 * buffer instead of retrieving them from init_fpstate. 1138 */ 1139 mask = header.xfeatures; 1140 1141 for_each_extended_xfeature(i, mask) { 1142 /* 1143 * If there was a feature or alignment gap, zero the space 1144 * in the destination buffer. 1145 */ 1146 if (zerofrom < xstate_offsets[i]) 1147 membuf_zero(&to, xstate_offsets[i] - zerofrom); 1148 1149 if (i == XFEATURE_PKRU) { 1150 struct pkru_state pkru = {0}; 1151 /* 1152 * PKRU is not necessarily up to date in the 1153 * XSAVE buffer. Use the provided value. 1154 */ 1155 pkru.pkru = pkru_val; 1156 membuf_write(&to, &pkru, sizeof(pkru)); 1157 } else { 1158 membuf_write(&to, 1159 __raw_xsave_addr(xsave, i), 1160 xstate_sizes[i]); 1161 } 1162 /* 1163 * Keep track of the last copied state in the non-compacted 1164 * target buffer for gap zeroing. 1165 */ 1166 zerofrom = xstate_offsets[i] + xstate_sizes[i]; 1167 } 1168 1169 out: 1170 if (to.left) 1171 membuf_zero(&to, to.left); 1172 } 1173 1174 /** 1175 * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer 1176 * @to: membuf descriptor 1177 * @tsk: The task from which to copy the saved xstate 1178 * @copy_mode: The requested copy mode 1179 * 1180 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming 1181 * format, i.e. from the kernel internal hardware dependent storage format 1182 * to the requested @mode. UABI XSTATE is always uncompacted! 1183 * 1184 * It supports partial copy but @to.pos always starts from zero. 1185 */ 1186 void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, 1187 enum xstate_copy_mode copy_mode) 1188 { 1189 __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, 1190 tsk->thread.fpu.fpstate->user_xfeatures, 1191 tsk->thread.pkru, copy_mode); 1192 } 1193 1194 static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, 1195 const void *kbuf, const void __user *ubuf) 1196 { 1197 if (kbuf) { 1198 memcpy(dst, kbuf + offset, size); 1199 } else { 1200 if (copy_from_user(dst, ubuf + offset, size)) 1201 return -EFAULT; 1202 } 1203 return 0; 1204 } 1205 1206 1207 /** 1208 * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate 1209 * @fpstate: The fpstate buffer to copy to 1210 * @kbuf: The UABI format buffer, if it comes from the kernel 1211 * @ubuf: The UABI format buffer, if it comes from userspace 1212 * @pkru: The location to write the PKRU value to 1213 * 1214 * Converts from the UABI format into the kernel internal hardware 1215 * dependent format. 1216 * 1217 * This function ultimately has three different callers with distinct PKRU 1218 * behavior. 1219 * 1. When called from sigreturn the PKRU register will be restored from 1220 * @fpstate via an XRSTOR. Correctly copying the UABI format buffer to 1221 * @fpstate is sufficient to cover this case, but the caller will also 1222 * pass a pointer to the thread_struct's pkru field in @pkru and updating 1223 * it is harmless. 1224 * 2. When called from ptrace the PKRU register will be restored from the 1225 * thread_struct's pkru field. A pointer to that is passed in @pkru. 1226 * The kernel will restore it manually, so the XRSTOR behavior that resets 1227 * the PKRU register to the hardware init value (0) if the corresponding 1228 * xfeatures bit is not set is emulated here. 1229 * 3. When called from KVM the PKRU register will be restored from the vcpu's 1230 * pkru field. A pointer to that is passed in @pkru. KVM hasn't used 1231 * XRSTOR and hasn't had the PKRU resetting behavior described above. To 1232 * preserve that KVM behavior, it passes NULL for @pkru if the xfeatures 1233 * bit is not set. 1234 */ 1235 static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, 1236 const void __user *ubuf, u32 *pkru) 1237 { 1238 struct xregs_state *xsave = &fpstate->regs.xsave; 1239 unsigned int offset, size; 1240 struct xstate_header hdr; 1241 u64 mask; 1242 int i; 1243 1244 offset = offsetof(struct xregs_state, header); 1245 if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf)) 1246 return -EFAULT; 1247 1248 if (validate_user_xstate_header(&hdr, fpstate)) 1249 return -EINVAL; 1250 1251 /* Validate MXCSR when any of the related features is in use */ 1252 mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM; 1253 if (hdr.xfeatures & mask) { 1254 u32 mxcsr[2]; 1255 1256 offset = offsetof(struct fxregs_state, mxcsr); 1257 if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf)) 1258 return -EFAULT; 1259 1260 /* Reserved bits in MXCSR must be zero. */ 1261 if (mxcsr[0] & ~mxcsr_feature_mask) 1262 return -EINVAL; 1263 1264 /* SSE and YMM require MXCSR even when FP is not in use. */ 1265 if (!(hdr.xfeatures & XFEATURE_MASK_FP)) { 1266 xsave->i387.mxcsr = mxcsr[0]; 1267 xsave->i387.mxcsr_mask = mxcsr[1]; 1268 } 1269 } 1270 1271 for (i = 0; i < XFEATURE_MAX; i++) { 1272 mask = BIT_ULL(i); 1273 1274 if (hdr.xfeatures & mask) { 1275 void *dst = __raw_xsave_addr(xsave, i); 1276 1277 offset = xstate_offsets[i]; 1278 size = xstate_sizes[i]; 1279 1280 if (copy_from_buffer(dst, offset, size, kbuf, ubuf)) 1281 return -EFAULT; 1282 } 1283 } 1284 1285 if (hdr.xfeatures & XFEATURE_MASK_PKRU) { 1286 struct pkru_state *xpkru; 1287 1288 xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU); 1289 *pkru = xpkru->pkru; 1290 } else { 1291 /* 1292 * KVM may pass NULL here to indicate that it does not need 1293 * PKRU updated. 1294 */ 1295 if (pkru) 1296 *pkru = 0; 1297 } 1298 1299 /* 1300 * The state that came in from userspace was user-state only. 1301 * Mask all the user states out of 'xfeatures': 1302 */ 1303 xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; 1304 1305 /* 1306 * Add back in the features that came in from userspace: 1307 */ 1308 xsave->header.xfeatures |= hdr.xfeatures; 1309 1310 return 0; 1311 } 1312 1313 /* 1314 * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] 1315 * format and copy to the target thread. Used by ptrace and KVM. 1316 */ 1317 int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru) 1318 { 1319 return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru); 1320 } 1321 1322 /* 1323 * Convert from a sigreturn standard-format user-space buffer to kernel 1324 * XSAVE[S] format and copy to the target thread. This is called from the 1325 * sigreturn() and rt_sigreturn() system calls. 1326 */ 1327 int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, 1328 const void __user *ubuf) 1329 { 1330 return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru); 1331 } 1332 1333 static bool validate_independent_components(u64 mask) 1334 { 1335 u64 xchk; 1336 1337 if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES))) 1338 return false; 1339 1340 xchk = ~xfeatures_mask_independent(); 1341 1342 if (WARN_ON_ONCE(!mask || mask & xchk)) 1343 return false; 1344 1345 return true; 1346 } 1347 1348 /** 1349 * xsaves - Save selected components to a kernel xstate buffer 1350 * @xstate: Pointer to the buffer 1351 * @mask: Feature mask to select the components to save 1352 * 1353 * The @xstate buffer must be 64 byte aligned and correctly initialized as 1354 * XSAVES does not write the full xstate header. Before first use the 1355 * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer 1356 * can #GP. 1357 * 1358 * The feature mask must be a subset of the independent features. 1359 */ 1360 void xsaves(struct xregs_state *xstate, u64 mask) 1361 { 1362 int err; 1363 1364 if (!validate_independent_components(mask)) 1365 return; 1366 1367 XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err); 1368 WARN_ON_ONCE(err); 1369 } 1370 1371 /** 1372 * xrstors - Restore selected components from a kernel xstate buffer 1373 * @xstate: Pointer to the buffer 1374 * @mask: Feature mask to select the components to restore 1375 * 1376 * The @xstate buffer must be 64 byte aligned and correctly initialized 1377 * otherwise XRSTORS from that buffer can #GP. 1378 * 1379 * Proper usage is to restore the state which was saved with 1380 * xsaves() into @xstate. 1381 * 1382 * The feature mask must be a subset of the independent features. 1383 */ 1384 void xrstors(struct xregs_state *xstate, u64 mask) 1385 { 1386 int err; 1387 1388 if (!validate_independent_components(mask)) 1389 return; 1390 1391 XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); 1392 WARN_ON_ONCE(err); 1393 } 1394 1395 #if IS_ENABLED(CONFIG_KVM) 1396 void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) 1397 { 1398 void *addr = get_xsave_addr(&fps->regs.xsave, xfeature); 1399 1400 if (addr) 1401 memset(addr, 0, xstate_sizes[xfeature]); 1402 } 1403 EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); 1404 #endif 1405 1406 #ifdef CONFIG_X86_64 1407 1408 #ifdef CONFIG_X86_DEBUG_FPU 1409 /* 1410 * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask 1411 * can safely operate on the @fpstate buffer. 1412 */ 1413 static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) 1414 { 1415 u64 xfd = __this_cpu_read(xfd_state); 1416 1417 if (fpstate->xfd == xfd) 1418 return true; 1419 1420 /* 1421 * The XFD MSR does not match fpstate->xfd. That's invalid when 1422 * the passed in fpstate is current's fpstate. 1423 */ 1424 if (fpstate->xfd == current->thread.fpu.fpstate->xfd) 1425 return false; 1426 1427 /* 1428 * XRSTOR(S) from init_fpstate are always correct as it will just 1429 * bring all components into init state and not read from the 1430 * buffer. XSAVE(S) raises #PF after init. 1431 */ 1432 if (fpstate == &init_fpstate) 1433 return rstor; 1434 1435 /* 1436 * XSAVE(S): clone(), fpu_swap_kvm_fpu() 1437 * XRSTORS(S): fpu_swap_kvm_fpu() 1438 */ 1439 1440 /* 1441 * No XSAVE/XRSTOR instructions (except XSAVE itself) touch 1442 * the buffer area for XFD-disabled state components. 1443 */ 1444 mask &= ~xfd; 1445 1446 /* 1447 * Remove features which are valid in fpstate. They 1448 * have space allocated in fpstate. 1449 */ 1450 mask &= ~fpstate->xfeatures; 1451 1452 /* 1453 * Any remaining state components in 'mask' might be written 1454 * by XSAVE/XRSTOR. Fail validation it found. 1455 */ 1456 return !mask; 1457 } 1458 1459 void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) 1460 { 1461 WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor)); 1462 } 1463 #endif /* CONFIG_X86_DEBUG_FPU */ 1464 1465 static int __init xfd_update_static_branch(void) 1466 { 1467 /* 1468 * If init_fpstate.xfd has bits set then dynamic features are 1469 * available and the dynamic sizing must be enabled. 1470 */ 1471 if (init_fpstate.xfd) 1472 static_branch_enable(&__fpu_state_size_dynamic); 1473 return 0; 1474 } 1475 arch_initcall(xfd_update_static_branch) 1476 1477 void fpstate_free(struct fpu *fpu) 1478 { 1479 if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate) 1480 vfree(fpu->fpstate); 1481 } 1482 1483 /** 1484 * fpstate_realloc - Reallocate struct fpstate for the requested new features 1485 * 1486 * @xfeatures: A bitmap of xstate features which extend the enabled features 1487 * of that task 1488 * @ksize: The required size for the kernel buffer 1489 * @usize: The required size for user space buffers 1490 * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations 1491 * 1492 * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer 1493 * terminates quickly, vfree()-induced IPIs may be a concern, but tasks 1494 * with large states are likely to live longer. 1495 * 1496 * Returns: 0 on success, -ENOMEM on allocation error. 1497 */ 1498 static int fpstate_realloc(u64 xfeatures, unsigned int ksize, 1499 unsigned int usize, struct fpu_guest *guest_fpu) 1500 { 1501 struct fpu *fpu = ¤t->thread.fpu; 1502 struct fpstate *curfps, *newfps = NULL; 1503 unsigned int fpsize; 1504 bool in_use; 1505 1506 fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64); 1507 1508 newfps = vzalloc(fpsize); 1509 if (!newfps) 1510 return -ENOMEM; 1511 newfps->size = ksize; 1512 newfps->user_size = usize; 1513 newfps->is_valloc = true; 1514 1515 /* 1516 * When a guest FPU is supplied, use @guest_fpu->fpstate 1517 * as reference independent whether it is in use or not. 1518 */ 1519 curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate; 1520 1521 /* Determine whether @curfps is the active fpstate */ 1522 in_use = fpu->fpstate == curfps; 1523 1524 if (guest_fpu) { 1525 newfps->is_guest = true; 1526 newfps->is_confidential = curfps->is_confidential; 1527 newfps->in_use = curfps->in_use; 1528 guest_fpu->xfeatures |= xfeatures; 1529 guest_fpu->uabi_size = usize; 1530 } 1531 1532 fpregs_lock(); 1533 /* 1534 * If @curfps is in use, ensure that the current state is in the 1535 * registers before swapping fpstate as that might invalidate it 1536 * due to layout changes. 1537 */ 1538 if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD)) 1539 fpregs_restore_userregs(); 1540 1541 newfps->xfeatures = curfps->xfeatures | xfeatures; 1542 newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; 1543 newfps->xfd = curfps->xfd & ~xfeatures; 1544 1545 /* Do the final updates within the locked region */ 1546 xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures); 1547 1548 if (guest_fpu) { 1549 guest_fpu->fpstate = newfps; 1550 /* If curfps is active, update the FPU fpstate pointer */ 1551 if (in_use) 1552 fpu->fpstate = newfps; 1553 } else { 1554 fpu->fpstate = newfps; 1555 } 1556 1557 if (in_use) 1558 xfd_update_state(fpu->fpstate); 1559 fpregs_unlock(); 1560 1561 /* Only free valloc'ed state */ 1562 if (curfps && curfps->is_valloc) 1563 vfree(curfps); 1564 1565 return 0; 1566 } 1567 1568 static int validate_sigaltstack(unsigned int usize) 1569 { 1570 struct task_struct *thread, *leader = current->group_leader; 1571 unsigned long framesize = get_sigframe_size(); 1572 1573 lockdep_assert_held(¤t->sighand->siglock); 1574 1575 /* get_sigframe_size() is based on fpu_user_cfg.max_size */ 1576 framesize -= fpu_user_cfg.max_size; 1577 framesize += usize; 1578 for_each_thread(leader, thread) { 1579 if (thread->sas_ss_size && thread->sas_ss_size < framesize) 1580 return -ENOSPC; 1581 } 1582 return 0; 1583 } 1584 1585 static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) 1586 { 1587 /* 1588 * This deliberately does not exclude !XSAVES as we still might 1589 * decide to optionally context switch XCR0 or talk the silicon 1590 * vendors into extending XFD for the pre AMX states, especially 1591 * AVX512. 1592 */ 1593 bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); 1594 struct fpu *fpu = ¤t->group_leader->thread.fpu; 1595 struct fpu_state_perm *perm; 1596 unsigned int ksize, usize; 1597 u64 mask; 1598 int ret = 0; 1599 1600 /* Check whether fully enabled */ 1601 if ((permitted & requested) == requested) 1602 return 0; 1603 1604 /* Calculate the resulting kernel state size */ 1605 mask = permitted | requested; 1606 /* Take supervisor states into account on the host */ 1607 if (!guest) 1608 mask |= xfeatures_mask_supervisor(); 1609 ksize = xstate_calculate_size(mask, compacted); 1610 1611 /* Calculate the resulting user state size */ 1612 mask &= XFEATURE_MASK_USER_SUPPORTED; 1613 usize = xstate_calculate_size(mask, false); 1614 1615 if (!guest) { 1616 ret = validate_sigaltstack(usize); 1617 if (ret) 1618 return ret; 1619 } 1620 1621 perm = guest ? &fpu->guest_perm : &fpu->perm; 1622 /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ 1623 WRITE_ONCE(perm->__state_perm, mask); 1624 /* Protected by sighand lock */ 1625 perm->__state_size = ksize; 1626 perm->__user_state_size = usize; 1627 return ret; 1628 } 1629 1630 /* 1631 * Permissions array to map facilities with more than one component 1632 */ 1633 static const u64 xstate_prctl_req[XFEATURE_MAX] = { 1634 [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA, 1635 }; 1636 1637 static int xstate_request_perm(unsigned long idx, bool guest) 1638 { 1639 u64 permitted, requested; 1640 int ret; 1641 1642 if (idx >= XFEATURE_MAX) 1643 return -EINVAL; 1644 1645 /* 1646 * Look up the facility mask which can require more than 1647 * one xstate component. 1648 */ 1649 idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req)); 1650 requested = xstate_prctl_req[idx]; 1651 if (!requested) 1652 return -EOPNOTSUPP; 1653 1654 if ((fpu_user_cfg.max_features & requested) != requested) 1655 return -EOPNOTSUPP; 1656 1657 /* Lockless quick check */ 1658 permitted = xstate_get_group_perm(guest); 1659 if ((permitted & requested) == requested) 1660 return 0; 1661 1662 /* Protect against concurrent modifications */ 1663 spin_lock_irq(¤t->sighand->siglock); 1664 permitted = xstate_get_group_perm(guest); 1665 1666 /* First vCPU allocation locks the permissions. */ 1667 if (guest && (permitted & FPU_GUEST_PERM_LOCKED)) 1668 ret = -EBUSY; 1669 else 1670 ret = __xstate_request_perm(permitted, requested, guest); 1671 spin_unlock_irq(¤t->sighand->siglock); 1672 return ret; 1673 } 1674 1675 int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) 1676 { 1677 u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; 1678 struct fpu_state_perm *perm; 1679 unsigned int ksize, usize; 1680 struct fpu *fpu; 1681 1682 if (!xfd_event) { 1683 if (!guest_fpu) 1684 pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); 1685 return 0; 1686 } 1687 1688 /* Protect against concurrent modifications */ 1689 spin_lock_irq(¤t->sighand->siglock); 1690 1691 /* If not permitted let it die */ 1692 if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) { 1693 spin_unlock_irq(¤t->sighand->siglock); 1694 return -EPERM; 1695 } 1696 1697 fpu = ¤t->group_leader->thread.fpu; 1698 perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; 1699 ksize = perm->__state_size; 1700 usize = perm->__user_state_size; 1701 1702 /* 1703 * The feature is permitted. State size is sufficient. Dropping 1704 * the lock is safe here even if more features are added from 1705 * another task, the retrieved buffer sizes are valid for the 1706 * currently requested feature(s). 1707 */ 1708 spin_unlock_irq(¤t->sighand->siglock); 1709 1710 /* 1711 * Try to allocate a new fpstate. If that fails there is no way 1712 * out. 1713 */ 1714 if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu)) 1715 return -EFAULT; 1716 return 0; 1717 } 1718 1719 int xfd_enable_feature(u64 xfd_err) 1720 { 1721 return __xfd_enable_feature(xfd_err, NULL); 1722 } 1723 1724 #else /* CONFIG_X86_64 */ 1725 static inline int xstate_request_perm(unsigned long idx, bool guest) 1726 { 1727 return -EPERM; 1728 } 1729 #endif /* !CONFIG_X86_64 */ 1730 1731 u64 xstate_get_guest_group_perm(void) 1732 { 1733 return xstate_get_group_perm(true); 1734 } 1735 EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); 1736 1737 /** 1738 * fpu_xstate_prctl - xstate permission operations 1739 * @tsk: Redundant pointer to current 1740 * @option: A subfunction of arch_prctl() 1741 * @arg2: option argument 1742 * Return: 0 if successful; otherwise, an error code 1743 * 1744 * Option arguments: 1745 * 1746 * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info 1747 * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info 1748 * ARCH_REQ_XCOMP_PERM: Facility number requested 1749 * 1750 * For facilities which require more than one XSTATE component, the request 1751 * must be the highest state component number related to that facility, 1752 * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and 1753 * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). 1754 */ 1755 long fpu_xstate_prctl(int option, unsigned long arg2) 1756 { 1757 u64 __user *uptr = (u64 __user *)arg2; 1758 u64 permitted, supported; 1759 unsigned long idx = arg2; 1760 bool guest = false; 1761 1762 switch (option) { 1763 case ARCH_GET_XCOMP_SUPP: 1764 supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; 1765 return put_user(supported, uptr); 1766 1767 case ARCH_GET_XCOMP_PERM: 1768 /* 1769 * Lockless snapshot as it can also change right after the 1770 * dropping the lock. 1771 */ 1772 permitted = xstate_get_host_group_perm(); 1773 permitted &= XFEATURE_MASK_USER_SUPPORTED; 1774 return put_user(permitted, uptr); 1775 1776 case ARCH_GET_XCOMP_GUEST_PERM: 1777 permitted = xstate_get_guest_group_perm(); 1778 permitted &= XFEATURE_MASK_USER_SUPPORTED; 1779 return put_user(permitted, uptr); 1780 1781 case ARCH_REQ_XCOMP_GUEST_PERM: 1782 guest = true; 1783 fallthrough; 1784 1785 case ARCH_REQ_XCOMP_PERM: 1786 if (!IS_ENABLED(CONFIG_X86_64)) 1787 return -EOPNOTSUPP; 1788 1789 return xstate_request_perm(idx, guest); 1790 1791 default: 1792 return -EINVAL; 1793 } 1794 } 1795 1796 #ifdef CONFIG_PROC_PID_ARCH_STATUS 1797 /* 1798 * Report the amount of time elapsed in millisecond since last AVX512 1799 * use in the task. 1800 */ 1801 static void avx512_status(struct seq_file *m, struct task_struct *task) 1802 { 1803 unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp); 1804 long delta; 1805 1806 if (!timestamp) { 1807 /* 1808 * Report -1 if no AVX512 usage 1809 */ 1810 delta = -1; 1811 } else { 1812 delta = (long)(jiffies - timestamp); 1813 /* 1814 * Cap to LONG_MAX if time difference > LONG_MAX 1815 */ 1816 if (delta < 0) 1817 delta = LONG_MAX; 1818 delta = jiffies_to_msecs(delta); 1819 } 1820 1821 seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta); 1822 seq_putc(m, '\n'); 1823 } 1824 1825 /* 1826 * Report architecture specific information 1827 */ 1828 int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, 1829 struct pid *pid, struct task_struct *task) 1830 { 1831 /* 1832 * Report AVX512 state if the processor and build option supported. 1833 */ 1834 if (cpu_feature_enabled(X86_FEATURE_AVX512F)) 1835 avx512_status(m, task); 1836 1837 return 0; 1838 } 1839 #endif /* CONFIG_PROC_PID_ARCH_STATUS */ 1840