1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * AMD Memory Encryption Support 4 * 5 * Copyright (C) 2019 SUSE 6 * 7 * Author: Joerg Roedel <jroedel@suse.de> 8 */ 9 10 #define pr_fmt(fmt) "SEV: " fmt 11 12 #include <linux/sched/debug.h> /* For show_regs() */ 13 #include <linux/percpu-defs.h> 14 #include <linux/cc_platform.h> 15 #include <linux/printk.h> 16 #include <linux/mm_types.h> 17 #include <linux/set_memory.h> 18 #include <linux/memblock.h> 19 #include <linux/kernel.h> 20 #include <linux/mm.h> 21 #include <linux/cpumask.h> 22 #include <linux/efi.h> 23 #include <linux/platform_device.h> 24 #include <linux/io.h> 25 #include <linux/psp-sev.h> 26 #include <uapi/linux/sev-guest.h> 27 28 #include <asm/cpu_entry_area.h> 29 #include <asm/stacktrace.h> 30 #include <asm/sev.h> 31 #include <asm/insn-eval.h> 32 #include <asm/fpu/xcr.h> 33 #include <asm/processor.h> 34 #include <asm/realmode.h> 35 #include <asm/setup.h> 36 #include <asm/traps.h> 37 #include <asm/svm.h> 38 #include <asm/smp.h> 39 #include <asm/cpu.h> 40 #include <asm/apic.h> 41 #include <asm/cpuid.h> 42 #include <asm/cmdline.h> 43 44 #define DR7_RESET_VALUE 0x400 45 46 /* AP INIT values as documented in the APM2 section "Processor Initialization State" */ 47 #define AP_INIT_CS_LIMIT 0xffff 48 #define AP_INIT_DS_LIMIT 0xffff 49 #define AP_INIT_LDTR_LIMIT 0xffff 50 #define AP_INIT_GDTR_LIMIT 0xffff 51 #define AP_INIT_IDTR_LIMIT 0xffff 52 #define AP_INIT_TR_LIMIT 0xffff 53 #define AP_INIT_RFLAGS_DEFAULT 0x2 54 #define AP_INIT_DR6_DEFAULT 0xffff0ff0 55 #define AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL 56 #define AP_INIT_XCR0_DEFAULT 0x1 57 #define AP_INIT_X87_FTW_DEFAULT 0x5555 58 #define AP_INIT_X87_FCW_DEFAULT 0x0040 59 #define AP_INIT_CR0_DEFAULT 0x60000010 60 #define AP_INIT_MXCSR_DEFAULT 0x1f80 61 62 /* For early boot hypervisor communication in SEV-ES enabled guests */ 63 static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); 64 65 /* 66 * Needs to be in the .data section because we need it NULL before bss is 67 * cleared 68 */ 69 static struct ghcb *boot_ghcb __section(".data"); 70 71 /* Bitmap of SEV features supported by the hypervisor */ 72 static u64 sev_hv_features __ro_after_init; 73 74 /* #VC handler runtime per-CPU data */ 75 struct sev_es_runtime_data { 76 struct ghcb ghcb_page; 77 78 /* 79 * Reserve one page per CPU as backup storage for the unencrypted GHCB. 80 * It is needed when an NMI happens while the #VC handler uses the real 81 * GHCB, and the NMI handler itself is causing another #VC exception. In 82 * that case the GHCB content of the first handler needs to be backed up 83 * and restored. 84 */ 85 struct ghcb backup_ghcb; 86 87 /* 88 * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. 89 * There is no need for it to be atomic, because nothing is written to 90 * the GHCB between the read and the write of ghcb_active. So it is safe 91 * to use it when a nested #VC exception happens before the write. 92 * 93 * This is necessary for example in the #VC->NMI->#VC case when the NMI 94 * happens while the first #VC handler uses the GHCB. When the NMI code 95 * raises a second #VC handler it might overwrite the contents of the 96 * GHCB written by the first handler. To avoid this the content of the 97 * GHCB is saved and restored when the GHCB is detected to be in use 98 * already. 99 */ 100 bool ghcb_active; 101 bool backup_ghcb_active; 102 103 /* 104 * Cached DR7 value - write it on DR7 writes and return it on reads. 
105 * That value will never make it to the real hardware DR7 as debugging 106 * is currently unsupported in SEV-ES guests. 107 */ 108 unsigned long dr7; 109 }; 110 111 struct ghcb_state { 112 struct ghcb *ghcb; 113 }; 114 115 static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); 116 DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); 117 118 static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); 119 120 struct sev_config { 121 __u64 debug : 1, 122 __reserved : 63; 123 }; 124 125 static struct sev_config sev_cfg __read_mostly; 126 127 static __always_inline bool on_vc_stack(struct pt_regs *regs) 128 { 129 unsigned long sp = regs->sp; 130 131 /* User-mode RSP is not trusted */ 132 if (user_mode(regs)) 133 return false; 134 135 /* SYSCALL gap still has user-mode RSP */ 136 if (ip_within_syscall_gap(regs)) 137 return false; 138 139 return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); 140 } 141 142 /* 143 * This function handles the case when an NMI is raised in the #VC 144 * exception handler entry code, before the #VC handler has switched off 145 * its IST stack. In this case, the IST entry for #VC must be adjusted, 146 * so that any nested #VC exception will not overwrite the stack 147 * contents of the interrupted #VC handler. 148 * 149 * The IST entry is adjusted unconditionally so that it can be also be 150 * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a 151 * nested sev_es_ist_exit() call may adjust back the IST entry too 152 * early. 153 * 154 * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run 155 * on the NMI IST stack, as they are only called from NMI handling code 156 * right now. 157 */ 158 void noinstr __sev_es_ist_enter(struct pt_regs *regs) 159 { 160 unsigned long old_ist, new_ist; 161 162 /* Read old IST entry */ 163 new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); 164 165 /* 166 * If NMI happened while on the #VC IST stack, set the new IST 167 * value below regs->sp, so that the interrupted stack frame is 168 * not overwritten by subsequent #VC exceptions. 169 */ 170 if (on_vc_stack(regs)) 171 new_ist = regs->sp; 172 173 /* 174 * Reserve additional 8 bytes and store old IST value so this 175 * adjustment can be unrolled in __sev_es_ist_exit(). 176 */ 177 new_ist -= sizeof(old_ist); 178 *(unsigned long *)new_ist = old_ist; 179 180 /* Set new IST entry */ 181 this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist); 182 } 183 184 void noinstr __sev_es_ist_exit(void) 185 { 186 unsigned long ist; 187 188 /* Read IST entry */ 189 ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); 190 191 if (WARN_ON(ist == __this_cpu_ist_top_va(VC))) 192 return; 193 194 /* Read back old IST entry and write it to the TSS */ 195 this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); 196 } 197 198 /* 199 * Nothing shall interrupt this code path while holding the per-CPU 200 * GHCB. The backup GHCB is only for NMIs interrupting this path. 201 * 202 * Callers must disable local interrupts around it. 203 */ 204 static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) 205 { 206 struct sev_es_runtime_data *data; 207 struct ghcb *ghcb; 208 209 WARN_ON(!irqs_disabled()); 210 211 data = this_cpu_read(runtime_data); 212 ghcb = &data->ghcb_page; 213 214 if (unlikely(data->ghcb_active)) { 215 /* GHCB is already in use - save its contents */ 216 217 if (unlikely(data->backup_ghcb_active)) { 218 /* 219 * Backup-GHCB is also already in use. 
There is no way 220 * to continue here so just kill the machine. To make 221 * panic() work, mark GHCBs inactive so that messages 222 * can be printed out. 223 */ 224 data->ghcb_active = false; 225 data->backup_ghcb_active = false; 226 227 instrumentation_begin(); 228 panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); 229 instrumentation_end(); 230 } 231 232 /* Mark backup_ghcb active before writing to it */ 233 data->backup_ghcb_active = true; 234 235 state->ghcb = &data->backup_ghcb; 236 237 /* Backup GHCB content */ 238 *state->ghcb = *ghcb; 239 } else { 240 state->ghcb = NULL; 241 data->ghcb_active = true; 242 } 243 244 return ghcb; 245 } 246 247 static inline u64 sev_es_rd_ghcb_msr(void) 248 { 249 return __rdmsr(MSR_AMD64_SEV_ES_GHCB); 250 } 251 252 static __always_inline void sev_es_wr_ghcb_msr(u64 val) 253 { 254 u32 low, high; 255 256 low = (u32)(val); 257 high = (u32)(val >> 32); 258 259 native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); 260 } 261 262 static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, 263 unsigned char *buffer) 264 { 265 return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); 266 } 267 268 static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) 269 { 270 char buffer[MAX_INSN_SIZE]; 271 int insn_bytes; 272 273 insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer); 274 if (insn_bytes == 0) { 275 /* Nothing could be copied */ 276 ctxt->fi.vector = X86_TRAP_PF; 277 ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; 278 ctxt->fi.cr2 = ctxt->regs->ip; 279 return ES_EXCEPTION; 280 } else if (insn_bytes == -EINVAL) { 281 /* Effective RIP could not be calculated */ 282 ctxt->fi.vector = X86_TRAP_GP; 283 ctxt->fi.error_code = 0; 284 ctxt->fi.cr2 = 0; 285 return ES_EXCEPTION; 286 } 287 288 if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes)) 289 return ES_DECODE_FAILED; 290 291 if (ctxt->insn.immediate.got) 292 return ES_OK; 293 else 294 return ES_DECODE_FAILED; 295 } 296 297 static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt) 298 { 299 char buffer[MAX_INSN_SIZE]; 300 int res, ret; 301 302 res = vc_fetch_insn_kernel(ctxt, buffer); 303 if (res) { 304 ctxt->fi.vector = X86_TRAP_PF; 305 ctxt->fi.error_code = X86_PF_INSTR; 306 ctxt->fi.cr2 = ctxt->regs->ip; 307 return ES_EXCEPTION; 308 } 309 310 ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); 311 if (ret < 0) 312 return ES_DECODE_FAILED; 313 else 314 return ES_OK; 315 } 316 317 static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) 318 { 319 if (user_mode(ctxt->regs)) 320 return __vc_decode_user_insn(ctxt); 321 else 322 return __vc_decode_kern_insn(ctxt); 323 } 324 325 static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, 326 char *dst, char *buf, size_t size) 327 { 328 unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; 329 330 /* 331 * This function uses __put_user() independent of whether kernel or user 332 * memory is accessed. This works fine because __put_user() does no 333 * sanity checks of the pointer being accessed. All that it does is 334 * to report when the access failed. 335 * 336 * Also, this function runs in atomic context, so __put_user() is not 337 * allowed to sleep. The page-fault handler detects that it is running 338 * in atomic context and will not try to take mmap_sem and handle the 339 * fault, so additional pagefault_enable()/disable() calls are not 340 * needed. 
341 * 342 * The access can't be done via copy_to_user() here because 343 * vc_write_mem() must not use string instructions to access unsafe 344 * memory. The reason is that MOVS is emulated by the #VC handler by 345 * splitting the move up into a read and a write and taking a nested #VC 346 * exception on whatever of them is the MMIO access. Using string 347 * instructions here would cause infinite nesting. 348 */ 349 switch (size) { 350 case 1: { 351 u8 d1; 352 u8 __user *target = (u8 __user *)dst; 353 354 memcpy(&d1, buf, 1); 355 if (__put_user(d1, target)) 356 goto fault; 357 break; 358 } 359 case 2: { 360 u16 d2; 361 u16 __user *target = (u16 __user *)dst; 362 363 memcpy(&d2, buf, 2); 364 if (__put_user(d2, target)) 365 goto fault; 366 break; 367 } 368 case 4: { 369 u32 d4; 370 u32 __user *target = (u32 __user *)dst; 371 372 memcpy(&d4, buf, 4); 373 if (__put_user(d4, target)) 374 goto fault; 375 break; 376 } 377 case 8: { 378 u64 d8; 379 u64 __user *target = (u64 __user *)dst; 380 381 memcpy(&d8, buf, 8); 382 if (__put_user(d8, target)) 383 goto fault; 384 break; 385 } 386 default: 387 WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); 388 return ES_UNSUPPORTED; 389 } 390 391 return ES_OK; 392 393 fault: 394 if (user_mode(ctxt->regs)) 395 error_code |= X86_PF_USER; 396 397 ctxt->fi.vector = X86_TRAP_PF; 398 ctxt->fi.error_code = error_code; 399 ctxt->fi.cr2 = (unsigned long)dst; 400 401 return ES_EXCEPTION; 402 } 403 404 static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, 405 char *src, char *buf, size_t size) 406 { 407 unsigned long error_code = X86_PF_PROT; 408 409 /* 410 * This function uses __get_user() independent of whether kernel or user 411 * memory is accessed. This works fine because __get_user() does no 412 * sanity checks of the pointer being accessed. All that it does is 413 * to report when the access failed. 414 * 415 * Also, this function runs in atomic context, so __get_user() is not 416 * allowed to sleep. The page-fault handler detects that it is running 417 * in atomic context and will not try to take mmap_sem and handle the 418 * fault, so additional pagefault_enable()/disable() calls are not 419 * needed. 420 * 421 * The access can't be done via copy_from_user() here because 422 * vc_read_mem() must not use string instructions to access unsafe 423 * memory. The reason is that MOVS is emulated by the #VC handler by 424 * splitting the move up into a read and a write and taking a nested #VC 425 * exception on whatever of them is the MMIO access. Using string 426 * instructions here would cause infinite nesting. 
427 */ 428 switch (size) { 429 case 1: { 430 u8 d1; 431 u8 __user *s = (u8 __user *)src; 432 433 if (__get_user(d1, s)) 434 goto fault; 435 memcpy(buf, &d1, 1); 436 break; 437 } 438 case 2: { 439 u16 d2; 440 u16 __user *s = (u16 __user *)src; 441 442 if (__get_user(d2, s)) 443 goto fault; 444 memcpy(buf, &d2, 2); 445 break; 446 } 447 case 4: { 448 u32 d4; 449 u32 __user *s = (u32 __user *)src; 450 451 if (__get_user(d4, s)) 452 goto fault; 453 memcpy(buf, &d4, 4); 454 break; 455 } 456 case 8: { 457 u64 d8; 458 u64 __user *s = (u64 __user *)src; 459 if (__get_user(d8, s)) 460 goto fault; 461 memcpy(buf, &d8, 8); 462 break; 463 } 464 default: 465 WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); 466 return ES_UNSUPPORTED; 467 } 468 469 return ES_OK; 470 471 fault: 472 if (user_mode(ctxt->regs)) 473 error_code |= X86_PF_USER; 474 475 ctxt->fi.vector = X86_TRAP_PF; 476 ctxt->fi.error_code = error_code; 477 ctxt->fi.cr2 = (unsigned long)src; 478 479 return ES_EXCEPTION; 480 } 481 482 static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, 483 unsigned long vaddr, phys_addr_t *paddr) 484 { 485 unsigned long va = (unsigned long)vaddr; 486 unsigned int level; 487 phys_addr_t pa; 488 pgd_t *pgd; 489 pte_t *pte; 490 491 pgd = __va(read_cr3_pa()); 492 pgd = &pgd[pgd_index(va)]; 493 pte = lookup_address_in_pgd(pgd, va, &level); 494 if (!pte) { 495 ctxt->fi.vector = X86_TRAP_PF; 496 ctxt->fi.cr2 = vaddr; 497 ctxt->fi.error_code = 0; 498 499 if (user_mode(ctxt->regs)) 500 ctxt->fi.error_code |= X86_PF_USER; 501 502 return ES_EXCEPTION; 503 } 504 505 if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC)) 506 /* Emulated MMIO to/from encrypted memory not supported */ 507 return ES_UNSUPPORTED; 508 509 pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; 510 pa |= va & ~page_level_mask(level); 511 512 *paddr = pa; 513 514 return ES_OK; 515 } 516 517 /* Include code shared with pre-decompression boot stage */ 518 #include "sev-shared.c" 519 520 static noinstr void __sev_put_ghcb(struct ghcb_state *state) 521 { 522 struct sev_es_runtime_data *data; 523 struct ghcb *ghcb; 524 525 WARN_ON(!irqs_disabled()); 526 527 data = this_cpu_read(runtime_data); 528 ghcb = &data->ghcb_page; 529 530 if (state->ghcb) { 531 /* Restore GHCB from Backup */ 532 *ghcb = *state->ghcb; 533 data->backup_ghcb_active = false; 534 state->ghcb = NULL; 535 } else { 536 /* 537 * Invalidate the GHCB so a VMGEXIT instruction issued 538 * from userspace won't appear to be valid. 539 */ 540 vc_ghcb_invalidate(ghcb); 541 data->ghcb_active = false; 542 } 543 } 544 545 void noinstr __sev_es_nmi_complete(void) 546 { 547 struct ghcb_state state; 548 struct ghcb *ghcb; 549 550 ghcb = __sev_get_ghcb(&state); 551 552 vc_ghcb_invalidate(ghcb); 553 ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); 554 ghcb_set_sw_exit_info_1(ghcb, 0); 555 ghcb_set_sw_exit_info_2(ghcb, 0); 556 557 sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); 558 VMGEXIT(); 559 560 __sev_put_ghcb(&state); 561 } 562 563 static u64 __init get_secrets_page(void) 564 { 565 u64 pa_data = boot_params.cc_blob_address; 566 struct cc_blob_sev_info info; 567 void *map; 568 569 /* 570 * The CC blob contains the address of the secrets page, check if the 571 * blob is present. 
572 */ 573 if (!pa_data) 574 return 0; 575 576 map = early_memremap(pa_data, sizeof(info)); 577 if (!map) { 578 pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n"); 579 return 0; 580 } 581 memcpy(&info, map, sizeof(info)); 582 early_memunmap(map, sizeof(info)); 583 584 /* smoke-test the secrets page passed */ 585 if (!info.secrets_phys || info.secrets_len != PAGE_SIZE) 586 return 0; 587 588 return info.secrets_phys; 589 } 590 591 static u64 __init get_snp_jump_table_addr(void) 592 { 593 struct snp_secrets_page_layout *layout; 594 void __iomem *mem; 595 u64 pa, addr; 596 597 pa = get_secrets_page(); 598 if (!pa) 599 return 0; 600 601 mem = ioremap_encrypted(pa, PAGE_SIZE); 602 if (!mem) { 603 pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n"); 604 return 0; 605 } 606 607 layout = (__force struct snp_secrets_page_layout *)mem; 608 609 addr = layout->os_area.ap_jump_table_pa; 610 iounmap(mem); 611 612 return addr; 613 } 614 615 static u64 __init get_jump_table_addr(void) 616 { 617 struct ghcb_state state; 618 unsigned long flags; 619 struct ghcb *ghcb; 620 u64 ret = 0; 621 622 if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 623 return get_snp_jump_table_addr(); 624 625 local_irq_save(flags); 626 627 ghcb = __sev_get_ghcb(&state); 628 629 vc_ghcb_invalidate(ghcb); 630 ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); 631 ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE); 632 ghcb_set_sw_exit_info_2(ghcb, 0); 633 634 sev_es_wr_ghcb_msr(__pa(ghcb)); 635 VMGEXIT(); 636 637 if (ghcb_sw_exit_info_1_is_valid(ghcb) && 638 ghcb_sw_exit_info_2_is_valid(ghcb)) 639 ret = ghcb->save.sw_exit_info_2; 640 641 __sev_put_ghcb(&state); 642 643 local_irq_restore(flags); 644 645 return ret; 646 } 647 648 static void pvalidate_pages(unsigned long vaddr, unsigned int npages, bool validate) 649 { 650 unsigned long vaddr_end; 651 int rc; 652 653 vaddr = vaddr & PAGE_MASK; 654 vaddr_end = vaddr + (npages << PAGE_SHIFT); 655 656 while (vaddr < vaddr_end) { 657 rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); 658 if (WARN(rc, "Failed to validate address 0x%lx ret %d", vaddr, rc)) 659 sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); 660 661 vaddr = vaddr + PAGE_SIZE; 662 } 663 } 664 665 static void __init early_set_pages_state(unsigned long paddr, unsigned int npages, enum psc_op op) 666 { 667 unsigned long paddr_end; 668 u64 val; 669 670 paddr = paddr & PAGE_MASK; 671 paddr_end = paddr + (npages << PAGE_SHIFT); 672 673 while (paddr < paddr_end) { 674 /* 675 * Use the MSR protocol because this function can be called before 676 * the GHCB is established. 677 */ 678 sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); 679 VMGEXIT(); 680 681 val = sev_es_rd_ghcb_msr(); 682 683 if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP, 684 "Wrong PSC response code: 0x%x\n", 685 (unsigned int)GHCB_RESP_CODE(val))) 686 goto e_term; 687 688 if (WARN(GHCB_MSR_PSC_RESP_VAL(val), 689 "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n", 690 op == SNP_PAGE_STATE_PRIVATE ? 
"private" : "shared", 691 paddr, GHCB_MSR_PSC_RESP_VAL(val))) 692 goto e_term; 693 694 paddr = paddr + PAGE_SIZE; 695 } 696 697 return; 698 699 e_term: 700 sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); 701 } 702 703 void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, 704 unsigned int npages) 705 { 706 /* 707 * This can be invoked in early boot while running identity mapped, so 708 * use an open coded check for SNP instead of using cc_platform_has(). 709 * This eliminates worries about jump tables or checking boot_cpu_data 710 * in the cc_platform_has() function. 711 */ 712 if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) 713 return; 714 715 /* 716 * Ask the hypervisor to mark the memory pages as private in the RMP 717 * table. 718 */ 719 early_set_pages_state(paddr, npages, SNP_PAGE_STATE_PRIVATE); 720 721 /* Validate the memory pages after they've been added in the RMP table. */ 722 pvalidate_pages(vaddr, npages, true); 723 } 724 725 void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, 726 unsigned int npages) 727 { 728 /* 729 * This can be invoked in early boot while running identity mapped, so 730 * use an open coded check for SNP instead of using cc_platform_has(). 731 * This eliminates worries about jump tables or checking boot_cpu_data 732 * in the cc_platform_has() function. 733 */ 734 if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) 735 return; 736 737 /* Invalidate the memory pages before they are marked shared in the RMP table. */ 738 pvalidate_pages(vaddr, npages, false); 739 740 /* Ask hypervisor to mark the memory pages shared in the RMP table. */ 741 early_set_pages_state(paddr, npages, SNP_PAGE_STATE_SHARED); 742 } 743 744 void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) 745 { 746 unsigned long vaddr, npages; 747 748 vaddr = (unsigned long)__va(paddr); 749 npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; 750 751 if (op == SNP_PAGE_STATE_PRIVATE) 752 early_snp_set_memory_private(vaddr, paddr, npages); 753 else if (op == SNP_PAGE_STATE_SHARED) 754 early_snp_set_memory_shared(vaddr, paddr, npages); 755 else 756 WARN(1, "invalid memory op %d\n", op); 757 } 758 759 static int vmgexit_psc(struct snp_psc_desc *desc) 760 { 761 int cur_entry, end_entry, ret = 0; 762 struct snp_psc_desc *data; 763 struct ghcb_state state; 764 struct es_em_ctxt ctxt; 765 unsigned long flags; 766 struct ghcb *ghcb; 767 768 /* 769 * __sev_get_ghcb() needs to run with IRQs disabled because it is using 770 * a per-CPU GHCB. 771 */ 772 local_irq_save(flags); 773 774 ghcb = __sev_get_ghcb(&state); 775 if (!ghcb) { 776 ret = 1; 777 goto out_unlock; 778 } 779 780 /* Copy the input desc into GHCB shared buffer */ 781 data = (struct snp_psc_desc *)ghcb->shared_buffer; 782 memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); 783 784 /* 785 * As per the GHCB specification, the hypervisor can resume the guest 786 * before processing all the entries. Check whether all the entries 787 * are processed. If not, then keep retrying. Note, the hypervisor 788 * will update the data memory directly to indicate the status, so 789 * reference the data->hdr everywhere. 790 * 791 * The strategy here is to wait for the hypervisor to change the page 792 * state in the RMP table before guest accesses the memory pages. If the 793 * page state change was not successful, then later memory access will 794 * result in a crash. 
795 */ 796 cur_entry = data->hdr.cur_entry; 797 end_entry = data->hdr.end_entry; 798 799 while (data->hdr.cur_entry <= data->hdr.end_entry) { 800 ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); 801 802 /* This will advance the shared buffer data points to. */ 803 ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0); 804 805 /* 806 * Page State Change VMGEXIT can pass error code through 807 * exit_info_2. 808 */ 809 if (WARN(ret || ghcb->save.sw_exit_info_2, 810 "SNP: PSC failed ret=%d exit_info_2=%llx\n", 811 ret, ghcb->save.sw_exit_info_2)) { 812 ret = 1; 813 goto out; 814 } 815 816 /* Verify that reserved bit is not set */ 817 if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { 818 ret = 1; 819 goto out; 820 } 821 822 /* 823 * Sanity check that entry processing is not going backwards. 824 * This will happen only if hypervisor is tricking us. 825 */ 826 if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, 827 "SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", 828 end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { 829 ret = 1; 830 goto out; 831 } 832 } 833 834 out: 835 __sev_put_ghcb(&state); 836 837 out_unlock: 838 local_irq_restore(flags); 839 840 return ret; 841 } 842 843 static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, 844 unsigned long vaddr_end, int op) 845 { 846 struct psc_hdr *hdr; 847 struct psc_entry *e; 848 unsigned long pfn; 849 int i; 850 851 hdr = &data->hdr; 852 e = data->entries; 853 854 memset(data, 0, sizeof(*data)); 855 i = 0; 856 857 while (vaddr < vaddr_end) { 858 if (is_vmalloc_addr((void *)vaddr)) 859 pfn = vmalloc_to_pfn((void *)vaddr); 860 else 861 pfn = __pa(vaddr) >> PAGE_SHIFT; 862 863 e->gfn = pfn; 864 e->operation = op; 865 hdr->end_entry = i; 866 867 /* 868 * Current SNP implementation doesn't keep track of the RMP page 869 * size so use 4K for simplicity. 870 */ 871 e->pagesize = RMP_PG_SIZE_4K; 872 873 vaddr = vaddr + PAGE_SIZE; 874 e++; 875 i++; 876 } 877 878 if (vmgexit_psc(data)) 879 sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); 880 } 881 882 static void set_pages_state(unsigned long vaddr, unsigned int npages, int op) 883 { 884 unsigned long vaddr_end, next_vaddr; 885 struct snp_psc_desc *desc; 886 887 desc = kmalloc(sizeof(*desc), GFP_KERNEL_ACCOUNT); 888 if (!desc) 889 panic("SNP: failed to allocate memory for PSC descriptor\n"); 890 891 vaddr = vaddr & PAGE_MASK; 892 vaddr_end = vaddr + (npages << PAGE_SHIFT); 893 894 while (vaddr < vaddr_end) { 895 /* Calculate the last vaddr that fits in one struct snp_psc_desc. 
*/ 896 next_vaddr = min_t(unsigned long, vaddr_end, 897 (VMGEXIT_PSC_MAX_ENTRY * PAGE_SIZE) + vaddr); 898 899 __set_pages_state(desc, vaddr, next_vaddr, op); 900 901 vaddr = next_vaddr; 902 } 903 904 kfree(desc); 905 } 906 907 void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) 908 { 909 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 910 return; 911 912 pvalidate_pages(vaddr, npages, false); 913 914 set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED); 915 } 916 917 void snp_set_memory_private(unsigned long vaddr, unsigned int npages) 918 { 919 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 920 return; 921 922 set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); 923 924 pvalidate_pages(vaddr, npages, true); 925 } 926 927 static int snp_set_vmsa(void *va, bool vmsa) 928 { 929 u64 attrs; 930 931 /* 932 * Running at VMPL0 allows the kernel to change the VMSA bit for a page 933 * using the RMPADJUST instruction. However, for the instruction to 934 * succeed it must target the permissions of a lesser privileged 935 * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST 936 * instruction in the AMD64 APM Volume 3). 937 */ 938 attrs = 1; 939 if (vmsa) 940 attrs |= RMPADJUST_VMSA_PAGE_BIT; 941 942 return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); 943 } 944 945 #define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK) 946 #define INIT_CS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK) 947 #define INIT_DS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_WRITE_MASK) 948 949 #define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2) 950 #define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3) 951 952 static void *snp_alloc_vmsa_page(void) 953 { 954 struct page *p; 955 956 /* 957 * Allocate VMSA page to work around the SNP erratum where the CPU will 958 * incorrectly signal an RMP violation #PF if a large page (2MB or 1GB) 959 * collides with the RMP entry of VMSA page. The recommended workaround 960 * is to not use a large page. 961 * 962 * Allocate an 8k page which is also 8k-aligned. 963 */ 964 p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1); 965 if (!p) 966 return NULL; 967 968 split_page(p, 1); 969 970 /* Free the first 4k. This page may be 2M/1G aligned and cannot be used. */ 971 __free_page(p); 972 973 return page_address(p + 1); 974 } 975 976 static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) 977 { 978 int err; 979 980 err = snp_set_vmsa(vmsa, false); 981 if (err) 982 pr_err("clear VMSA page failed (%u), leaking page\n", err); 983 else 984 free_page((unsigned long)vmsa); 985 } 986 987 static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip) 988 { 989 struct sev_es_save_area *cur_vmsa, *vmsa; 990 struct ghcb_state state; 991 unsigned long flags; 992 struct ghcb *ghcb; 993 u8 sipi_vector; 994 int cpu, ret; 995 u64 cr4; 996 997 /* 998 * The hypervisor SNP feature support check has happened earlier, just check 999 * the AP_CREATION one here. 1000 */ 1001 if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION)) 1002 return -EOPNOTSUPP; 1003 1004 /* 1005 * Verify the desired start IP against the known trampoline start IP 1006 * to catch any future new trampolines that may be introduced that 1007 * would require a new protected guest entry point. 
1008 */ 1009 if (WARN_ONCE(start_ip != real_mode_header->trampoline_start, 1010 "Unsupported SNP start_ip: %lx\n", start_ip)) 1011 return -EINVAL; 1012 1013 /* Override start_ip with known protected guest start IP */ 1014 start_ip = real_mode_header->sev_es_trampoline_start; 1015 1016 /* Find the logical CPU for the APIC ID */ 1017 for_each_present_cpu(cpu) { 1018 if (arch_match_cpu_phys_id(cpu, apic_id)) 1019 break; 1020 } 1021 if (cpu >= nr_cpu_ids) 1022 return -EINVAL; 1023 1024 cur_vmsa = per_cpu(sev_vmsa, cpu); 1025 1026 /* 1027 * A new VMSA is created each time because there is no guarantee that 1028 * the current VMSA is the kernels or that the vCPU is not running. If 1029 * an attempt was done to use the current VMSA with a running vCPU, a 1030 * #VMEXIT of that vCPU would wipe out all of the settings being done 1031 * here. 1032 */ 1033 vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page(); 1034 if (!vmsa) 1035 return -ENOMEM; 1036 1037 /* CR4 should maintain the MCE value */ 1038 cr4 = native_read_cr4() & X86_CR4_MCE; 1039 1040 /* Set the CS value based on the start_ip converted to a SIPI vector */ 1041 sipi_vector = (start_ip >> 12); 1042 vmsa->cs.base = sipi_vector << 12; 1043 vmsa->cs.limit = AP_INIT_CS_LIMIT; 1044 vmsa->cs.attrib = INIT_CS_ATTRIBS; 1045 vmsa->cs.selector = sipi_vector << 8; 1046 1047 /* Set the RIP value based on start_ip */ 1048 vmsa->rip = start_ip & 0xfff; 1049 1050 /* Set AP INIT defaults as documented in the APM */ 1051 vmsa->ds.limit = AP_INIT_DS_LIMIT; 1052 vmsa->ds.attrib = INIT_DS_ATTRIBS; 1053 vmsa->es = vmsa->ds; 1054 vmsa->fs = vmsa->ds; 1055 vmsa->gs = vmsa->ds; 1056 vmsa->ss = vmsa->ds; 1057 1058 vmsa->gdtr.limit = AP_INIT_GDTR_LIMIT; 1059 vmsa->ldtr.limit = AP_INIT_LDTR_LIMIT; 1060 vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS; 1061 vmsa->idtr.limit = AP_INIT_IDTR_LIMIT; 1062 vmsa->tr.limit = AP_INIT_TR_LIMIT; 1063 vmsa->tr.attrib = INIT_TR_ATTRIBS; 1064 1065 vmsa->cr4 = cr4; 1066 vmsa->cr0 = AP_INIT_CR0_DEFAULT; 1067 vmsa->dr7 = DR7_RESET_VALUE; 1068 vmsa->dr6 = AP_INIT_DR6_DEFAULT; 1069 vmsa->rflags = AP_INIT_RFLAGS_DEFAULT; 1070 vmsa->g_pat = AP_INIT_GPAT_DEFAULT; 1071 vmsa->xcr0 = AP_INIT_XCR0_DEFAULT; 1072 vmsa->mxcsr = AP_INIT_MXCSR_DEFAULT; 1073 vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT; 1074 vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT; 1075 1076 /* SVME must be set. 
*/ 1077 vmsa->efer = EFER_SVME; 1078 1079 /* 1080 * Set the SNP-specific fields for this VMSA: 1081 * VMPL level 1082 * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits) 1083 */ 1084 vmsa->vmpl = 0; 1085 vmsa->sev_features = sev_status >> 2; 1086 1087 /* Switch the page over to a VMSA page now that it is initialized */ 1088 ret = snp_set_vmsa(vmsa, true); 1089 if (ret) { 1090 pr_err("set VMSA page failed (%u)\n", ret); 1091 free_page((unsigned long)vmsa); 1092 1093 return -EINVAL; 1094 } 1095 1096 /* Issue VMGEXIT AP Creation NAE event */ 1097 local_irq_save(flags); 1098 1099 ghcb = __sev_get_ghcb(&state); 1100 1101 vc_ghcb_invalidate(ghcb); 1102 ghcb_set_rax(ghcb, vmsa->sev_features); 1103 ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION); 1104 ghcb_set_sw_exit_info_1(ghcb, ((u64)apic_id << 32) | SVM_VMGEXIT_AP_CREATE); 1105 ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa)); 1106 1107 sev_es_wr_ghcb_msr(__pa(ghcb)); 1108 VMGEXIT(); 1109 1110 if (!ghcb_sw_exit_info_1_is_valid(ghcb) || 1111 lower_32_bits(ghcb->save.sw_exit_info_1)) { 1112 pr_err("SNP AP Creation error\n"); 1113 ret = -EINVAL; 1114 } 1115 1116 __sev_put_ghcb(&state); 1117 1118 local_irq_restore(flags); 1119 1120 /* Perform cleanup if there was an error */ 1121 if (ret) { 1122 snp_cleanup_vmsa(vmsa); 1123 vmsa = NULL; 1124 } 1125 1126 /* Free up any previous VMSA page */ 1127 if (cur_vmsa) 1128 snp_cleanup_vmsa(cur_vmsa); 1129 1130 /* Record the current VMSA page */ 1131 per_cpu(sev_vmsa, cpu) = vmsa; 1132 1133 return ret; 1134 } 1135 1136 void snp_set_wakeup_secondary_cpu(void) 1137 { 1138 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 1139 return; 1140 1141 /* 1142 * Always set this override if SNP is enabled. This makes it the 1143 * required method to start APs under SNP. If the hypervisor does 1144 * not support AP creation, then no APs will be started. 1145 */ 1146 apic->wakeup_secondary_cpu = wakeup_cpu_via_vmgexit; 1147 } 1148 1149 int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh) 1150 { 1151 u16 startup_cs, startup_ip; 1152 phys_addr_t jump_table_pa; 1153 u64 jump_table_addr; 1154 u16 __iomem *jump_table; 1155 1156 jump_table_addr = get_jump_table_addr(); 1157 1158 /* On UP guests there is no jump table so this is not a failure */ 1159 if (!jump_table_addr) 1160 return 0; 1161 1162 /* Check if AP Jump Table is page-aligned */ 1163 if (jump_table_addr & ~PAGE_MASK) 1164 return -EINVAL; 1165 1166 jump_table_pa = jump_table_addr & PAGE_MASK; 1167 1168 startup_cs = (u16)(rmh->trampoline_start >> 4); 1169 startup_ip = (u16)(rmh->sev_es_trampoline_start - 1170 rmh->trampoline_start); 1171 1172 jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE); 1173 if (!jump_table) 1174 return -EIO; 1175 1176 writew(startup_ip, &jump_table[0]); 1177 writew(startup_cs, &jump_table[1]); 1178 1179 iounmap(jump_table); 1180 1181 return 0; 1182 } 1183 1184 /* 1185 * This is needed by the OVMF UEFI firmware which will use whatever it finds in 1186 * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu 1187 * runtime GHCBs used by the kernel are also mapped in the EFI page-table. 
1188 */ 1189 int __init sev_es_efi_map_ghcbs(pgd_t *pgd) 1190 { 1191 struct sev_es_runtime_data *data; 1192 unsigned long address, pflags; 1193 int cpu; 1194 u64 pfn; 1195 1196 if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) 1197 return 0; 1198 1199 pflags = _PAGE_NX | _PAGE_RW; 1200 1201 for_each_possible_cpu(cpu) { 1202 data = per_cpu(runtime_data, cpu); 1203 1204 address = __pa(&data->ghcb_page); 1205 pfn = address >> PAGE_SHIFT; 1206 1207 if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags)) 1208 return 1; 1209 } 1210 1211 return 0; 1212 } 1213 1214 static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) 1215 { 1216 struct pt_regs *regs = ctxt->regs; 1217 enum es_result ret; 1218 u64 exit_info_1; 1219 1220 /* Is it a WRMSR? */ 1221 exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0; 1222 1223 ghcb_set_rcx(ghcb, regs->cx); 1224 if (exit_info_1) { 1225 ghcb_set_rax(ghcb, regs->ax); 1226 ghcb_set_rdx(ghcb, regs->dx); 1227 } 1228 1229 ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0); 1230 1231 if ((ret == ES_OK) && (!exit_info_1)) { 1232 regs->ax = ghcb->save.rax; 1233 regs->dx = ghcb->save.rdx; 1234 } 1235 1236 return ret; 1237 } 1238 1239 static void snp_register_per_cpu_ghcb(void) 1240 { 1241 struct sev_es_runtime_data *data; 1242 struct ghcb *ghcb; 1243 1244 data = this_cpu_read(runtime_data); 1245 ghcb = &data->ghcb_page; 1246 1247 snp_register_ghcb_early(__pa(ghcb)); 1248 } 1249 1250 void setup_ghcb(void) 1251 { 1252 if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) 1253 return; 1254 1255 /* First make sure the hypervisor talks a supported protocol. */ 1256 if (!sev_es_negotiate_protocol()) 1257 sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); 1258 1259 /* 1260 * Check whether the runtime #VC exception handler is active. It uses 1261 * the per-CPU GHCB page which is set up by sev_es_init_vc_handling(). 1262 * 1263 * If SNP is active, register the per-CPU GHCB page so that the runtime 1264 * exception handler can use it. 1265 */ 1266 if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) { 1267 if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 1268 snp_register_per_cpu_ghcb(); 1269 1270 return; 1271 } 1272 1273 /* 1274 * Clear the boot_ghcb. The first exception comes in before the bss 1275 * section is cleared. 1276 */ 1277 memset(&boot_ghcb_page, 0, PAGE_SIZE); 1278 1279 /* Alright - Make the boot-ghcb public */ 1280 boot_ghcb = &boot_ghcb_page; 1281 1282 /* SNP guest requires that GHCB GPA must be registered. */ 1283 if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 1284 snp_register_ghcb_early(__pa(&boot_ghcb_page)); 1285 } 1286 1287 #ifdef CONFIG_HOTPLUG_CPU 1288 static void sev_es_ap_hlt_loop(void) 1289 { 1290 struct ghcb_state state; 1291 struct ghcb *ghcb; 1292 1293 ghcb = __sev_get_ghcb(&state); 1294 1295 while (true) { 1296 vc_ghcb_invalidate(ghcb); 1297 ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP); 1298 ghcb_set_sw_exit_info_1(ghcb, 0); 1299 ghcb_set_sw_exit_info_2(ghcb, 0); 1300 1301 sev_es_wr_ghcb_msr(__pa(ghcb)); 1302 VMGEXIT(); 1303 1304 /* Wakeup signal? */ 1305 if (ghcb_sw_exit_info_2_is_valid(ghcb) && 1306 ghcb->save.sw_exit_info_2) 1307 break; 1308 } 1309 1310 __sev_put_ghcb(&state); 1311 } 1312 1313 /* 1314 * Play_dead handler when running under SEV-ES. This is needed because 1315 * the hypervisor can't deliver an SIPI request to restart the AP. 1316 * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the 1317 * hypervisor wakes it up again. 
1318 */ 1319 static void sev_es_play_dead(void) 1320 { 1321 play_dead_common(); 1322 1323 /* IRQs now disabled */ 1324 1325 sev_es_ap_hlt_loop(); 1326 1327 /* 1328 * If we get here, the VCPU was woken up again. Jump to CPU 1329 * startup code to get it back online. 1330 */ 1331 start_cpu0(); 1332 } 1333 #else /* CONFIG_HOTPLUG_CPU */ 1334 #define sev_es_play_dead native_play_dead 1335 #endif /* CONFIG_HOTPLUG_CPU */ 1336 1337 #ifdef CONFIG_SMP 1338 static void __init sev_es_setup_play_dead(void) 1339 { 1340 smp_ops.play_dead = sev_es_play_dead; 1341 } 1342 #else 1343 static inline void sev_es_setup_play_dead(void) { } 1344 #endif 1345 1346 static void __init alloc_runtime_data(int cpu) 1347 { 1348 struct sev_es_runtime_data *data; 1349 1350 data = memblock_alloc(sizeof(*data), PAGE_SIZE); 1351 if (!data) 1352 panic("Can't allocate SEV-ES runtime data"); 1353 1354 per_cpu(runtime_data, cpu) = data; 1355 } 1356 1357 static void __init init_ghcb(int cpu) 1358 { 1359 struct sev_es_runtime_data *data; 1360 int err; 1361 1362 data = per_cpu(runtime_data, cpu); 1363 1364 err = early_set_memory_decrypted((unsigned long)&data->ghcb_page, 1365 sizeof(data->ghcb_page)); 1366 if (err) 1367 panic("Can't map GHCBs unencrypted"); 1368 1369 memset(&data->ghcb_page, 0, sizeof(data->ghcb_page)); 1370 1371 data->ghcb_active = false; 1372 data->backup_ghcb_active = false; 1373 } 1374 1375 void __init sev_es_init_vc_handling(void) 1376 { 1377 int cpu; 1378 1379 BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE); 1380 1381 if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) 1382 return; 1383 1384 if (!sev_es_check_cpu_features()) 1385 panic("SEV-ES CPU Features missing"); 1386 1387 /* 1388 * SNP is supported in v2 of the GHCB spec which mandates support for HV 1389 * features. 1390 */ 1391 if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) { 1392 sev_hv_features = get_hv_features(); 1393 1394 if (!(sev_hv_features & GHCB_HV_FT_SNP)) 1395 sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); 1396 } 1397 1398 /* Enable SEV-ES special handling */ 1399 static_branch_enable(&sev_es_enable_key); 1400 1401 /* Initialize per-cpu GHCB pages */ 1402 for_each_possible_cpu(cpu) { 1403 alloc_runtime_data(cpu); 1404 init_ghcb(cpu); 1405 } 1406 1407 sev_es_setup_play_dead(); 1408 1409 /* Secondary CPUs use the runtime #VC handler */ 1410 initial_vc_handler = (unsigned long)kernel_exc_vmm_communication; 1411 } 1412 1413 static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) 1414 { 1415 int trapnr = ctxt->fi.vector; 1416 1417 if (trapnr == X86_TRAP_PF) 1418 native_write_cr2(ctxt->fi.cr2); 1419 1420 ctxt->regs->orig_ax = ctxt->fi.error_code; 1421 do_early_exception(ctxt->regs, trapnr); 1422 } 1423 1424 static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) 1425 { 1426 long *reg_array; 1427 int offset; 1428 1429 reg_array = (long *)ctxt->regs; 1430 offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs); 1431 1432 if (offset < 0) 1433 return NULL; 1434 1435 offset /= sizeof(long); 1436 1437 return reg_array + offset; 1438 } 1439 static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, 1440 unsigned int bytes, bool read) 1441 { 1442 u64 exit_code, exit_info_1, exit_info_2; 1443 unsigned long ghcb_pa = __pa(ghcb); 1444 enum es_result res; 1445 phys_addr_t paddr; 1446 void __user *ref; 1447 1448 ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs); 1449 if (ref == (void __user *)-1L) 1450 return ES_UNSUPPORTED; 1451 1452 exit_code = read ? 
SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; 1453 1454 res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr); 1455 if (res != ES_OK) { 1456 if (res == ES_EXCEPTION && !read) 1457 ctxt->fi.error_code |= X86_PF_WRITE; 1458 1459 return res; 1460 } 1461 1462 exit_info_1 = paddr; 1463 /* Can never be greater than 8 */ 1464 exit_info_2 = bytes; 1465 1466 ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); 1467 1468 return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); 1469 } 1470 1471 /* 1472 * The MOVS instruction has two memory operands, which raises the 1473 * problem that it is not known whether the access to the source or the 1474 * destination caused the #VC exception (and hence whether an MMIO read 1475 * or write operation needs to be emulated). 1476 * 1477 * Instead of playing games with walking page-tables and trying to guess 1478 * whether the source or destination is an MMIO range, split the move 1479 * into two operations, a read and a write with only one memory operand. 1480 * This will cause a nested #VC exception on the MMIO address which can 1481 * then be handled. 1482 * 1483 * This implementation has the benefit that it also supports MOVS where 1484 * source _and_ destination are MMIO regions. 1485 * 1486 * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a 1487 * rare operation. If it turns out to be a performance problem the split 1488 * operations can be moved to memcpy_fromio() and memcpy_toio(). 1489 */ 1490 static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, 1491 unsigned int bytes) 1492 { 1493 unsigned long ds_base, es_base; 1494 unsigned char *src, *dst; 1495 unsigned char buffer[8]; 1496 enum es_result ret; 1497 bool rep; 1498 int off; 1499 1500 ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS); 1501 es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); 1502 1503 if (ds_base == -1L || es_base == -1L) { 1504 ctxt->fi.vector = X86_TRAP_GP; 1505 ctxt->fi.error_code = 0; 1506 return ES_EXCEPTION; 1507 } 1508 1509 src = ds_base + (unsigned char *)ctxt->regs->si; 1510 dst = es_base + (unsigned char *)ctxt->regs->di; 1511 1512 ret = vc_read_mem(ctxt, src, buffer, bytes); 1513 if (ret != ES_OK) 1514 return ret; 1515 1516 ret = vc_write_mem(ctxt, dst, buffer, bytes); 1517 if (ret != ES_OK) 1518 return ret; 1519 1520 if (ctxt->regs->flags & X86_EFLAGS_DF) 1521 off = -bytes; 1522 else 1523 off = bytes; 1524 1525 ctxt->regs->si += off; 1526 ctxt->regs->di += off; 1527 1528 rep = insn_has_rep_prefix(&ctxt->insn); 1529 if (rep) 1530 ctxt->regs->cx -= 1; 1531 1532 if (!rep || ctxt->regs->cx == 0) 1533 return ES_OK; 1534 else 1535 return ES_RETRY; 1536 } 1537 1538 static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) 1539 { 1540 struct insn *insn = &ctxt->insn; 1541 enum insn_mmio_type mmio; 1542 unsigned int bytes = 0; 1543 enum es_result ret; 1544 u8 sign_byte; 1545 long *reg_data; 1546 1547 mmio = insn_decode_mmio(insn, &bytes); 1548 if (mmio == INSN_MMIO_DECODE_FAILED) 1549 return ES_DECODE_FAILED; 1550 1551 if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) { 1552 reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs); 1553 if (!reg_data) 1554 return ES_DECODE_FAILED; 1555 } 1556 1557 switch (mmio) { 1558 case INSN_MMIO_WRITE: 1559 memcpy(ghcb->shared_buffer, reg_data, bytes); 1560 ret = vc_do_mmio(ghcb, ctxt, bytes, false); 1561 break; 1562 case INSN_MMIO_WRITE_IMM: 1563 memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); 1564 ret = 
vc_do_mmio(ghcb, ctxt, bytes, false); 1565 break; 1566 case INSN_MMIO_READ: 1567 ret = vc_do_mmio(ghcb, ctxt, bytes, true); 1568 if (ret) 1569 break; 1570 1571 /* Zero-extend for 32-bit operation */ 1572 if (bytes == 4) 1573 *reg_data = 0; 1574 1575 memcpy(reg_data, ghcb->shared_buffer, bytes); 1576 break; 1577 case INSN_MMIO_READ_ZERO_EXTEND: 1578 ret = vc_do_mmio(ghcb, ctxt, bytes, true); 1579 if (ret) 1580 break; 1581 1582 /* Zero extend based on operand size */ 1583 memset(reg_data, 0, insn->opnd_bytes); 1584 memcpy(reg_data, ghcb->shared_buffer, bytes); 1585 break; 1586 case INSN_MMIO_READ_SIGN_EXTEND: 1587 ret = vc_do_mmio(ghcb, ctxt, bytes, true); 1588 if (ret) 1589 break; 1590 1591 if (bytes == 1) { 1592 u8 *val = (u8 *)ghcb->shared_buffer; 1593 1594 sign_byte = (*val & 0x80) ? 0xff : 0x00; 1595 } else { 1596 u16 *val = (u16 *)ghcb->shared_buffer; 1597 1598 sign_byte = (*val & 0x8000) ? 0xff : 0x00; 1599 } 1600 1601 /* Sign extend based on operand size */ 1602 memset(reg_data, sign_byte, insn->opnd_bytes); 1603 memcpy(reg_data, ghcb->shared_buffer, bytes); 1604 break; 1605 case INSN_MMIO_MOVS: 1606 ret = vc_handle_mmio_movs(ctxt, bytes); 1607 break; 1608 default: 1609 ret = ES_UNSUPPORTED; 1610 break; 1611 } 1612 1613 return ret; 1614 } 1615 1616 static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, 1617 struct es_em_ctxt *ctxt) 1618 { 1619 struct sev_es_runtime_data *data = this_cpu_read(runtime_data); 1620 long val, *reg = vc_insn_get_rm(ctxt); 1621 enum es_result ret; 1622 1623 if (!reg) 1624 return ES_DECODE_FAILED; 1625 1626 val = *reg; 1627 1628 /* Upper 32 bits must be written as zeroes */ 1629 if (val >> 32) { 1630 ctxt->fi.vector = X86_TRAP_GP; 1631 ctxt->fi.error_code = 0; 1632 return ES_EXCEPTION; 1633 } 1634 1635 /* Clear out other reserved bits and set bit 10 */ 1636 val = (val & 0xffff23ffL) | BIT(10); 1637 1638 /* Early non-zero writes to DR7 are not supported */ 1639 if (!data && (val & ~DR7_RESET_VALUE)) 1640 return ES_UNSUPPORTED; 1641 1642 /* Using a value of 0 for ExitInfo1 means RAX holds the value */ 1643 ghcb_set_rax(ghcb, val); 1644 ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); 1645 if (ret != ES_OK) 1646 return ret; 1647 1648 if (data) 1649 data->dr7 = val; 1650 1651 return ES_OK; 1652 } 1653 1654 static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, 1655 struct es_em_ctxt *ctxt) 1656 { 1657 struct sev_es_runtime_data *data = this_cpu_read(runtime_data); 1658 long *reg = vc_insn_get_rm(ctxt); 1659 1660 if (!reg) 1661 return ES_DECODE_FAILED; 1662 1663 if (data) 1664 *reg = data->dr7; 1665 else 1666 *reg = DR7_RESET_VALUE; 1667 1668 return ES_OK; 1669 } 1670 1671 static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, 1672 struct es_em_ctxt *ctxt) 1673 { 1674 return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); 1675 } 1676 1677 static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) 1678 { 1679 enum es_result ret; 1680 1681 ghcb_set_rcx(ghcb, ctxt->regs->cx); 1682 1683 ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); 1684 if (ret != ES_OK) 1685 return ret; 1686 1687 if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb))) 1688 return ES_VMM_ERROR; 1689 1690 ctxt->regs->ax = ghcb->save.rax; 1691 ctxt->regs->dx = ghcb->save.rdx; 1692 1693 return ES_OK; 1694 } 1695 1696 static enum es_result vc_handle_monitor(struct ghcb *ghcb, 1697 struct es_em_ctxt *ctxt) 1698 { 1699 /* 1700 * Treat it as a NOP and do not leak a physical address to the 1701 * hypervisor. 
1702 */ 1703 return ES_OK; 1704 } 1705 1706 static enum es_result vc_handle_mwait(struct ghcb *ghcb, 1707 struct es_em_ctxt *ctxt) 1708 { 1709 /* Treat the same as MONITOR/MONITORX */ 1710 return ES_OK; 1711 } 1712 1713 static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, 1714 struct es_em_ctxt *ctxt) 1715 { 1716 enum es_result ret; 1717 1718 ghcb_set_rax(ghcb, ctxt->regs->ax); 1719 ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0); 1720 1721 if (x86_platform.hyper.sev_es_hcall_prepare) 1722 x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); 1723 1724 ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); 1725 if (ret != ES_OK) 1726 return ret; 1727 1728 if (!ghcb_rax_is_valid(ghcb)) 1729 return ES_VMM_ERROR; 1730 1731 ctxt->regs->ax = ghcb->save.rax; 1732 1733 /* 1734 * Call sev_es_hcall_finish() after regs->ax is already set. 1735 * This allows the hypervisor handler to overwrite it again if 1736 * necessary. 1737 */ 1738 if (x86_platform.hyper.sev_es_hcall_finish && 1739 !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs)) 1740 return ES_VMM_ERROR; 1741 1742 return ES_OK; 1743 } 1744 1745 static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, 1746 struct es_em_ctxt *ctxt) 1747 { 1748 /* 1749 * Calling ecx_alignment_check() directly does not work, because it 1750 * enables IRQs and the GHCB is active. Forward the exception and call 1751 * it later from vc_forward_exception(). 1752 */ 1753 ctxt->fi.vector = X86_TRAP_AC; 1754 ctxt->fi.error_code = 0; 1755 return ES_EXCEPTION; 1756 } 1757 1758 static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, 1759 struct ghcb *ghcb, 1760 unsigned long exit_code) 1761 { 1762 enum es_result result; 1763 1764 switch (exit_code) { 1765 case SVM_EXIT_READ_DR7: 1766 result = vc_handle_dr7_read(ghcb, ctxt); 1767 break; 1768 case SVM_EXIT_WRITE_DR7: 1769 result = vc_handle_dr7_write(ghcb, ctxt); 1770 break; 1771 case SVM_EXIT_EXCP_BASE + X86_TRAP_AC: 1772 result = vc_handle_trap_ac(ghcb, ctxt); 1773 break; 1774 case SVM_EXIT_RDTSC: 1775 case SVM_EXIT_RDTSCP: 1776 result = vc_handle_rdtsc(ghcb, ctxt, exit_code); 1777 break; 1778 case SVM_EXIT_RDPMC: 1779 result = vc_handle_rdpmc(ghcb, ctxt); 1780 break; 1781 case SVM_EXIT_INVD: 1782 pr_err_ratelimited("#VC exception for INVD??? 
Seriously???\n"); 1783 result = ES_UNSUPPORTED; 1784 break; 1785 case SVM_EXIT_CPUID: 1786 result = vc_handle_cpuid(ghcb, ctxt); 1787 break; 1788 case SVM_EXIT_IOIO: 1789 result = vc_handle_ioio(ghcb, ctxt); 1790 break; 1791 case SVM_EXIT_MSR: 1792 result = vc_handle_msr(ghcb, ctxt); 1793 break; 1794 case SVM_EXIT_VMMCALL: 1795 result = vc_handle_vmmcall(ghcb, ctxt); 1796 break; 1797 case SVM_EXIT_WBINVD: 1798 result = vc_handle_wbinvd(ghcb, ctxt); 1799 break; 1800 case SVM_EXIT_MONITOR: 1801 result = vc_handle_monitor(ghcb, ctxt); 1802 break; 1803 case SVM_EXIT_MWAIT: 1804 result = vc_handle_mwait(ghcb, ctxt); 1805 break; 1806 case SVM_EXIT_NPF: 1807 result = vc_handle_mmio(ghcb, ctxt); 1808 break; 1809 default: 1810 /* 1811 * Unexpected #VC exception 1812 */ 1813 result = ES_UNSUPPORTED; 1814 } 1815 1816 return result; 1817 } 1818 1819 static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) 1820 { 1821 long error_code = ctxt->fi.error_code; 1822 int trapnr = ctxt->fi.vector; 1823 1824 ctxt->regs->orig_ax = ctxt->fi.error_code; 1825 1826 switch (trapnr) { 1827 case X86_TRAP_GP: 1828 exc_general_protection(ctxt->regs, error_code); 1829 break; 1830 case X86_TRAP_UD: 1831 exc_invalid_op(ctxt->regs); 1832 break; 1833 case X86_TRAP_PF: 1834 write_cr2(ctxt->fi.cr2); 1835 exc_page_fault(ctxt->regs, error_code); 1836 break; 1837 case X86_TRAP_AC: 1838 exc_alignment_check(ctxt->regs, error_code); 1839 break; 1840 default: 1841 pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n"); 1842 BUG(); 1843 } 1844 } 1845 1846 static __always_inline bool is_vc2_stack(unsigned long sp) 1847 { 1848 return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); 1849 } 1850 1851 static __always_inline bool vc_from_invalid_context(struct pt_regs *regs) 1852 { 1853 unsigned long sp, prev_sp; 1854 1855 sp = (unsigned long)regs; 1856 prev_sp = regs->sp; 1857 1858 /* 1859 * If the code was already executing on the VC2 stack when the #VC 1860 * happened, let it proceed to the normal handling routine. This way the 1861 * code executing on the VC2 stack can cause #VC exceptions to get handled. 
1862 */ 1863 return is_vc2_stack(sp) && !is_vc2_stack(prev_sp); 1864 } 1865 1866 static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code) 1867 { 1868 struct ghcb_state state; 1869 struct es_em_ctxt ctxt; 1870 enum es_result result; 1871 struct ghcb *ghcb; 1872 bool ret = true; 1873 1874 ghcb = __sev_get_ghcb(&state); 1875 1876 vc_ghcb_invalidate(ghcb); 1877 result = vc_init_em_ctxt(&ctxt, regs, error_code); 1878 1879 if (result == ES_OK) 1880 result = vc_handle_exitcode(&ctxt, ghcb, error_code); 1881 1882 __sev_put_ghcb(&state); 1883 1884 /* Done - now check the result */ 1885 switch (result) { 1886 case ES_OK: 1887 vc_finish_insn(&ctxt); 1888 break; 1889 case ES_UNSUPPORTED: 1890 pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n", 1891 error_code, regs->ip); 1892 ret = false; 1893 break; 1894 case ES_VMM_ERROR: 1895 pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", 1896 error_code, regs->ip); 1897 ret = false; 1898 break; 1899 case ES_DECODE_FAILED: 1900 pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", 1901 error_code, regs->ip); 1902 ret = false; 1903 break; 1904 case ES_EXCEPTION: 1905 vc_forward_exception(&ctxt); 1906 break; 1907 case ES_RETRY: 1908 /* Nothing to do */ 1909 break; 1910 default: 1911 pr_emerg("Unknown result in %s():%d\n", __func__, result); 1912 /* 1913 * Emulating the instruction which caused the #VC exception 1914 * failed - can't continue so print debug information 1915 */ 1916 BUG(); 1917 } 1918 1919 return ret; 1920 } 1921 1922 static __always_inline bool vc_is_db(unsigned long error_code) 1923 { 1924 return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB; 1925 } 1926 1927 /* 1928 * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode 1929 * and will panic when an error happens. 1930 */ 1931 DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) 1932 { 1933 irqentry_state_t irq_state; 1934 1935 /* 1936 * With the current implementation it is always possible to switch to a 1937 * safe stack because #VC exceptions only happen at known places, like 1938 * intercepted instructions or accesses to MMIO areas/IO ports. They can 1939 * also happen with code instrumentation when the hypervisor intercepts 1940 * #DB, but the critical paths are forbidden to be instrumented, so #DB 1941 * exceptions currently also only happen in safe places. 1942 * 1943 * But keep this here in case the noinstr annotations are violated due 1944 * to bug elsewhere. 1945 */ 1946 if (unlikely(vc_from_invalid_context(regs))) { 1947 instrumentation_begin(); 1948 panic("Can't handle #VC exception from unsupported context\n"); 1949 instrumentation_end(); 1950 } 1951 1952 /* 1953 * Handle #DB before calling into !noinstr code to avoid recursive #DB. 1954 */ 1955 if (vc_is_db(error_code)) { 1956 exc_debug(regs); 1957 return; 1958 } 1959 1960 irq_state = irqentry_nmi_enter(regs); 1961 1962 instrumentation_begin(); 1963 1964 if (!vc_raw_handle_exception(regs, error_code)) { 1965 /* Show some debug info */ 1966 show_regs(regs); 1967 1968 /* Ask hypervisor to sev_es_terminate */ 1969 sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); 1970 1971 /* If that fails and we get here - just panic */ 1972 panic("Returned from Terminate-Request to Hypervisor\n"); 1973 } 1974 1975 instrumentation_end(); 1976 irqentry_nmi_exit(regs, irq_state); 1977 } 1978 1979 /* 1980 * Runtime #VC exception handler when raised from user mode. 
Runs in IRQ mode 1981 * and will kill the current task with SIGBUS when an error happens. 1982 */ 1983 DEFINE_IDTENTRY_VC_USER(exc_vmm_communication) 1984 { 1985 /* 1986 * Handle #DB before calling into !noinstr code to avoid recursive #DB. 1987 */ 1988 if (vc_is_db(error_code)) { 1989 noist_exc_debug(regs); 1990 return; 1991 } 1992 1993 irqentry_enter_from_user_mode(regs); 1994 instrumentation_begin(); 1995 1996 if (!vc_raw_handle_exception(regs, error_code)) { 1997 /* 1998 * Do not kill the machine if user-space triggered the 1999 * exception. Send SIGBUS instead and let user-space deal with 2000 * it. 2001 */ 2002 force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); 2003 } 2004 2005 instrumentation_end(); 2006 irqentry_exit_to_user_mode(regs); 2007 } 2008 2009 bool __init handle_vc_boot_ghcb(struct pt_regs *regs) 2010 { 2011 unsigned long exit_code = regs->orig_ax; 2012 struct es_em_ctxt ctxt; 2013 enum es_result result; 2014 2015 vc_ghcb_invalidate(boot_ghcb); 2016 2017 result = vc_init_em_ctxt(&ctxt, regs, exit_code); 2018 if (result == ES_OK) 2019 result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code); 2020 2021 /* Done - now check the result */ 2022 switch (result) { 2023 case ES_OK: 2024 vc_finish_insn(&ctxt); 2025 break; 2026 case ES_UNSUPPORTED: 2027 early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", 2028 exit_code, regs->ip); 2029 goto fail; 2030 case ES_VMM_ERROR: 2031 early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", 2032 exit_code, regs->ip); 2033 goto fail; 2034 case ES_DECODE_FAILED: 2035 early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", 2036 exit_code, regs->ip); 2037 goto fail; 2038 case ES_EXCEPTION: 2039 vc_early_forward_exception(&ctxt); 2040 break; 2041 case ES_RETRY: 2042 /* Nothing to do */ 2043 break; 2044 default: 2045 BUG(); 2046 } 2047 2048 return true; 2049 2050 fail: 2051 show_regs(regs); 2052 2053 sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); 2054 } 2055 2056 /* 2057 * Initial set up of SNP relies on information provided by the 2058 * Confidential Computing blob, which can be passed to the kernel 2059 * in the following ways, depending on how it is booted: 2060 * 2061 * - when booted via the boot/decompress kernel: 2062 * - via boot_params 2063 * 2064 * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH): 2065 * - via a setup_data entry, as defined by the Linux Boot Protocol 2066 * 2067 * Scan for the blob in that order. 2068 */ 2069 static __init struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) 2070 { 2071 struct cc_blob_sev_info *cc_info; 2072 2073 /* Boot kernel would have passed the CC blob via boot_params. */ 2074 if (bp->cc_blob_address) { 2075 cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address; 2076 goto found_cc_info; 2077 } 2078 2079 /* 2080 * If kernel was booted directly, without the use of the 2081 * boot/decompression kernel, the CC blob may have been passed via 2082 * setup_data instead. 
bool __init snp_init(struct boot_params *bp)
{
	struct cc_blob_sev_info *cc_info;

	if (!bp)
		return false;

	cc_info = find_cc_blob(bp);
	if (!cc_info)
		return false;

	setup_cpuid_table(cc_info);

	/*
	 * The CC blob will be used later to access the secrets page. Cache
	 * it here like the boot kernel does.
	 */
	bp->cc_blob_address = (u32)(unsigned long)cc_info;

	return true;
}

void __init __noreturn snp_abort(void)
{
	sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
}

static void dump_cpuid_table(void)
{
	const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
	int i = 0;

	pr_info("count=%d reserved=0x%x reserved2=0x%llx\n",
		cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2);

	for (i = 0; i < SNP_CPUID_COUNT_MAX; i++) {
		const struct snp_cpuid_fn *fn = &cpuid_table->fn[i];

		pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n",
			i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx,
			fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved);
	}
}

/*
 * It is useful from an auditing/testing perspective to provide an easy way
 * for the guest owner to know that the CPUID table has been initialized as
 * expected. However, that initialization happens too early in boot to print
 * any indicator, and there is no better place to do it, so do it here.
 */
static int __init report_cpuid_table(void)
{
	const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();

	if (!cpuid_table->count)
		return 0;

	pr_info("Using SNP CPUID table, %d entries present.\n",
		cpuid_table->count);

	if (sev_cfg.debug)
		dump_cpuid_table();

	return 0;
}
arch_initcall(report_cpuid_table);

/*
 * Parse the "sev=" command-line options. Currently only "sev=debug" is
 * recognized; it enables additional debug output such as the SNP CPUID
 * table dump above.
 */
static int __init init_sev_config(char *str)
{
	char *s;

	while ((s = strsep(&str, ","))) {
		if (!strcmp(s, "debug")) {
			sev_cfg.debug = true;
			continue;
		}

		pr_info("SEV command-line option '%s' was not recognized\n", s);
	}

	return 1;
}
__setup("sev=", init_sev_config);
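
/**
 * snp_issue_guest_request() - Issue an SNP guest request to the hypervisor
 * @exit_code:	SNP guest request VMGEXIT exit code, e.g.
 *		SVM_VMGEXIT_EXT_GUEST_REQUEST for extended requests
 * @input:	GPAs of the request/response pages and, for extended requests,
 *		the additional data pages (@input->data_gpa, @input->data_npages)
 * @rio:	receives the firmware status in @rio->exitinfo2, which is
 *		initialized to SEV_RET_NO_FW_CALL before the request is attempted
 *
 * Return: 0 on success, -EAGAIN if the hypervisor reports it is busy,
 * -ENOSPC if more data pages are needed for an extended request (the expected
 * number is returned in @input->data_npages), -EIO on any other failure.
 */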
int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio)
{
	struct ghcb_state state;
	struct es_em_ctxt ctxt;
	unsigned long flags;
	struct ghcb *ghcb;
	int ret;

	rio->exitinfo2 = SEV_RET_NO_FW_CALL;

	/*
	 * __sev_get_ghcb() needs to run with IRQs disabled because it is using
	 * a per-CPU GHCB.
	 */
	local_irq_save(flags);

	ghcb = __sev_get_ghcb(&state);
	if (!ghcb) {
		ret = -EIO;
		goto e_restore_irq;
	}

	vc_ghcb_invalidate(ghcb);

	if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
		ghcb_set_rax(ghcb, input->data_gpa);
		ghcb_set_rbx(ghcb, input->data_npages);
	}

	ret = sev_es_ghcb_hv_call(ghcb, &ctxt, exit_code, input->req_gpa, input->resp_gpa);
	if (ret)
		goto e_put;

	rio->exitinfo2 = ghcb->save.sw_exit_info_2;
	switch (rio->exitinfo2) {
	case 0:
		break;

	case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_BUSY):
		ret = -EAGAIN;
		break;

	case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN):
		/* The number of expected pages is returned in RBX */
		if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
			input->data_npages = ghcb_get_rbx(ghcb);
			ret = -ENOSPC;
			break;
		}
		fallthrough;
	default:
		ret = -EIO;
		break;
	}

e_put:
	__sev_put_ghcb(&state);
e_restore_irq:
	local_irq_restore(flags);

	return ret;
}
EXPORT_SYMBOL_GPL(snp_issue_guest_request);

static struct platform_device sev_guest_device = {
	.name		= "sev-guest",
	.id		= -1,
};

static int __init snp_init_platform_device(void)
{
	struct sev_guest_platform_data data;
	u64 gpa;

	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
		return -ENODEV;

	gpa = get_secrets_page();
	if (!gpa)
		return -ENODEV;

	data.secrets_gpa = gpa;
	if (platform_device_add_data(&sev_guest_device, &data, sizeof(data)))
		return -ENODEV;

	if (platform_device_register(&sev_guest_device))
		return -ENODEV;

	pr_info("SNP guest platform device initialized.\n");
	return 0;
}
device_initcall(snp_init_platform_device);
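
/*
 * Illustrative sketch (not part of this file): the "sev-guest" platform
 * device registered above carries a struct sev_guest_platform_data as its
 * platform data, so a driver binding to it could retrieve the secrets page
 * GPA roughly as follows. The driver and function names below are
 * hypothetical; only the "sev-guest" device name and the platform data
 * layout come from this file.
 *
 *	static int sev_guest_example_probe(struct platform_device *pdev)
 *	{
 *		struct sev_guest_platform_data *pdata;
 *
 *		pdata = dev_get_platdata(&pdev->dev);
 *		if (!pdata)
 *			return -ENODEV;
 *
 *		dev_info(&pdev->dev, "secrets page GPA: 0x%llx\n",
 *			 pdata->secrets_gpa);
 *		return 0;
 *	}
 *
 *	static struct platform_driver sev_guest_example_driver = {
 *		.probe	= sev_guest_example_probe,
 *		.driver	= { .name = "sev-guest" },
 *	};
 *	module_platform_driver(sev_guest_example_driver);
 */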