1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * AMD Memory Encryption Support 4 * 5 * Copyright (C) 2019 SUSE 6 * 7 * Author: Joerg Roedel <jroedel@suse.de> 8 */ 9 10 #define pr_fmt(fmt) "SEV: " fmt 11 12 #include <linux/sched/debug.h> /* For show_regs() */ 13 #include <linux/percpu-defs.h> 14 #include <linux/cc_platform.h> 15 #include <linux/printk.h> 16 #include <linux/mm_types.h> 17 #include <linux/set_memory.h> 18 #include <linux/memblock.h> 19 #include <linux/kernel.h> 20 #include <linux/mm.h> 21 #include <linux/cpumask.h> 22 #include <linux/efi.h> 23 #include <linux/platform_device.h> 24 #include <linux/io.h> 25 #include <linux/psp-sev.h> 26 #include <linux/dmi.h> 27 #include <uapi/linux/sev-guest.h> 28 29 #include <asm/init.h> 30 #include <asm/cpu_entry_area.h> 31 #include <asm/stacktrace.h> 32 #include <asm/sev.h> 33 #include <asm/insn-eval.h> 34 #include <asm/fpu/xcr.h> 35 #include <asm/processor.h> 36 #include <asm/realmode.h> 37 #include <asm/setup.h> 38 #include <asm/traps.h> 39 #include <asm/svm.h> 40 #include <asm/smp.h> 41 #include <asm/cpu.h> 42 #include <asm/apic.h> 43 #include <asm/cpuid.h> 44 #include <asm/cmdline.h> 45 46 #define DR7_RESET_VALUE 0x400 47 48 /* AP INIT values as documented in the APM2 section "Processor Initialization State" */ 49 #define AP_INIT_CS_LIMIT 0xffff 50 #define AP_INIT_DS_LIMIT 0xffff 51 #define AP_INIT_LDTR_LIMIT 0xffff 52 #define AP_INIT_GDTR_LIMIT 0xffff 53 #define AP_INIT_IDTR_LIMIT 0xffff 54 #define AP_INIT_TR_LIMIT 0xffff 55 #define AP_INIT_RFLAGS_DEFAULT 0x2 56 #define AP_INIT_DR6_DEFAULT 0xffff0ff0 57 #define AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL 58 #define AP_INIT_XCR0_DEFAULT 0x1 59 #define AP_INIT_X87_FTW_DEFAULT 0x5555 60 #define AP_INIT_X87_FCW_DEFAULT 0x0040 61 #define AP_INIT_CR0_DEFAULT 0x60000010 62 #define AP_INIT_MXCSR_DEFAULT 0x1f80 63 64 /* For early boot hypervisor communication in SEV-ES enabled guests */ 65 static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); 66 67 /* 68 * Needs to be in the .data section because we need it NULL before bss is 69 * cleared 70 */ 71 static struct ghcb *boot_ghcb __section(".data"); 72 73 /* Bitmap of SEV features supported by the hypervisor */ 74 static u64 sev_hv_features __ro_after_init; 75 76 /* #VC handler runtime per-CPU data */ 77 struct sev_es_runtime_data { 78 struct ghcb ghcb_page; 79 80 /* 81 * Reserve one page per CPU as backup storage for the unencrypted GHCB. 82 * It is needed when an NMI happens while the #VC handler uses the real 83 * GHCB, and the NMI handler itself is causing another #VC exception. In 84 * that case the GHCB content of the first handler needs to be backed up 85 * and restored. 86 */ 87 struct ghcb backup_ghcb; 88 89 /* 90 * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. 91 * There is no need for it to be atomic, because nothing is written to 92 * the GHCB between the read and the write of ghcb_active. So it is safe 93 * to use it when a nested #VC exception happens before the write. 94 * 95 * This is necessary for example in the #VC->NMI->#VC case when the NMI 96 * happens while the first #VC handler uses the GHCB. When the NMI code 97 * raises a second #VC handler it might overwrite the contents of the 98 * GHCB written by the first handler. To avoid this the content of the 99 * GHCB is saved and restored when the GHCB is detected to be in use 100 * already. 101 */ 102 bool ghcb_active; 103 bool backup_ghcb_active; 104 105 /* 106 * Cached DR7 value - write it on DR7 writes and return it on reads. 
	 * That value will never make it to the real hardware DR7 as debugging
	 * is currently unsupported in SEV-ES guests.
	 */
	unsigned long dr7;
};

struct ghcb_state {
	struct ghcb *ghcb;
};

static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa);

struct sev_config {
	__u64 debug		: 1,

	      /*
	       * A flag used by __set_pages_state() that indicates when the
	       * per-CPU GHCB has been created and registered and thus can be
	       * used by the BSP instead of the early boot GHCB.
	       *
	       * For APs, the per-CPU GHCB is created before they are started
	       * and registered upon startup, so this flag can be used globally
	       * for the BSP and APs.
	       */
	      ghcbs_initialized	: 1,

	      __reserved	: 62;
};

static struct sev_config sev_cfg __read_mostly;

static __always_inline bool on_vc_stack(struct pt_regs *regs)
{
	unsigned long sp = regs->sp;

	/* User-mode RSP is not trusted */
	if (user_mode(regs))
		return false;

	/* SYSCALL gap still has user-mode RSP */
	if (ip_within_syscall_gap(regs))
		return false;

	return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
}

/*
 * This function handles the case when an NMI is raised in the #VC
 * exception handler entry code, before the #VC handler has switched off
 * its IST stack. In this case, the IST entry for #VC must be adjusted,
 * so that any nested #VC exception will not overwrite the stack
 * contents of the interrupted #VC handler.
 *
 * The IST entry is adjusted unconditionally so that it can also be
 * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a
 * nested sev_es_ist_exit() call may adjust back the IST entry too
 * early.
 *
 * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run
 * on the NMI IST stack, as they are only called from NMI handling code
 * right now.
 */
void noinstr __sev_es_ist_enter(struct pt_regs *regs)
{
	unsigned long old_ist, new_ist;

	/* Read old IST entry */
	new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);

	/*
	 * If NMI happened while on the #VC IST stack, set the new IST
	 * value below regs->sp, so that the interrupted stack frame is
	 * not overwritten by subsequent #VC exceptions.
	 */
	if (on_vc_stack(regs))
		new_ist = regs->sp;

	/*
	 * Reserve additional 8 bytes and store old IST value so this
	 * adjustment can be unrolled in __sev_es_ist_exit().
	 */
	new_ist -= sizeof(old_ist);
	*(unsigned long *)new_ist = old_ist;

	/* Set new IST entry */
	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
}

void noinstr __sev_es_ist_exit(void)
{
	unsigned long ist;

	/* Read IST entry */
	ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);

	if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
		return;

	/* Read back old IST entry and write it to the TSS */
	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
}

/*
 * Nothing shall interrupt this code path while holding the per-CPU
 * GHCB. The backup GHCB is only for NMIs interrupting this path.
 *
 * Callers must disable local interrupts around it.
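 *
 * A minimal usage sketch (mirroring what get_jump_table_addr() and
 * snp_issue_guest_request() below do):
 *
 *	local_irq_save(flags);
 *	ghcb = __sev_get_ghcb(&state);
 *	vc_ghcb_invalidate(ghcb);
 *	... fill in GHCB fields for the request ...
 *	sev_es_wr_ghcb_msr(__pa(ghcb));
 *	VMGEXIT();
 *	... read results back from the GHCB ...
 *	__sev_put_ghcb(&state);
 *	local_irq_restore(flags);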
215 */ 216 static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) 217 { 218 struct sev_es_runtime_data *data; 219 struct ghcb *ghcb; 220 221 WARN_ON(!irqs_disabled()); 222 223 data = this_cpu_read(runtime_data); 224 ghcb = &data->ghcb_page; 225 226 if (unlikely(data->ghcb_active)) { 227 /* GHCB is already in use - save its contents */ 228 229 if (unlikely(data->backup_ghcb_active)) { 230 /* 231 * Backup-GHCB is also already in use. There is no way 232 * to continue here so just kill the machine. To make 233 * panic() work, mark GHCBs inactive so that messages 234 * can be printed out. 235 */ 236 data->ghcb_active = false; 237 data->backup_ghcb_active = false; 238 239 instrumentation_begin(); 240 panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); 241 instrumentation_end(); 242 } 243 244 /* Mark backup_ghcb active before writing to it */ 245 data->backup_ghcb_active = true; 246 247 state->ghcb = &data->backup_ghcb; 248 249 /* Backup GHCB content */ 250 *state->ghcb = *ghcb; 251 } else { 252 state->ghcb = NULL; 253 data->ghcb_active = true; 254 } 255 256 return ghcb; 257 } 258 259 static inline u64 sev_es_rd_ghcb_msr(void) 260 { 261 return __rdmsr(MSR_AMD64_SEV_ES_GHCB); 262 } 263 264 static __always_inline void sev_es_wr_ghcb_msr(u64 val) 265 { 266 u32 low, high; 267 268 low = (u32)(val); 269 high = (u32)(val >> 32); 270 271 native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); 272 } 273 274 static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, 275 unsigned char *buffer) 276 { 277 return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); 278 } 279 280 static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) 281 { 282 char buffer[MAX_INSN_SIZE]; 283 int insn_bytes; 284 285 insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer); 286 if (insn_bytes == 0) { 287 /* Nothing could be copied */ 288 ctxt->fi.vector = X86_TRAP_PF; 289 ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; 290 ctxt->fi.cr2 = ctxt->regs->ip; 291 return ES_EXCEPTION; 292 } else if (insn_bytes == -EINVAL) { 293 /* Effective RIP could not be calculated */ 294 ctxt->fi.vector = X86_TRAP_GP; 295 ctxt->fi.error_code = 0; 296 ctxt->fi.cr2 = 0; 297 return ES_EXCEPTION; 298 } 299 300 if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes)) 301 return ES_DECODE_FAILED; 302 303 if (ctxt->insn.immediate.got) 304 return ES_OK; 305 else 306 return ES_DECODE_FAILED; 307 } 308 309 static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt) 310 { 311 char buffer[MAX_INSN_SIZE]; 312 int res, ret; 313 314 res = vc_fetch_insn_kernel(ctxt, buffer); 315 if (res) { 316 ctxt->fi.vector = X86_TRAP_PF; 317 ctxt->fi.error_code = X86_PF_INSTR; 318 ctxt->fi.cr2 = ctxt->regs->ip; 319 return ES_EXCEPTION; 320 } 321 322 ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); 323 if (ret < 0) 324 return ES_DECODE_FAILED; 325 else 326 return ES_OK; 327 } 328 329 static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) 330 { 331 if (user_mode(ctxt->regs)) 332 return __vc_decode_user_insn(ctxt); 333 else 334 return __vc_decode_kern_insn(ctxt); 335 } 336 337 static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, 338 char *dst, char *buf, size_t size) 339 { 340 unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; 341 342 /* 343 * This function uses __put_user() independent of whether kernel or user 344 * memory is accessed. This works fine because __put_user() does no 345 * sanity checks of the pointer being accessed. 
All that it does is 346 * to report when the access failed. 347 * 348 * Also, this function runs in atomic context, so __put_user() is not 349 * allowed to sleep. The page-fault handler detects that it is running 350 * in atomic context and will not try to take mmap_sem and handle the 351 * fault, so additional pagefault_enable()/disable() calls are not 352 * needed. 353 * 354 * The access can't be done via copy_to_user() here because 355 * vc_write_mem() must not use string instructions to access unsafe 356 * memory. The reason is that MOVS is emulated by the #VC handler by 357 * splitting the move up into a read and a write and taking a nested #VC 358 * exception on whatever of them is the MMIO access. Using string 359 * instructions here would cause infinite nesting. 360 */ 361 switch (size) { 362 case 1: { 363 u8 d1; 364 u8 __user *target = (u8 __user *)dst; 365 366 memcpy(&d1, buf, 1); 367 if (__put_user(d1, target)) 368 goto fault; 369 break; 370 } 371 case 2: { 372 u16 d2; 373 u16 __user *target = (u16 __user *)dst; 374 375 memcpy(&d2, buf, 2); 376 if (__put_user(d2, target)) 377 goto fault; 378 break; 379 } 380 case 4: { 381 u32 d4; 382 u32 __user *target = (u32 __user *)dst; 383 384 memcpy(&d4, buf, 4); 385 if (__put_user(d4, target)) 386 goto fault; 387 break; 388 } 389 case 8: { 390 u64 d8; 391 u64 __user *target = (u64 __user *)dst; 392 393 memcpy(&d8, buf, 8); 394 if (__put_user(d8, target)) 395 goto fault; 396 break; 397 } 398 default: 399 WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); 400 return ES_UNSUPPORTED; 401 } 402 403 return ES_OK; 404 405 fault: 406 if (user_mode(ctxt->regs)) 407 error_code |= X86_PF_USER; 408 409 ctxt->fi.vector = X86_TRAP_PF; 410 ctxt->fi.error_code = error_code; 411 ctxt->fi.cr2 = (unsigned long)dst; 412 413 return ES_EXCEPTION; 414 } 415 416 static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, 417 char *src, char *buf, size_t size) 418 { 419 unsigned long error_code = X86_PF_PROT; 420 421 /* 422 * This function uses __get_user() independent of whether kernel or user 423 * memory is accessed. This works fine because __get_user() does no 424 * sanity checks of the pointer being accessed. All that it does is 425 * to report when the access failed. 426 * 427 * Also, this function runs in atomic context, so __get_user() is not 428 * allowed to sleep. The page-fault handler detects that it is running 429 * in atomic context and will not try to take mmap_sem and handle the 430 * fault, so additional pagefault_enable()/disable() calls are not 431 * needed. 432 * 433 * The access can't be done via copy_from_user() here because 434 * vc_read_mem() must not use string instructions to access unsafe 435 * memory. The reason is that MOVS is emulated by the #VC handler by 436 * splitting the move up into a read and a write and taking a nested #VC 437 * exception on whatever of them is the MMIO access. Using string 438 * instructions here would cause infinite nesting. 
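	 *
	 * The read/write splitting of MOVS mentioned above is implemented in
	 * vc_handle_mmio_movs() further down in this file.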
	 */
	switch (size) {
	case 1: {
		u8 d1;
		u8 __user *s = (u8 __user *)src;

		if (__get_user(d1, s))
			goto fault;
		memcpy(buf, &d1, 1);
		break;
	}
	case 2: {
		u16 d2;
		u16 __user *s = (u16 __user *)src;

		if (__get_user(d2, s))
			goto fault;
		memcpy(buf, &d2, 2);
		break;
	}
	case 4: {
		u32 d4;
		u32 __user *s = (u32 __user *)src;

		if (__get_user(d4, s))
			goto fault;
		memcpy(buf, &d4, 4);
		break;
	}
	case 8: {
		u64 d8;
		u64 __user *s = (u64 __user *)src;

		if (__get_user(d8, s))
			goto fault;
		memcpy(buf, &d8, 8);
		break;
	}
	default:
		WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
		return ES_UNSUPPORTED;
	}

	return ES_OK;

fault:
	if (user_mode(ctxt->regs))
		error_code |= X86_PF_USER;

	ctxt->fi.vector = X86_TRAP_PF;
	ctxt->fi.error_code = error_code;
	ctxt->fi.cr2 = (unsigned long)src;

	return ES_EXCEPTION;
}

static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
					   unsigned long vaddr, phys_addr_t *paddr)
{
	unsigned long va = (unsigned long)vaddr;
	unsigned int level;
	phys_addr_t pa;
	pgd_t *pgd;
	pte_t *pte;

	pgd = __va(read_cr3_pa());
	pgd = &pgd[pgd_index(va)];
	pte = lookup_address_in_pgd(pgd, va, &level);
	if (!pte) {
		ctxt->fi.vector = X86_TRAP_PF;
		ctxt->fi.cr2 = vaddr;
		ctxt->fi.error_code = 0;

		if (user_mode(ctxt->regs))
			ctxt->fi.error_code |= X86_PF_USER;

		return ES_EXCEPTION;
	}

	if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC))
		/* Emulated MMIO to/from encrypted memory not supported */
		return ES_UNSUPPORTED;

	pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
	pa |= va & ~page_level_mask(level);

	*paddr = pa;

	return ES_OK;
}

static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size)
{
	BUG_ON(size > 4);

	if (user_mode(ctxt->regs)) {
		struct thread_struct *t = &current->thread;
		struct io_bitmap *iobm = t->io_bitmap;
		size_t idx;

		if (!iobm)
			goto fault;

		for (idx = port; idx < port + size; ++idx) {
			if (test_bit(idx, iobm->bitmap))
				goto fault;
		}
	}

	return ES_OK;

fault:
	ctxt->fi.vector = X86_TRAP_GP;
	ctxt->fi.error_code = 0;

	return ES_EXCEPTION;
}

/* Include code shared with pre-decompression boot stage */
#include "sev-shared.c"

static noinstr void __sev_put_ghcb(struct ghcb_state *state)
{
	struct sev_es_runtime_data *data;
	struct ghcb *ghcb;

	WARN_ON(!irqs_disabled());

	data = this_cpu_read(runtime_data);
	ghcb = &data->ghcb_page;

	if (state->ghcb) {
		/* Restore GHCB from Backup */
		*ghcb = *state->ghcb;
		data->backup_ghcb_active = false;
		state->ghcb = NULL;
	} else {
		/*
		 * Invalidate the GHCB so a VMGEXIT instruction issued
		 * from userspace won't appear to be valid.
578 */ 579 vc_ghcb_invalidate(ghcb); 580 data->ghcb_active = false; 581 } 582 } 583 584 void noinstr __sev_es_nmi_complete(void) 585 { 586 struct ghcb_state state; 587 struct ghcb *ghcb; 588 589 ghcb = __sev_get_ghcb(&state); 590 591 vc_ghcb_invalidate(ghcb); 592 ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); 593 ghcb_set_sw_exit_info_1(ghcb, 0); 594 ghcb_set_sw_exit_info_2(ghcb, 0); 595 596 sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); 597 VMGEXIT(); 598 599 __sev_put_ghcb(&state); 600 } 601 602 static u64 __init get_secrets_page(void) 603 { 604 u64 pa_data = boot_params.cc_blob_address; 605 struct cc_blob_sev_info info; 606 void *map; 607 608 /* 609 * The CC blob contains the address of the secrets page, check if the 610 * blob is present. 611 */ 612 if (!pa_data) 613 return 0; 614 615 map = early_memremap(pa_data, sizeof(info)); 616 if (!map) { 617 pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n"); 618 return 0; 619 } 620 memcpy(&info, map, sizeof(info)); 621 early_memunmap(map, sizeof(info)); 622 623 /* smoke-test the secrets page passed */ 624 if (!info.secrets_phys || info.secrets_len != PAGE_SIZE) 625 return 0; 626 627 return info.secrets_phys; 628 } 629 630 static u64 __init get_snp_jump_table_addr(void) 631 { 632 struct snp_secrets_page_layout *layout; 633 void __iomem *mem; 634 u64 pa, addr; 635 636 pa = get_secrets_page(); 637 if (!pa) 638 return 0; 639 640 mem = ioremap_encrypted(pa, PAGE_SIZE); 641 if (!mem) { 642 pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n"); 643 return 0; 644 } 645 646 layout = (__force struct snp_secrets_page_layout *)mem; 647 648 addr = layout->os_area.ap_jump_table_pa; 649 iounmap(mem); 650 651 return addr; 652 } 653 654 static u64 __init get_jump_table_addr(void) 655 { 656 struct ghcb_state state; 657 unsigned long flags; 658 struct ghcb *ghcb; 659 u64 ret = 0; 660 661 if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 662 return get_snp_jump_table_addr(); 663 664 local_irq_save(flags); 665 666 ghcb = __sev_get_ghcb(&state); 667 668 vc_ghcb_invalidate(ghcb); 669 ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); 670 ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE); 671 ghcb_set_sw_exit_info_2(ghcb, 0); 672 673 sev_es_wr_ghcb_msr(__pa(ghcb)); 674 VMGEXIT(); 675 676 if (ghcb_sw_exit_info_1_is_valid(ghcb) && 677 ghcb_sw_exit_info_2_is_valid(ghcb)) 678 ret = ghcb->save.sw_exit_info_2; 679 680 __sev_put_ghcb(&state); 681 682 local_irq_restore(flags); 683 684 return ret; 685 } 686 687 static void __head 688 early_set_pages_state(unsigned long vaddr, unsigned long paddr, 689 unsigned long npages, enum psc_op op) 690 { 691 unsigned long paddr_end; 692 u64 val; 693 int ret; 694 695 vaddr = vaddr & PAGE_MASK; 696 697 paddr = paddr & PAGE_MASK; 698 paddr_end = paddr + (npages << PAGE_SHIFT); 699 700 while (paddr < paddr_end) { 701 if (op == SNP_PAGE_STATE_SHARED) { 702 /* Page validation must be rescinded before changing to shared */ 703 ret = pvalidate(vaddr, RMP_PG_SIZE_4K, false); 704 if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret)) 705 goto e_term; 706 } 707 708 /* 709 * Use the MSR protocol because this function can be called before 710 * the GHCB is established. 
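		 *
		 * The exchange below encodes the GFN and the desired operation
		 * into the GHCB MSR via GHCB_MSR_PSC_REQ_GFN(), issues a
		 * VMGEXIT, and then reads the hypervisor's answer back from
		 * the same MSR, checking it with GHCB_RESP_CODE() and
		 * GHCB_MSR_PSC_RESP_VAL().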
711 */ 712 sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); 713 VMGEXIT(); 714 715 val = sev_es_rd_ghcb_msr(); 716 717 if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP, 718 "Wrong PSC response code: 0x%x\n", 719 (unsigned int)GHCB_RESP_CODE(val))) 720 goto e_term; 721 722 if (WARN(GHCB_MSR_PSC_RESP_VAL(val), 723 "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n", 724 op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared", 725 paddr, GHCB_MSR_PSC_RESP_VAL(val))) 726 goto e_term; 727 728 if (op == SNP_PAGE_STATE_PRIVATE) { 729 /* Page validation must be performed after changing to private */ 730 ret = pvalidate(vaddr, RMP_PG_SIZE_4K, true); 731 if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret)) 732 goto e_term; 733 } 734 735 vaddr += PAGE_SIZE; 736 paddr += PAGE_SIZE; 737 } 738 739 return; 740 741 e_term: 742 sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); 743 } 744 745 void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, 746 unsigned long npages) 747 { 748 /* 749 * This can be invoked in early boot while running identity mapped, so 750 * use an open coded check for SNP instead of using cc_platform_has(). 751 * This eliminates worries about jump tables or checking boot_cpu_data 752 * in the cc_platform_has() function. 753 */ 754 if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED)) 755 return; 756 757 /* 758 * Ask the hypervisor to mark the memory pages as private in the RMP 759 * table. 760 */ 761 early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE); 762 } 763 764 void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, 765 unsigned long npages) 766 { 767 /* 768 * This can be invoked in early boot while running identity mapped, so 769 * use an open coded check for SNP instead of using cc_platform_has(). 770 * This eliminates worries about jump tables or checking boot_cpu_data 771 * in the cc_platform_has() function. 772 */ 773 if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED)) 774 return; 775 776 /* Ask hypervisor to mark the memory pages shared in the RMP table. 
*/ 777 early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED); 778 } 779 780 static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, 781 unsigned long vaddr_end, int op) 782 { 783 struct ghcb_state state; 784 bool use_large_entry; 785 struct psc_hdr *hdr; 786 struct psc_entry *e; 787 unsigned long flags; 788 unsigned long pfn; 789 struct ghcb *ghcb; 790 int i; 791 792 hdr = &data->hdr; 793 e = data->entries; 794 795 memset(data, 0, sizeof(*data)); 796 i = 0; 797 798 while (vaddr < vaddr_end && i < ARRAY_SIZE(data->entries)) { 799 hdr->end_entry = i; 800 801 if (is_vmalloc_addr((void *)vaddr)) { 802 pfn = vmalloc_to_pfn((void *)vaddr); 803 use_large_entry = false; 804 } else { 805 pfn = __pa(vaddr) >> PAGE_SHIFT; 806 use_large_entry = true; 807 } 808 809 e->gfn = pfn; 810 e->operation = op; 811 812 if (use_large_entry && IS_ALIGNED(vaddr, PMD_SIZE) && 813 (vaddr_end - vaddr) >= PMD_SIZE) { 814 e->pagesize = RMP_PG_SIZE_2M; 815 vaddr += PMD_SIZE; 816 } else { 817 e->pagesize = RMP_PG_SIZE_4K; 818 vaddr += PAGE_SIZE; 819 } 820 821 e++; 822 i++; 823 } 824 825 /* Page validation must be rescinded before changing to shared */ 826 if (op == SNP_PAGE_STATE_SHARED) 827 pvalidate_pages(data); 828 829 local_irq_save(flags); 830 831 if (sev_cfg.ghcbs_initialized) 832 ghcb = __sev_get_ghcb(&state); 833 else 834 ghcb = boot_ghcb; 835 836 /* Invoke the hypervisor to perform the page state changes */ 837 if (!ghcb || vmgexit_psc(ghcb, data)) 838 sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); 839 840 if (sev_cfg.ghcbs_initialized) 841 __sev_put_ghcb(&state); 842 843 local_irq_restore(flags); 844 845 /* Page validation must be performed after changing to private */ 846 if (op == SNP_PAGE_STATE_PRIVATE) 847 pvalidate_pages(data); 848 849 return vaddr; 850 } 851 852 static void set_pages_state(unsigned long vaddr, unsigned long npages, int op) 853 { 854 struct snp_psc_desc desc; 855 unsigned long vaddr_end; 856 857 /* Use the MSR protocol when a GHCB is not available. */ 858 if (!boot_ghcb) 859 return early_set_pages_state(vaddr, __pa(vaddr), npages, op); 860 861 vaddr = vaddr & PAGE_MASK; 862 vaddr_end = vaddr + (npages << PAGE_SHIFT); 863 864 while (vaddr < vaddr_end) 865 vaddr = __set_pages_state(&desc, vaddr, vaddr_end, op); 866 } 867 868 void snp_set_memory_shared(unsigned long vaddr, unsigned long npages) 869 { 870 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 871 return; 872 873 set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED); 874 } 875 876 void snp_set_memory_private(unsigned long vaddr, unsigned long npages) 877 { 878 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 879 return; 880 881 set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); 882 } 883 884 void snp_accept_memory(phys_addr_t start, phys_addr_t end) 885 { 886 unsigned long vaddr, npages; 887 888 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 889 return; 890 891 vaddr = (unsigned long)__va(start); 892 npages = (end - start) >> PAGE_SHIFT; 893 894 set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); 895 } 896 897 static int snp_set_vmsa(void *va, bool vmsa) 898 { 899 u64 attrs; 900 901 /* 902 * Running at VMPL0 allows the kernel to change the VMSA bit for a page 903 * using the RMPADJUST instruction. However, for the instruction to 904 * succeed it must target the permissions of a lesser privileged 905 * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST 906 * instruction in the AMD64 APM Volume 3). 
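	 *
	 * Concretely, rmpadjust() is called below with attrs == 1 (target
	 * VMPL1), plus RMPADJUST_VMSA_PAGE_BIT when the page is being turned
	 * into a VMSA; without that bit the VMSA attribute is cleared again
	 * (see snp_cleanup_vmsa()).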
	 */
	attrs = 1;
	if (vmsa)
		attrs |= RMPADJUST_VMSA_PAGE_BIT;

	return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
}

#define __ATTR_BASE		(SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK)
#define INIT_CS_ATTRIBS		(__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK)
#define INIT_DS_ATTRIBS		(__ATTR_BASE | SVM_SELECTOR_WRITE_MASK)

#define INIT_LDTR_ATTRIBS	(SVM_SELECTOR_P_MASK | 2)
#define INIT_TR_ATTRIBS		(SVM_SELECTOR_P_MASK | 3)

static void *snp_alloc_vmsa_page(void)
{
	struct page *p;

	/*
	 * Allocate a VMSA page to work around the SNP erratum where the CPU
	 * will incorrectly signal an RMP violation #PF if a large page (2MB or
	 * 1GB) collides with the RMP entry of the VMSA page. The recommended
	 * workaround is to not use a large page.
	 *
	 * Allocate an 8k page which is also 8k-aligned.
	 */
	p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
	if (!p)
		return NULL;

	split_page(p, 1);

	/* Free the first 4k. This page may be 2M/1G aligned and cannot be used. */
	__free_page(p);

	return page_address(p + 1);
}

static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
{
	int err;

	err = snp_set_vmsa(vmsa, false);
	if (err)
		pr_err("clear VMSA page failed (%u), leaking page\n", err);
	else
		free_page((unsigned long)vmsa);
}

static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip)
{
	struct sev_es_save_area *cur_vmsa, *vmsa;
	struct ghcb_state state;
	unsigned long flags;
	struct ghcb *ghcb;
	u8 sipi_vector;
	int cpu, ret;
	u64 cr4;

	/*
	 * The hypervisor SNP feature support check has happened earlier, just check
	 * the AP_CREATION one here.
	 */
	if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION))
		return -EOPNOTSUPP;

	/*
	 * Verify the desired start IP against the known trampoline start IP
	 * to catch any future new trampolines that may be introduced that
	 * would require a new protected guest entry point.
	 */
	if (WARN_ONCE(start_ip != real_mode_header->trampoline_start,
		      "Unsupported SNP start_ip: %lx\n", start_ip))
		return -EINVAL;

	/* Override start_ip with known protected guest start IP */
	start_ip = real_mode_header->sev_es_trampoline_start;

	/* Find the logical CPU for the APIC ID */
	for_each_present_cpu(cpu) {
		if (arch_match_cpu_phys_id(cpu, apic_id))
			break;
	}
	if (cpu >= nr_cpu_ids)
		return -EINVAL;

	cur_vmsa = per_cpu(sev_vmsa, cpu);

	/*
	 * A new VMSA is created each time because there is no guarantee that
	 * the current VMSA is the kernel's or that the vCPU is not running. If
	 * an attempt were made to use the current VMSA with a running vCPU, a
	 * #VMEXIT of that vCPU would wipe out all of the settings being done
	 * here.
1002 */ 1003 vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page(); 1004 if (!vmsa) 1005 return -ENOMEM; 1006 1007 /* CR4 should maintain the MCE value */ 1008 cr4 = native_read_cr4() & X86_CR4_MCE; 1009 1010 /* Set the CS value based on the start_ip converted to a SIPI vector */ 1011 sipi_vector = (start_ip >> 12); 1012 vmsa->cs.base = sipi_vector << 12; 1013 vmsa->cs.limit = AP_INIT_CS_LIMIT; 1014 vmsa->cs.attrib = INIT_CS_ATTRIBS; 1015 vmsa->cs.selector = sipi_vector << 8; 1016 1017 /* Set the RIP value based on start_ip */ 1018 vmsa->rip = start_ip & 0xfff; 1019 1020 /* Set AP INIT defaults as documented in the APM */ 1021 vmsa->ds.limit = AP_INIT_DS_LIMIT; 1022 vmsa->ds.attrib = INIT_DS_ATTRIBS; 1023 vmsa->es = vmsa->ds; 1024 vmsa->fs = vmsa->ds; 1025 vmsa->gs = vmsa->ds; 1026 vmsa->ss = vmsa->ds; 1027 1028 vmsa->gdtr.limit = AP_INIT_GDTR_LIMIT; 1029 vmsa->ldtr.limit = AP_INIT_LDTR_LIMIT; 1030 vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS; 1031 vmsa->idtr.limit = AP_INIT_IDTR_LIMIT; 1032 vmsa->tr.limit = AP_INIT_TR_LIMIT; 1033 vmsa->tr.attrib = INIT_TR_ATTRIBS; 1034 1035 vmsa->cr4 = cr4; 1036 vmsa->cr0 = AP_INIT_CR0_DEFAULT; 1037 vmsa->dr7 = DR7_RESET_VALUE; 1038 vmsa->dr6 = AP_INIT_DR6_DEFAULT; 1039 vmsa->rflags = AP_INIT_RFLAGS_DEFAULT; 1040 vmsa->g_pat = AP_INIT_GPAT_DEFAULT; 1041 vmsa->xcr0 = AP_INIT_XCR0_DEFAULT; 1042 vmsa->mxcsr = AP_INIT_MXCSR_DEFAULT; 1043 vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT; 1044 vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT; 1045 1046 /* SVME must be set. */ 1047 vmsa->efer = EFER_SVME; 1048 1049 /* 1050 * Set the SNP-specific fields for this VMSA: 1051 * VMPL level 1052 * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits) 1053 */ 1054 vmsa->vmpl = 0; 1055 vmsa->sev_features = sev_status >> 2; 1056 1057 /* Switch the page over to a VMSA page now that it is initialized */ 1058 ret = snp_set_vmsa(vmsa, true); 1059 if (ret) { 1060 pr_err("set VMSA page failed (%u)\n", ret); 1061 free_page((unsigned long)vmsa); 1062 1063 return -EINVAL; 1064 } 1065 1066 /* Issue VMGEXIT AP Creation NAE event */ 1067 local_irq_save(flags); 1068 1069 ghcb = __sev_get_ghcb(&state); 1070 1071 vc_ghcb_invalidate(ghcb); 1072 ghcb_set_rax(ghcb, vmsa->sev_features); 1073 ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION); 1074 ghcb_set_sw_exit_info_1(ghcb, ((u64)apic_id << 32) | SVM_VMGEXIT_AP_CREATE); 1075 ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa)); 1076 1077 sev_es_wr_ghcb_msr(__pa(ghcb)); 1078 VMGEXIT(); 1079 1080 if (!ghcb_sw_exit_info_1_is_valid(ghcb) || 1081 lower_32_bits(ghcb->save.sw_exit_info_1)) { 1082 pr_err("SNP AP Creation error\n"); 1083 ret = -EINVAL; 1084 } 1085 1086 __sev_put_ghcb(&state); 1087 1088 local_irq_restore(flags); 1089 1090 /* Perform cleanup if there was an error */ 1091 if (ret) { 1092 snp_cleanup_vmsa(vmsa); 1093 vmsa = NULL; 1094 } 1095 1096 /* Free up any previous VMSA page */ 1097 if (cur_vmsa) 1098 snp_cleanup_vmsa(cur_vmsa); 1099 1100 /* Record the current VMSA page */ 1101 per_cpu(sev_vmsa, cpu) = vmsa; 1102 1103 return ret; 1104 } 1105 1106 void __init snp_set_wakeup_secondary_cpu(void) 1107 { 1108 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 1109 return; 1110 1111 /* 1112 * Always set this override if SNP is enabled. This makes it the 1113 * required method to start APs under SNP. If the hypervisor does 1114 * not support AP creation, then no APs will be started. 
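	 *
	 * In that case wakeup_cpu_via_vmgexit() above simply fails with
	 * -EOPNOTSUPP.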
1115 */ 1116 apic_update_callback(wakeup_secondary_cpu, wakeup_cpu_via_vmgexit); 1117 } 1118 1119 int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh) 1120 { 1121 u16 startup_cs, startup_ip; 1122 phys_addr_t jump_table_pa; 1123 u64 jump_table_addr; 1124 u16 __iomem *jump_table; 1125 1126 jump_table_addr = get_jump_table_addr(); 1127 1128 /* On UP guests there is no jump table so this is not a failure */ 1129 if (!jump_table_addr) 1130 return 0; 1131 1132 /* Check if AP Jump Table is page-aligned */ 1133 if (jump_table_addr & ~PAGE_MASK) 1134 return -EINVAL; 1135 1136 jump_table_pa = jump_table_addr & PAGE_MASK; 1137 1138 startup_cs = (u16)(rmh->trampoline_start >> 4); 1139 startup_ip = (u16)(rmh->sev_es_trampoline_start - 1140 rmh->trampoline_start); 1141 1142 jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE); 1143 if (!jump_table) 1144 return -EIO; 1145 1146 writew(startup_ip, &jump_table[0]); 1147 writew(startup_cs, &jump_table[1]); 1148 1149 iounmap(jump_table); 1150 1151 return 0; 1152 } 1153 1154 /* 1155 * This is needed by the OVMF UEFI firmware which will use whatever it finds in 1156 * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu 1157 * runtime GHCBs used by the kernel are also mapped in the EFI page-table. 1158 */ 1159 int __init sev_es_efi_map_ghcbs(pgd_t *pgd) 1160 { 1161 struct sev_es_runtime_data *data; 1162 unsigned long address, pflags; 1163 int cpu; 1164 u64 pfn; 1165 1166 if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) 1167 return 0; 1168 1169 pflags = _PAGE_NX | _PAGE_RW; 1170 1171 for_each_possible_cpu(cpu) { 1172 data = per_cpu(runtime_data, cpu); 1173 1174 address = __pa(&data->ghcb_page); 1175 pfn = address >> PAGE_SHIFT; 1176 1177 if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags)) 1178 return 1; 1179 } 1180 1181 return 0; 1182 } 1183 1184 static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) 1185 { 1186 struct pt_regs *regs = ctxt->regs; 1187 enum es_result ret; 1188 u64 exit_info_1; 1189 1190 /* Is it a WRMSR? */ 1191 exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0; 1192 1193 ghcb_set_rcx(ghcb, regs->cx); 1194 if (exit_info_1) { 1195 ghcb_set_rax(ghcb, regs->ax); 1196 ghcb_set_rdx(ghcb, regs->dx); 1197 } 1198 1199 ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0); 1200 1201 if ((ret == ES_OK) && (!exit_info_1)) { 1202 regs->ax = ghcb->save.rax; 1203 regs->dx = ghcb->save.rdx; 1204 } 1205 1206 return ret; 1207 } 1208 1209 static void snp_register_per_cpu_ghcb(void) 1210 { 1211 struct sev_es_runtime_data *data; 1212 struct ghcb *ghcb; 1213 1214 data = this_cpu_read(runtime_data); 1215 ghcb = &data->ghcb_page; 1216 1217 snp_register_ghcb_early(__pa(ghcb)); 1218 } 1219 1220 void setup_ghcb(void) 1221 { 1222 if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) 1223 return; 1224 1225 /* 1226 * Check whether the runtime #VC exception handler is active. It uses 1227 * the per-CPU GHCB page which is set up by sev_es_init_vc_handling(). 1228 * 1229 * If SNP is active, register the per-CPU GHCB page so that the runtime 1230 * exception handler can use it. 1231 */ 1232 if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) { 1233 if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 1234 snp_register_per_cpu_ghcb(); 1235 1236 sev_cfg.ghcbs_initialized = true; 1237 1238 return; 1239 } 1240 1241 /* 1242 * Make sure the hypervisor talks a supported protocol. 1243 * This gets called only in the BSP boot phase. 
1244 */ 1245 if (!sev_es_negotiate_protocol()) 1246 sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); 1247 1248 /* 1249 * Clear the boot_ghcb. The first exception comes in before the bss 1250 * section is cleared. 1251 */ 1252 memset(&boot_ghcb_page, 0, PAGE_SIZE); 1253 1254 /* Alright - Make the boot-ghcb public */ 1255 boot_ghcb = &boot_ghcb_page; 1256 1257 /* SNP guest requires that GHCB GPA must be registered. */ 1258 if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 1259 snp_register_ghcb_early(__pa(&boot_ghcb_page)); 1260 } 1261 1262 #ifdef CONFIG_HOTPLUG_CPU 1263 static void sev_es_ap_hlt_loop(void) 1264 { 1265 struct ghcb_state state; 1266 struct ghcb *ghcb; 1267 1268 ghcb = __sev_get_ghcb(&state); 1269 1270 while (true) { 1271 vc_ghcb_invalidate(ghcb); 1272 ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP); 1273 ghcb_set_sw_exit_info_1(ghcb, 0); 1274 ghcb_set_sw_exit_info_2(ghcb, 0); 1275 1276 sev_es_wr_ghcb_msr(__pa(ghcb)); 1277 VMGEXIT(); 1278 1279 /* Wakeup signal? */ 1280 if (ghcb_sw_exit_info_2_is_valid(ghcb) && 1281 ghcb->save.sw_exit_info_2) 1282 break; 1283 } 1284 1285 __sev_put_ghcb(&state); 1286 } 1287 1288 /* 1289 * Play_dead handler when running under SEV-ES. This is needed because 1290 * the hypervisor can't deliver an SIPI request to restart the AP. 1291 * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the 1292 * hypervisor wakes it up again. 1293 */ 1294 static void sev_es_play_dead(void) 1295 { 1296 play_dead_common(); 1297 1298 /* IRQs now disabled */ 1299 1300 sev_es_ap_hlt_loop(); 1301 1302 /* 1303 * If we get here, the VCPU was woken up again. Jump to CPU 1304 * startup code to get it back online. 1305 */ 1306 soft_restart_cpu(); 1307 } 1308 #else /* CONFIG_HOTPLUG_CPU */ 1309 #define sev_es_play_dead native_play_dead 1310 #endif /* CONFIG_HOTPLUG_CPU */ 1311 1312 #ifdef CONFIG_SMP 1313 static void __init sev_es_setup_play_dead(void) 1314 { 1315 smp_ops.play_dead = sev_es_play_dead; 1316 } 1317 #else 1318 static inline void sev_es_setup_play_dead(void) { } 1319 #endif 1320 1321 static void __init alloc_runtime_data(int cpu) 1322 { 1323 struct sev_es_runtime_data *data; 1324 1325 data = memblock_alloc(sizeof(*data), PAGE_SIZE); 1326 if (!data) 1327 panic("Can't allocate SEV-ES runtime data"); 1328 1329 per_cpu(runtime_data, cpu) = data; 1330 } 1331 1332 static void __init init_ghcb(int cpu) 1333 { 1334 struct sev_es_runtime_data *data; 1335 int err; 1336 1337 data = per_cpu(runtime_data, cpu); 1338 1339 err = early_set_memory_decrypted((unsigned long)&data->ghcb_page, 1340 sizeof(data->ghcb_page)); 1341 if (err) 1342 panic("Can't map GHCBs unencrypted"); 1343 1344 memset(&data->ghcb_page, 0, sizeof(data->ghcb_page)); 1345 1346 data->ghcb_active = false; 1347 data->backup_ghcb_active = false; 1348 } 1349 1350 void __init sev_es_init_vc_handling(void) 1351 { 1352 int cpu; 1353 1354 BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE); 1355 1356 if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) 1357 return; 1358 1359 if (!sev_es_check_cpu_features()) 1360 panic("SEV-ES CPU Features missing"); 1361 1362 /* 1363 * SNP is supported in v2 of the GHCB spec which mandates support for HV 1364 * features. 
1365 */ 1366 if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) { 1367 sev_hv_features = get_hv_features(); 1368 1369 if (!(sev_hv_features & GHCB_HV_FT_SNP)) 1370 sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); 1371 } 1372 1373 /* Initialize per-cpu GHCB pages */ 1374 for_each_possible_cpu(cpu) { 1375 alloc_runtime_data(cpu); 1376 init_ghcb(cpu); 1377 } 1378 1379 sev_es_setup_play_dead(); 1380 1381 /* Secondary CPUs use the runtime #VC handler */ 1382 initial_vc_handler = (unsigned long)kernel_exc_vmm_communication; 1383 } 1384 1385 static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) 1386 { 1387 int trapnr = ctxt->fi.vector; 1388 1389 if (trapnr == X86_TRAP_PF) 1390 native_write_cr2(ctxt->fi.cr2); 1391 1392 ctxt->regs->orig_ax = ctxt->fi.error_code; 1393 do_early_exception(ctxt->regs, trapnr); 1394 } 1395 1396 static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) 1397 { 1398 long *reg_array; 1399 int offset; 1400 1401 reg_array = (long *)ctxt->regs; 1402 offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs); 1403 1404 if (offset < 0) 1405 return NULL; 1406 1407 offset /= sizeof(long); 1408 1409 return reg_array + offset; 1410 } 1411 static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, 1412 unsigned int bytes, bool read) 1413 { 1414 u64 exit_code, exit_info_1, exit_info_2; 1415 unsigned long ghcb_pa = __pa(ghcb); 1416 enum es_result res; 1417 phys_addr_t paddr; 1418 void __user *ref; 1419 1420 ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs); 1421 if (ref == (void __user *)-1L) 1422 return ES_UNSUPPORTED; 1423 1424 exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; 1425 1426 res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr); 1427 if (res != ES_OK) { 1428 if (res == ES_EXCEPTION && !read) 1429 ctxt->fi.error_code |= X86_PF_WRITE; 1430 1431 return res; 1432 } 1433 1434 exit_info_1 = paddr; 1435 /* Can never be greater than 8 */ 1436 exit_info_2 = bytes; 1437 1438 ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); 1439 1440 return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); 1441 } 1442 1443 /* 1444 * The MOVS instruction has two memory operands, which raises the 1445 * problem that it is not known whether the access to the source or the 1446 * destination caused the #VC exception (and hence whether an MMIO read 1447 * or write operation needs to be emulated). 1448 * 1449 * Instead of playing games with walking page-tables and trying to guess 1450 * whether the source or destination is an MMIO range, split the move 1451 * into two operations, a read and a write with only one memory operand. 1452 * This will cause a nested #VC exception on the MMIO address which can 1453 * then be handled. 1454 * 1455 * This implementation has the benefit that it also supports MOVS where 1456 * source _and_ destination are MMIO regions. 1457 * 1458 * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a 1459 * rare operation. If it turns out to be a performance problem the split 1460 * operations can be moved to memcpy_fromio() and memcpy_toio(). 
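 *
 * Roughly, each invocation of the handler below performs one element of
 * the move:
 *
 *	vc_read_mem(ctxt, src, buffer, bytes);
 *	vc_write_mem(ctxt, dst, buffer, bytes);
 *	regs->si += off;
 *	regs->di += off;	(off is +/- bytes, depending on EFLAGS.DF)
 *	if (REP prefix) regs->cx--;
 *
 * and returns ES_OK when there is no REP prefix or RCX has reached zero,
 * otherwise ES_RETRY so that RIP is not advanced and the instruction is
 * re-executed for the next element.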
1461 */ 1462 static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, 1463 unsigned int bytes) 1464 { 1465 unsigned long ds_base, es_base; 1466 unsigned char *src, *dst; 1467 unsigned char buffer[8]; 1468 enum es_result ret; 1469 bool rep; 1470 int off; 1471 1472 ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS); 1473 es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); 1474 1475 if (ds_base == -1L || es_base == -1L) { 1476 ctxt->fi.vector = X86_TRAP_GP; 1477 ctxt->fi.error_code = 0; 1478 return ES_EXCEPTION; 1479 } 1480 1481 src = ds_base + (unsigned char *)ctxt->regs->si; 1482 dst = es_base + (unsigned char *)ctxt->regs->di; 1483 1484 ret = vc_read_mem(ctxt, src, buffer, bytes); 1485 if (ret != ES_OK) 1486 return ret; 1487 1488 ret = vc_write_mem(ctxt, dst, buffer, bytes); 1489 if (ret != ES_OK) 1490 return ret; 1491 1492 if (ctxt->regs->flags & X86_EFLAGS_DF) 1493 off = -bytes; 1494 else 1495 off = bytes; 1496 1497 ctxt->regs->si += off; 1498 ctxt->regs->di += off; 1499 1500 rep = insn_has_rep_prefix(&ctxt->insn); 1501 if (rep) 1502 ctxt->regs->cx -= 1; 1503 1504 if (!rep || ctxt->regs->cx == 0) 1505 return ES_OK; 1506 else 1507 return ES_RETRY; 1508 } 1509 1510 static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) 1511 { 1512 struct insn *insn = &ctxt->insn; 1513 enum insn_mmio_type mmio; 1514 unsigned int bytes = 0; 1515 enum es_result ret; 1516 u8 sign_byte; 1517 long *reg_data; 1518 1519 mmio = insn_decode_mmio(insn, &bytes); 1520 if (mmio == INSN_MMIO_DECODE_FAILED) 1521 return ES_DECODE_FAILED; 1522 1523 if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) { 1524 reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs); 1525 if (!reg_data) 1526 return ES_DECODE_FAILED; 1527 } 1528 1529 if (user_mode(ctxt->regs)) 1530 return ES_UNSUPPORTED; 1531 1532 switch (mmio) { 1533 case INSN_MMIO_WRITE: 1534 memcpy(ghcb->shared_buffer, reg_data, bytes); 1535 ret = vc_do_mmio(ghcb, ctxt, bytes, false); 1536 break; 1537 case INSN_MMIO_WRITE_IMM: 1538 memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); 1539 ret = vc_do_mmio(ghcb, ctxt, bytes, false); 1540 break; 1541 case INSN_MMIO_READ: 1542 ret = vc_do_mmio(ghcb, ctxt, bytes, true); 1543 if (ret) 1544 break; 1545 1546 /* Zero-extend for 32-bit operation */ 1547 if (bytes == 4) 1548 *reg_data = 0; 1549 1550 memcpy(reg_data, ghcb->shared_buffer, bytes); 1551 break; 1552 case INSN_MMIO_READ_ZERO_EXTEND: 1553 ret = vc_do_mmio(ghcb, ctxt, bytes, true); 1554 if (ret) 1555 break; 1556 1557 /* Zero extend based on operand size */ 1558 memset(reg_data, 0, insn->opnd_bytes); 1559 memcpy(reg_data, ghcb->shared_buffer, bytes); 1560 break; 1561 case INSN_MMIO_READ_SIGN_EXTEND: 1562 ret = vc_do_mmio(ghcb, ctxt, bytes, true); 1563 if (ret) 1564 break; 1565 1566 if (bytes == 1) { 1567 u8 *val = (u8 *)ghcb->shared_buffer; 1568 1569 sign_byte = (*val & 0x80) ? 0xff : 0x00; 1570 } else { 1571 u16 *val = (u16 *)ghcb->shared_buffer; 1572 1573 sign_byte = (*val & 0x8000) ? 
0xff : 0x00; 1574 } 1575 1576 /* Sign extend based on operand size */ 1577 memset(reg_data, sign_byte, insn->opnd_bytes); 1578 memcpy(reg_data, ghcb->shared_buffer, bytes); 1579 break; 1580 case INSN_MMIO_MOVS: 1581 ret = vc_handle_mmio_movs(ctxt, bytes); 1582 break; 1583 default: 1584 ret = ES_UNSUPPORTED; 1585 break; 1586 } 1587 1588 return ret; 1589 } 1590 1591 static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, 1592 struct es_em_ctxt *ctxt) 1593 { 1594 struct sev_es_runtime_data *data = this_cpu_read(runtime_data); 1595 long val, *reg = vc_insn_get_rm(ctxt); 1596 enum es_result ret; 1597 1598 if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) 1599 return ES_VMM_ERROR; 1600 1601 if (!reg) 1602 return ES_DECODE_FAILED; 1603 1604 val = *reg; 1605 1606 /* Upper 32 bits must be written as zeroes */ 1607 if (val >> 32) { 1608 ctxt->fi.vector = X86_TRAP_GP; 1609 ctxt->fi.error_code = 0; 1610 return ES_EXCEPTION; 1611 } 1612 1613 /* Clear out other reserved bits and set bit 10 */ 1614 val = (val & 0xffff23ffL) | BIT(10); 1615 1616 /* Early non-zero writes to DR7 are not supported */ 1617 if (!data && (val & ~DR7_RESET_VALUE)) 1618 return ES_UNSUPPORTED; 1619 1620 /* Using a value of 0 for ExitInfo1 means RAX holds the value */ 1621 ghcb_set_rax(ghcb, val); 1622 ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); 1623 if (ret != ES_OK) 1624 return ret; 1625 1626 if (data) 1627 data->dr7 = val; 1628 1629 return ES_OK; 1630 } 1631 1632 static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, 1633 struct es_em_ctxt *ctxt) 1634 { 1635 struct sev_es_runtime_data *data = this_cpu_read(runtime_data); 1636 long *reg = vc_insn_get_rm(ctxt); 1637 1638 if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) 1639 return ES_VMM_ERROR; 1640 1641 if (!reg) 1642 return ES_DECODE_FAILED; 1643 1644 if (data) 1645 *reg = data->dr7; 1646 else 1647 *reg = DR7_RESET_VALUE; 1648 1649 return ES_OK; 1650 } 1651 1652 static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, 1653 struct es_em_ctxt *ctxt) 1654 { 1655 return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); 1656 } 1657 1658 static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) 1659 { 1660 enum es_result ret; 1661 1662 ghcb_set_rcx(ghcb, ctxt->regs->cx); 1663 1664 ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); 1665 if (ret != ES_OK) 1666 return ret; 1667 1668 if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb))) 1669 return ES_VMM_ERROR; 1670 1671 ctxt->regs->ax = ghcb->save.rax; 1672 ctxt->regs->dx = ghcb->save.rdx; 1673 1674 return ES_OK; 1675 } 1676 1677 static enum es_result vc_handle_monitor(struct ghcb *ghcb, 1678 struct es_em_ctxt *ctxt) 1679 { 1680 /* 1681 * Treat it as a NOP and do not leak a physical address to the 1682 * hypervisor. 1683 */ 1684 return ES_OK; 1685 } 1686 1687 static enum es_result vc_handle_mwait(struct ghcb *ghcb, 1688 struct es_em_ctxt *ctxt) 1689 { 1690 /* Treat the same as MONITOR/MONITORX */ 1691 return ES_OK; 1692 } 1693 1694 static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, 1695 struct es_em_ctxt *ctxt) 1696 { 1697 enum es_result ret; 1698 1699 ghcb_set_rax(ghcb, ctxt->regs->ax); 1700 ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 
			3 : 0);

	if (x86_platform.hyper.sev_es_hcall_prepare)
		x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs);

	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0);
	if (ret != ES_OK)
		return ret;

	if (!ghcb_rax_is_valid(ghcb))
		return ES_VMM_ERROR;

	ctxt->regs->ax = ghcb->save.rax;

	/*
	 * Call sev_es_hcall_finish() after regs->ax is already set.
	 * This allows the hypervisor handler to overwrite it again if
	 * necessary.
	 */
	if (x86_platform.hyper.sev_es_hcall_finish &&
	    !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs))
		return ES_VMM_ERROR;

	return ES_OK;
}

static enum es_result vc_handle_trap_ac(struct ghcb *ghcb,
					struct es_em_ctxt *ctxt)
{
	/*
	 * Calling exc_alignment_check() directly does not work, because it
	 * enables IRQs and the GHCB is active. Forward the exception and call
	 * it later from vc_forward_exception().
	 */
	ctxt->fi.vector = X86_TRAP_AC;
	ctxt->fi.error_code = 0;
	return ES_EXCEPTION;
}

static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
					 struct ghcb *ghcb,
					 unsigned long exit_code)
{
	enum es_result result;

	switch (exit_code) {
	case SVM_EXIT_READ_DR7:
		result = vc_handle_dr7_read(ghcb, ctxt);
		break;
	case SVM_EXIT_WRITE_DR7:
		result = vc_handle_dr7_write(ghcb, ctxt);
		break;
	case SVM_EXIT_EXCP_BASE + X86_TRAP_AC:
		result = vc_handle_trap_ac(ghcb, ctxt);
		break;
	case SVM_EXIT_RDTSC:
	case SVM_EXIT_RDTSCP:
		result = vc_handle_rdtsc(ghcb, ctxt, exit_code);
		break;
	case SVM_EXIT_RDPMC:
		result = vc_handle_rdpmc(ghcb, ctxt);
		break;
	case SVM_EXIT_INVD:
		pr_err_ratelimited("#VC exception for INVD???
Seriously???\n"); 1764 result = ES_UNSUPPORTED; 1765 break; 1766 case SVM_EXIT_CPUID: 1767 result = vc_handle_cpuid(ghcb, ctxt); 1768 break; 1769 case SVM_EXIT_IOIO: 1770 result = vc_handle_ioio(ghcb, ctxt); 1771 break; 1772 case SVM_EXIT_MSR: 1773 result = vc_handle_msr(ghcb, ctxt); 1774 break; 1775 case SVM_EXIT_VMMCALL: 1776 result = vc_handle_vmmcall(ghcb, ctxt); 1777 break; 1778 case SVM_EXIT_WBINVD: 1779 result = vc_handle_wbinvd(ghcb, ctxt); 1780 break; 1781 case SVM_EXIT_MONITOR: 1782 result = vc_handle_monitor(ghcb, ctxt); 1783 break; 1784 case SVM_EXIT_MWAIT: 1785 result = vc_handle_mwait(ghcb, ctxt); 1786 break; 1787 case SVM_EXIT_NPF: 1788 result = vc_handle_mmio(ghcb, ctxt); 1789 break; 1790 default: 1791 /* 1792 * Unexpected #VC exception 1793 */ 1794 result = ES_UNSUPPORTED; 1795 } 1796 1797 return result; 1798 } 1799 1800 static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) 1801 { 1802 long error_code = ctxt->fi.error_code; 1803 int trapnr = ctxt->fi.vector; 1804 1805 ctxt->regs->orig_ax = ctxt->fi.error_code; 1806 1807 switch (trapnr) { 1808 case X86_TRAP_GP: 1809 exc_general_protection(ctxt->regs, error_code); 1810 break; 1811 case X86_TRAP_UD: 1812 exc_invalid_op(ctxt->regs); 1813 break; 1814 case X86_TRAP_PF: 1815 write_cr2(ctxt->fi.cr2); 1816 exc_page_fault(ctxt->regs, error_code); 1817 break; 1818 case X86_TRAP_AC: 1819 exc_alignment_check(ctxt->regs, error_code); 1820 break; 1821 default: 1822 pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n"); 1823 BUG(); 1824 } 1825 } 1826 1827 static __always_inline bool is_vc2_stack(unsigned long sp) 1828 { 1829 return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); 1830 } 1831 1832 static __always_inline bool vc_from_invalid_context(struct pt_regs *regs) 1833 { 1834 unsigned long sp, prev_sp; 1835 1836 sp = (unsigned long)regs; 1837 prev_sp = regs->sp; 1838 1839 /* 1840 * If the code was already executing on the VC2 stack when the #VC 1841 * happened, let it proceed to the normal handling routine. This way the 1842 * code executing on the VC2 stack can cause #VC exceptions to get handled. 
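	 *
	 * In other words, only report an invalid context when the new
	 * exception frame (sp) lies on the VC2 stack while the interrupted
	 * context (prev_sp) was not already running on it.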
1843 */ 1844 return is_vc2_stack(sp) && !is_vc2_stack(prev_sp); 1845 } 1846 1847 static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code) 1848 { 1849 struct ghcb_state state; 1850 struct es_em_ctxt ctxt; 1851 enum es_result result; 1852 struct ghcb *ghcb; 1853 bool ret = true; 1854 1855 ghcb = __sev_get_ghcb(&state); 1856 1857 vc_ghcb_invalidate(ghcb); 1858 result = vc_init_em_ctxt(&ctxt, regs, error_code); 1859 1860 if (result == ES_OK) 1861 result = vc_handle_exitcode(&ctxt, ghcb, error_code); 1862 1863 __sev_put_ghcb(&state); 1864 1865 /* Done - now check the result */ 1866 switch (result) { 1867 case ES_OK: 1868 vc_finish_insn(&ctxt); 1869 break; 1870 case ES_UNSUPPORTED: 1871 pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n", 1872 error_code, regs->ip); 1873 ret = false; 1874 break; 1875 case ES_VMM_ERROR: 1876 pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", 1877 error_code, regs->ip); 1878 ret = false; 1879 break; 1880 case ES_DECODE_FAILED: 1881 pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", 1882 error_code, regs->ip); 1883 ret = false; 1884 break; 1885 case ES_EXCEPTION: 1886 vc_forward_exception(&ctxt); 1887 break; 1888 case ES_RETRY: 1889 /* Nothing to do */ 1890 break; 1891 default: 1892 pr_emerg("Unknown result in %s():%d\n", __func__, result); 1893 /* 1894 * Emulating the instruction which caused the #VC exception 1895 * failed - can't continue so print debug information 1896 */ 1897 BUG(); 1898 } 1899 1900 return ret; 1901 } 1902 1903 static __always_inline bool vc_is_db(unsigned long error_code) 1904 { 1905 return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB; 1906 } 1907 1908 /* 1909 * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode 1910 * and will panic when an error happens. 1911 */ 1912 DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) 1913 { 1914 irqentry_state_t irq_state; 1915 1916 /* 1917 * With the current implementation it is always possible to switch to a 1918 * safe stack because #VC exceptions only happen at known places, like 1919 * intercepted instructions or accesses to MMIO areas/IO ports. They can 1920 * also happen with code instrumentation when the hypervisor intercepts 1921 * #DB, but the critical paths are forbidden to be instrumented, so #DB 1922 * exceptions currently also only happen in safe places. 1923 * 1924 * But keep this here in case the noinstr annotations are violated due 1925 * to bug elsewhere. 1926 */ 1927 if (unlikely(vc_from_invalid_context(regs))) { 1928 instrumentation_begin(); 1929 panic("Can't handle #VC exception from unsupported context\n"); 1930 instrumentation_end(); 1931 } 1932 1933 /* 1934 * Handle #DB before calling into !noinstr code to avoid recursive #DB. 1935 */ 1936 if (vc_is_db(error_code)) { 1937 exc_debug(regs); 1938 return; 1939 } 1940 1941 irq_state = irqentry_nmi_enter(regs); 1942 1943 instrumentation_begin(); 1944 1945 if (!vc_raw_handle_exception(regs, error_code)) { 1946 /* Show some debug info */ 1947 show_regs(regs); 1948 1949 /* Ask hypervisor to sev_es_terminate */ 1950 sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); 1951 1952 /* If that fails and we get here - just panic */ 1953 panic("Returned from Terminate-Request to Hypervisor\n"); 1954 } 1955 1956 instrumentation_end(); 1957 irqentry_nmi_exit(regs, irq_state); 1958 } 1959 1960 /* 1961 * Runtime #VC exception handler when raised from user mode. 
Runs in IRQ mode 1962 * and will kill the current task with SIGBUS when an error happens. 1963 */ 1964 DEFINE_IDTENTRY_VC_USER(exc_vmm_communication) 1965 { 1966 /* 1967 * Handle #DB before calling into !noinstr code to avoid recursive #DB. 1968 */ 1969 if (vc_is_db(error_code)) { 1970 noist_exc_debug(regs); 1971 return; 1972 } 1973 1974 irqentry_enter_from_user_mode(regs); 1975 instrumentation_begin(); 1976 1977 if (!vc_raw_handle_exception(regs, error_code)) { 1978 /* 1979 * Do not kill the machine if user-space triggered the 1980 * exception. Send SIGBUS instead and let user-space deal with 1981 * it. 1982 */ 1983 force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); 1984 } 1985 1986 instrumentation_end(); 1987 irqentry_exit_to_user_mode(regs); 1988 } 1989 1990 bool __init handle_vc_boot_ghcb(struct pt_regs *regs) 1991 { 1992 unsigned long exit_code = regs->orig_ax; 1993 struct es_em_ctxt ctxt; 1994 enum es_result result; 1995 1996 vc_ghcb_invalidate(boot_ghcb); 1997 1998 result = vc_init_em_ctxt(&ctxt, regs, exit_code); 1999 if (result == ES_OK) 2000 result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code); 2001 2002 /* Done - now check the result */ 2003 switch (result) { 2004 case ES_OK: 2005 vc_finish_insn(&ctxt); 2006 break; 2007 case ES_UNSUPPORTED: 2008 early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", 2009 exit_code, regs->ip); 2010 goto fail; 2011 case ES_VMM_ERROR: 2012 early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", 2013 exit_code, regs->ip); 2014 goto fail; 2015 case ES_DECODE_FAILED: 2016 early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", 2017 exit_code, regs->ip); 2018 goto fail; 2019 case ES_EXCEPTION: 2020 vc_early_forward_exception(&ctxt); 2021 break; 2022 case ES_RETRY: 2023 /* Nothing to do */ 2024 break; 2025 default: 2026 BUG(); 2027 } 2028 2029 return true; 2030 2031 fail: 2032 show_regs(regs); 2033 2034 sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); 2035 } 2036 2037 /* 2038 * Initial set up of SNP relies on information provided by the 2039 * Confidential Computing blob, which can be passed to the kernel 2040 * in the following ways, depending on how it is booted: 2041 * 2042 * - when booted via the boot/decompress kernel: 2043 * - via boot_params 2044 * 2045 * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH): 2046 * - via a setup_data entry, as defined by the Linux Boot Protocol 2047 * 2048 * Scan for the blob in that order. 2049 */ 2050 static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) 2051 { 2052 struct cc_blob_sev_info *cc_info; 2053 2054 /* Boot kernel would have passed the CC blob via boot_params. */ 2055 if (bp->cc_blob_address) { 2056 cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address; 2057 goto found_cc_info; 2058 } 2059 2060 /* 2061 * If kernel was booted directly, without the use of the 2062 * boot/decompression kernel, the CC blob may have been passed via 2063 * setup_data instead. 
2064 */ 2065 cc_info = find_cc_blob_setup_data(bp); 2066 if (!cc_info) 2067 return NULL; 2068 2069 found_cc_info: 2070 if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) 2071 snp_abort(); 2072 2073 return cc_info; 2074 } 2075 2076 bool __head snp_init(struct boot_params *bp) 2077 { 2078 struct cc_blob_sev_info *cc_info; 2079 2080 if (!bp) 2081 return false; 2082 2083 cc_info = find_cc_blob(bp); 2084 if (!cc_info) 2085 return false; 2086 2087 setup_cpuid_table(cc_info); 2088 2089 /* 2090 * The CC blob will be used later to access the secrets page. Cache 2091 * it here like the boot kernel does. 2092 */ 2093 bp->cc_blob_address = (u32)(unsigned long)cc_info; 2094 2095 return true; 2096 } 2097 2098 void __head __noreturn snp_abort(void) 2099 { 2100 sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); 2101 } 2102 2103 /* 2104 * SEV-SNP guests should only execute dmi_setup() if EFI_CONFIG_TABLES are 2105 * enabled, as the alternative (fallback) logic for DMI probing in the legacy 2106 * ROM region can cause a crash since this region is not pre-validated. 2107 */ 2108 void __init snp_dmi_setup(void) 2109 { 2110 if (efi_enabled(EFI_CONFIG_TABLES)) 2111 dmi_setup(); 2112 } 2113 2114 static void dump_cpuid_table(void) 2115 { 2116 const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); 2117 int i = 0; 2118 2119 pr_info("count=%d reserved=0x%x reserved2=0x%llx\n", 2120 cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2); 2121 2122 for (i = 0; i < SNP_CPUID_COUNT_MAX; i++) { 2123 const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; 2124 2125 pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n", 2126 i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx, 2127 fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved); 2128 } 2129 } 2130 2131 /* 2132 * It is useful from an auditing/testing perspective to provide an easy way 2133 * for the guest owner to know that the CPUID table has been initialized as 2134 * expected, but that initialization happens too early in boot to print any 2135 * sort of indicator, and there's not really any other good place to do it, 2136 * so do it here. 2137 */ 2138 static int __init report_cpuid_table(void) 2139 { 2140 const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); 2141 2142 if (!cpuid_table->count) 2143 return 0; 2144 2145 pr_info("Using SNP CPUID table, %d entries present.\n", 2146 cpuid_table->count); 2147 2148 if (sev_cfg.debug) 2149 dump_cpuid_table(); 2150 2151 return 0; 2152 } 2153 arch_initcall(report_cpuid_table); 2154 2155 static int __init init_sev_config(char *str) 2156 { 2157 char *s; 2158 2159 while ((s = strsep(&str, ","))) { 2160 if (!strcmp(s, "debug")) { 2161 sev_cfg.debug = true; 2162 continue; 2163 } 2164 2165 pr_info("SEV command-line option '%s' was not recognized\n", s); 2166 } 2167 2168 return 1; 2169 } 2170 __setup("sev=", init_sev_config); 2171 2172 int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio) 2173 { 2174 struct ghcb_state state; 2175 struct es_em_ctxt ctxt; 2176 unsigned long flags; 2177 struct ghcb *ghcb; 2178 int ret; 2179 2180 rio->exitinfo2 = SEV_RET_NO_FW_CALL; 2181 2182 /* 2183 * __sev_get_ghcb() needs to run with IRQs disabled because it is using 2184 * a per-CPU GHCB. 
2185 */ 2186 local_irq_save(flags); 2187 2188 ghcb = __sev_get_ghcb(&state); 2189 if (!ghcb) { 2190 ret = -EIO; 2191 goto e_restore_irq; 2192 } 2193 2194 vc_ghcb_invalidate(ghcb); 2195 2196 if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { 2197 ghcb_set_rax(ghcb, input->data_gpa); 2198 ghcb_set_rbx(ghcb, input->data_npages); 2199 } 2200 2201 ret = sev_es_ghcb_hv_call(ghcb, &ctxt, exit_code, input->req_gpa, input->resp_gpa); 2202 if (ret) 2203 goto e_put; 2204 2205 rio->exitinfo2 = ghcb->save.sw_exit_info_2; 2206 switch (rio->exitinfo2) { 2207 case 0: 2208 break; 2209 2210 case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_BUSY): 2211 ret = -EAGAIN; 2212 break; 2213 2214 case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN): 2215 /* Number of expected pages are returned in RBX */ 2216 if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { 2217 input->data_npages = ghcb_get_rbx(ghcb); 2218 ret = -ENOSPC; 2219 break; 2220 } 2221 fallthrough; 2222 default: 2223 ret = -EIO; 2224 break; 2225 } 2226 2227 e_put: 2228 __sev_put_ghcb(&state); 2229 e_restore_irq: 2230 local_irq_restore(flags); 2231 2232 return ret; 2233 } 2234 EXPORT_SYMBOL_GPL(snp_issue_guest_request); 2235 2236 static struct platform_device sev_guest_device = { 2237 .name = "sev-guest", 2238 .id = -1, 2239 }; 2240 2241 static int __init snp_init_platform_device(void) 2242 { 2243 struct sev_guest_platform_data data; 2244 u64 gpa; 2245 2246 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 2247 return -ENODEV; 2248 2249 gpa = get_secrets_page(); 2250 if (!gpa) 2251 return -ENODEV; 2252 2253 data.secrets_gpa = gpa; 2254 if (platform_device_add_data(&sev_guest_device, &data, sizeof(data))) 2255 return -ENODEV; 2256 2257 if (platform_device_register(&sev_guest_device)) 2258 return -ENODEV; 2259 2260 pr_info("SNP guest platform device initialized.\n"); 2261 return 0; 2262 } 2263 device_initcall(snp_init_platform_device); 2264