1 /* 2 * AMD Memory Encryption Support 3 * 4 * Copyright (C) 2016 Advanced Micro Devices, Inc. 5 * 6 * Author: Tom Lendacky <thomas.lendacky@amd.com> 7 * 8 * This program is free software; you can redistribute it and/or modify 9 * it under the terms of the GNU General Public License version 2 as 10 * published by the Free Software Foundation. 11 */ 12 13 #define DISABLE_BRANCH_PROFILING 14 15 #include <linux/linkage.h> 16 #include <linux/init.h> 17 #include <linux/mm.h> 18 #include <linux/dma-direct.h> 19 #include <linux/swiotlb.h> 20 #include <linux/mem_encrypt.h> 21 22 #include <asm/tlbflush.h> 23 #include <asm/fixmap.h> 24 #include <asm/setup.h> 25 #include <asm/bootparam.h> 26 #include <asm/set_memory.h> 27 #include <asm/cacheflush.h> 28 #include <asm/sections.h> 29 #include <asm/processor-flags.h> 30 #include <asm/msr.h> 31 #include <asm/cmdline.h> 32 33 #include "mm_internal.h" 34 35 static char sme_cmdline_arg[] __initdata = "mem_encrypt"; 36 static char sme_cmdline_on[] __initdata = "on"; 37 static char sme_cmdline_off[] __initdata = "off"; 38 39 /* 40 * Since SME related variables are set early in the boot process they must 41 * reside in the .data section so as not to be zeroed out when the .bss 42 * section is later cleared. 43 */ 44 u64 sme_me_mask __section(.data) = 0; 45 EXPORT_SYMBOL(sme_me_mask); 46 DEFINE_STATIC_KEY_FALSE(sev_enable_key); 47 EXPORT_SYMBOL_GPL(sev_enable_key); 48 49 static bool sev_enabled __section(.data); 50 51 /* Buffer used for early in-place encryption by BSP, no locking needed */ 52 static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); 53 54 /* 55 * This routine does not change the underlying encryption setting of the 56 * page(s) that map this memory. It assumes that eventually the memory is 57 * meant to be accessed as either encrypted or decrypted but the contents 58 * are currently not in the desired state. 59 * 60 * This routine follows the steps outlined in the AMD64 Architecture 61 * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place. 62 */ 63 static void __init __sme_early_enc_dec(resource_size_t paddr, 64 unsigned long size, bool enc) 65 { 66 void *src, *dst; 67 size_t len; 68 69 if (!sme_me_mask) 70 return; 71 72 wbinvd(); 73 74 /* 75 * There are limited number of early mapping slots, so map (at most) 76 * one page at time. 77 */ 78 while (size) { 79 len = min_t(size_t, sizeof(sme_early_buffer), size); 80 81 /* 82 * Create mappings for the current and desired format of 83 * the memory. Use a write-protected mapping for the source. 84 */ 85 src = enc ? early_memremap_decrypted_wp(paddr, len) : 86 early_memremap_encrypted_wp(paddr, len); 87 88 dst = enc ? early_memremap_encrypted(paddr, len) : 89 early_memremap_decrypted(paddr, len); 90 91 /* 92 * If a mapping can't be obtained to perform the operation, 93 * then eventual access of that area in the desired mode 94 * will cause a crash. 95 */ 96 BUG_ON(!src || !dst); 97 98 /* 99 * Use a temporary buffer, of cache-line multiple size, to 100 * avoid data corruption as documented in the APM. 101 */ 102 memcpy(sme_early_buffer, src, len); 103 memcpy(dst, sme_early_buffer, len); 104 105 early_memunmap(dst, len); 106 early_memunmap(src, len); 107 108 paddr += len; 109 size -= len; 110 } 111 } 112 113 void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) 114 { 115 __sme_early_enc_dec(paddr, size, true); 116 } 117 118 void __init sme_early_decrypt(resource_size_t paddr, unsigned long size) 119 { 120 __sme_early_enc_dec(paddr, size, false); 121 } 122 123 static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size, 124 bool map) 125 { 126 unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET; 127 pmdval_t pmd_flags, pmd; 128 129 /* Use early_pmd_flags but remove the encryption mask */ 130 pmd_flags = __sme_clr(early_pmd_flags); 131 132 do { 133 pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0; 134 __early_make_pgtable((unsigned long)vaddr, pmd); 135 136 vaddr += PMD_SIZE; 137 paddr += PMD_SIZE; 138 size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE; 139 } while (size); 140 141 __native_flush_tlb(); 142 } 143 144 void __init sme_unmap_bootdata(char *real_mode_data) 145 { 146 struct boot_params *boot_data; 147 unsigned long cmdline_paddr; 148 149 if (!sme_active()) 150 return; 151 152 /* Get the command line address before unmapping the real_mode_data */ 153 boot_data = (struct boot_params *)real_mode_data; 154 cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); 155 156 __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false); 157 158 if (!cmdline_paddr) 159 return; 160 161 __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false); 162 } 163 164 void __init sme_map_bootdata(char *real_mode_data) 165 { 166 struct boot_params *boot_data; 167 unsigned long cmdline_paddr; 168 169 if (!sme_active()) 170 return; 171 172 __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true); 173 174 /* Get the command line address after mapping the real_mode_data */ 175 boot_data = (struct boot_params *)real_mode_data; 176 cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); 177 178 if (!cmdline_paddr) 179 return; 180 181 __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true); 182 } 183 184 void __init sme_early_init(void) 185 { 186 unsigned int i; 187 188 if (!sme_me_mask) 189 return; 190 191 early_pmd_flags = __sme_set(early_pmd_flags); 192 193 __supported_pte_mask = __sme_set(__supported_pte_mask); 194 195 /* Update the protection map with memory encryption mask */ 196 for (i = 0; i < ARRAY_SIZE(protection_map); i++) 197 protection_map[i] = pgprot_encrypted(protection_map[i]); 198 199 if (sev_active()) 200 swiotlb_force = SWIOTLB_FORCE; 201 } 202 203 static void *sev_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, 204 gfp_t gfp, unsigned long attrs) 205 { 206 unsigned long dma_mask; 207 unsigned int order; 208 struct page *page; 209 void *vaddr = NULL; 210 211 dma_mask = dma_alloc_coherent_mask(dev, gfp); 212 order = get_order(size); 213 214 /* 215 * Memory will be memset to zero after marking decrypted, so don't 216 * bother clearing it before. 217 */ 218 gfp &= ~__GFP_ZERO; 219 220 page = alloc_pages_node(dev_to_node(dev), gfp, order); 221 if (page) { 222 dma_addr_t addr; 223 224 /* 225 * Since we will be clearing the encryption bit, check the 226 * mask with it already cleared. 227 */ 228 addr = __sme_clr(phys_to_dma(dev, page_to_phys(page))); 229 if ((addr + size) > dma_mask) { 230 __free_pages(page, get_order(size)); 231 } else { 232 vaddr = page_address(page); 233 *dma_handle = addr; 234 } 235 } 236 237 if (!vaddr) 238 vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, gfp); 239 240 if (!vaddr) 241 return NULL; 242 243 /* Clear the SME encryption bit for DMA use if not swiotlb area */ 244 if (!is_swiotlb_buffer(dma_to_phys(dev, *dma_handle))) { 245 set_memory_decrypted((unsigned long)vaddr, 1 << order); 246 memset(vaddr, 0, PAGE_SIZE << order); 247 *dma_handle = __sme_clr(*dma_handle); 248 } 249 250 return vaddr; 251 } 252 253 static void sev_free(struct device *dev, size_t size, void *vaddr, 254 dma_addr_t dma_handle, unsigned long attrs) 255 { 256 /* Set the SME encryption bit for re-use if not swiotlb area */ 257 if (!is_swiotlb_buffer(dma_to_phys(dev, dma_handle))) 258 set_memory_encrypted((unsigned long)vaddr, 259 1 << get_order(size)); 260 261 swiotlb_free_coherent(dev, size, vaddr, dma_handle); 262 } 263 264 static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) 265 { 266 pgprot_t old_prot, new_prot; 267 unsigned long pfn, pa, size; 268 pte_t new_pte; 269 270 switch (level) { 271 case PG_LEVEL_4K: 272 pfn = pte_pfn(*kpte); 273 old_prot = pte_pgprot(*kpte); 274 break; 275 case PG_LEVEL_2M: 276 pfn = pmd_pfn(*(pmd_t *)kpte); 277 old_prot = pmd_pgprot(*(pmd_t *)kpte); 278 break; 279 case PG_LEVEL_1G: 280 pfn = pud_pfn(*(pud_t *)kpte); 281 old_prot = pud_pgprot(*(pud_t *)kpte); 282 break; 283 default: 284 return; 285 } 286 287 new_prot = old_prot; 288 if (enc) 289 pgprot_val(new_prot) |= _PAGE_ENC; 290 else 291 pgprot_val(new_prot) &= ~_PAGE_ENC; 292 293 /* If prot is same then do nothing. */ 294 if (pgprot_val(old_prot) == pgprot_val(new_prot)) 295 return; 296 297 pa = pfn << page_level_shift(level); 298 size = page_level_size(level); 299 300 /* 301 * We are going to perform in-place en-/decryption and change the 302 * physical page attribute from C=1 to C=0 or vice versa. Flush the 303 * caches to ensure that data gets accessed with the correct C-bit. 304 */ 305 clflush_cache_range(__va(pa), size); 306 307 /* Encrypt/decrypt the contents in-place */ 308 if (enc) 309 sme_early_encrypt(pa, size); 310 else 311 sme_early_decrypt(pa, size); 312 313 /* Change the page encryption mask. */ 314 new_pte = pfn_pte(pfn, new_prot); 315 set_pte_atomic(kpte, new_pte); 316 } 317 318 static int __init early_set_memory_enc_dec(unsigned long vaddr, 319 unsigned long size, bool enc) 320 { 321 unsigned long vaddr_end, vaddr_next; 322 unsigned long psize, pmask; 323 int split_page_size_mask; 324 int level, ret; 325 pte_t *kpte; 326 327 vaddr_next = vaddr; 328 vaddr_end = vaddr + size; 329 330 for (; vaddr < vaddr_end; vaddr = vaddr_next) { 331 kpte = lookup_address(vaddr, &level); 332 if (!kpte || pte_none(*kpte)) { 333 ret = 1; 334 goto out; 335 } 336 337 if (level == PG_LEVEL_4K) { 338 __set_clr_pte_enc(kpte, level, enc); 339 vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE; 340 continue; 341 } 342 343 psize = page_level_size(level); 344 pmask = page_level_mask(level); 345 346 /* 347 * Check whether we can change the large page in one go. 348 * We request a split when the address is not aligned and 349 * the number of pages to set/clear encryption bit is smaller 350 * than the number of pages in the large page. 351 */ 352 if (vaddr == (vaddr & pmask) && 353 ((vaddr_end - vaddr) >= psize)) { 354 __set_clr_pte_enc(kpte, level, enc); 355 vaddr_next = (vaddr & pmask) + psize; 356 continue; 357 } 358 359 /* 360 * The virtual address is part of a larger page, create the next 361 * level page table mapping (4K or 2M). If it is part of a 2M 362 * page then we request a split of the large page into 4K 363 * chunks. A 1GB large page is split into 2M pages, resp. 364 */ 365 if (level == PG_LEVEL_2M) 366 split_page_size_mask = 0; 367 else 368 split_page_size_mask = 1 << PG_LEVEL_2M; 369 370 kernel_physical_mapping_init(__pa(vaddr & pmask), 371 __pa((vaddr_end & pmask) + psize), 372 split_page_size_mask); 373 } 374 375 ret = 0; 376 377 out: 378 __flush_tlb_all(); 379 return ret; 380 } 381 382 int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size) 383 { 384 return early_set_memory_enc_dec(vaddr, size, false); 385 } 386 387 int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) 388 { 389 return early_set_memory_enc_dec(vaddr, size, true); 390 } 391 392 /* 393 * SME and SEV are very similar but they are not the same, so there are 394 * times that the kernel will need to distinguish between SME and SEV. The 395 * sme_active() and sev_active() functions are used for this. When a 396 * distinction isn't needed, the mem_encrypt_active() function can be used. 397 * 398 * The trampoline code is a good example for this requirement. Before 399 * paging is activated, SME will access all memory as decrypted, but SEV 400 * will access all memory as encrypted. So, when APs are being brought 401 * up under SME the trampoline area cannot be encrypted, whereas under SEV 402 * the trampoline area must be encrypted. 403 */ 404 bool sme_active(void) 405 { 406 return sme_me_mask && !sev_enabled; 407 } 408 EXPORT_SYMBOL(sme_active); 409 410 bool sev_active(void) 411 { 412 return sme_me_mask && sev_enabled; 413 } 414 EXPORT_SYMBOL(sev_active); 415 416 static const struct dma_map_ops sev_dma_ops = { 417 .alloc = sev_alloc, 418 .free = sev_free, 419 .map_page = swiotlb_map_page, 420 .unmap_page = swiotlb_unmap_page, 421 .map_sg = swiotlb_map_sg_attrs, 422 .unmap_sg = swiotlb_unmap_sg_attrs, 423 .sync_single_for_cpu = swiotlb_sync_single_for_cpu, 424 .sync_single_for_device = swiotlb_sync_single_for_device, 425 .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, 426 .sync_sg_for_device = swiotlb_sync_sg_for_device, 427 .mapping_error = swiotlb_dma_mapping_error, 428 }; 429 430 /* Architecture __weak replacement functions */ 431 void __init mem_encrypt_init(void) 432 { 433 if (!sme_me_mask) 434 return; 435 436 /* Call into SWIOTLB to update the SWIOTLB DMA buffers */ 437 swiotlb_update_mem_attributes(); 438 439 /* 440 * With SEV, DMA operations cannot use encryption. New DMA ops 441 * are required in order to mark the DMA areas as decrypted or 442 * to use bounce buffers. 443 */ 444 if (sev_active()) 445 dma_ops = &sev_dma_ops; 446 447 /* 448 * With SEV, we need to unroll the rep string I/O instructions. 449 */ 450 if (sev_active()) 451 static_branch_enable(&sev_enable_key); 452 453 pr_info("AMD %s active\n", 454 sev_active() ? "Secure Encrypted Virtualization (SEV)" 455 : "Secure Memory Encryption (SME)"); 456 } 457 458 void swiotlb_set_mem_attributes(void *vaddr, unsigned long size) 459 { 460 WARN(PAGE_ALIGN(size) != size, 461 "size is not page-aligned (%#lx)\n", size); 462 463 /* Make the SWIOTLB buffer area decrypted */ 464 set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT); 465 } 466 467 struct sme_populate_pgd_data { 468 void *pgtable_area; 469 pgd_t *pgd; 470 471 pmdval_t pmd_flags; 472 pteval_t pte_flags; 473 unsigned long paddr; 474 475 unsigned long vaddr; 476 unsigned long vaddr_end; 477 }; 478 479 static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) 480 { 481 unsigned long pgd_start, pgd_end, pgd_size; 482 pgd_t *pgd_p; 483 484 pgd_start = ppd->vaddr & PGDIR_MASK; 485 pgd_end = ppd->vaddr_end & PGDIR_MASK; 486 487 pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t); 488 489 pgd_p = ppd->pgd + pgd_index(ppd->vaddr); 490 491 memset(pgd_p, 0, pgd_size); 492 } 493 494 #define PGD_FLAGS _KERNPG_TABLE_NOENC 495 #define P4D_FLAGS _KERNPG_TABLE_NOENC 496 #define PUD_FLAGS _KERNPG_TABLE_NOENC 497 #define PMD_FLAGS _KERNPG_TABLE_NOENC 498 499 #define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) 500 501 #define PMD_FLAGS_DEC PMD_FLAGS_LARGE 502 #define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ 503 (_PAGE_PAT | _PAGE_PWT)) 504 505 #define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC) 506 507 #define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL) 508 509 #define PTE_FLAGS_DEC PTE_FLAGS 510 #define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ 511 (_PAGE_PAT | _PAGE_PWT)) 512 513 #define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC) 514 515 static pmd_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) 516 { 517 pgd_t *pgd_p; 518 p4d_t *p4d_p; 519 pud_t *pud_p; 520 pmd_t *pmd_p; 521 522 pgd_p = ppd->pgd + pgd_index(ppd->vaddr); 523 if (native_pgd_val(*pgd_p)) { 524 if (IS_ENABLED(CONFIG_X86_5LEVEL)) 525 p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); 526 else 527 pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); 528 } else { 529 pgd_t pgd; 530 531 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 532 p4d_p = ppd->pgtable_area; 533 memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); 534 ppd->pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D; 535 536 pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS); 537 } else { 538 pud_p = ppd->pgtable_area; 539 memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); 540 ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; 541 542 pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS); 543 } 544 native_set_pgd(pgd_p, pgd); 545 } 546 547 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 548 p4d_p += p4d_index(ppd->vaddr); 549 if (native_p4d_val(*p4d_p)) { 550 pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK); 551 } else { 552 p4d_t p4d; 553 554 pud_p = ppd->pgtable_area; 555 memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); 556 ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; 557 558 p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS); 559 native_set_p4d(p4d_p, p4d); 560 } 561 } 562 563 pud_p += pud_index(ppd->vaddr); 564 if (native_pud_val(*pud_p)) { 565 if (native_pud_val(*pud_p) & _PAGE_PSE) 566 return NULL; 567 568 pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK); 569 } else { 570 pud_t pud; 571 572 pmd_p = ppd->pgtable_area; 573 memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); 574 ppd->pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD; 575 576 pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS); 577 native_set_pud(pud_p, pud); 578 } 579 580 return pmd_p; 581 } 582 583 static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) 584 { 585 pmd_t *pmd_p; 586 587 pmd_p = sme_prepare_pgd(ppd); 588 if (!pmd_p) 589 return; 590 591 pmd_p += pmd_index(ppd->vaddr); 592 if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE)) 593 native_set_pmd(pmd_p, native_make_pmd(ppd->paddr | ppd->pmd_flags)); 594 } 595 596 static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) 597 { 598 pmd_t *pmd_p; 599 pte_t *pte_p; 600 601 pmd_p = sme_prepare_pgd(ppd); 602 if (!pmd_p) 603 return; 604 605 pmd_p += pmd_index(ppd->vaddr); 606 if (native_pmd_val(*pmd_p)) { 607 if (native_pmd_val(*pmd_p) & _PAGE_PSE) 608 return; 609 610 pte_p = (pte_t *)(native_pmd_val(*pmd_p) & ~PTE_FLAGS_MASK); 611 } else { 612 pmd_t pmd; 613 614 pte_p = ppd->pgtable_area; 615 memset(pte_p, 0, sizeof(*pte_p) * PTRS_PER_PTE); 616 ppd->pgtable_area += sizeof(*pte_p) * PTRS_PER_PTE; 617 618 pmd = native_make_pmd((pteval_t)pte_p + PMD_FLAGS); 619 native_set_pmd(pmd_p, pmd); 620 } 621 622 pte_p += pte_index(ppd->vaddr); 623 if (!native_pte_val(*pte_p)) 624 native_set_pte(pte_p, native_make_pte(ppd->paddr | ppd->pte_flags)); 625 } 626 627 static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) 628 { 629 while (ppd->vaddr < ppd->vaddr_end) { 630 sme_populate_pgd_large(ppd); 631 632 ppd->vaddr += PMD_PAGE_SIZE; 633 ppd->paddr += PMD_PAGE_SIZE; 634 } 635 } 636 637 static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd) 638 { 639 while (ppd->vaddr < ppd->vaddr_end) { 640 sme_populate_pgd(ppd); 641 642 ppd->vaddr += PAGE_SIZE; 643 ppd->paddr += PAGE_SIZE; 644 } 645 } 646 647 static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, 648 pmdval_t pmd_flags, pteval_t pte_flags) 649 { 650 unsigned long vaddr_end; 651 652 ppd->pmd_flags = pmd_flags; 653 ppd->pte_flags = pte_flags; 654 655 /* Save original end value since we modify the struct value */ 656 vaddr_end = ppd->vaddr_end; 657 658 /* If start is not 2MB aligned, create PTE entries */ 659 ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE); 660 __sme_map_range_pte(ppd); 661 662 /* Create PMD entries */ 663 ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK; 664 __sme_map_range_pmd(ppd); 665 666 /* If end is not 2MB aligned, create PTE entries */ 667 ppd->vaddr_end = vaddr_end; 668 __sme_map_range_pte(ppd); 669 } 670 671 static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) 672 { 673 __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); 674 } 675 676 static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) 677 { 678 __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); 679 } 680 681 static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) 682 { 683 __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP); 684 } 685 686 static unsigned long __init sme_pgtable_calc(unsigned long len) 687 { 688 unsigned long p4d_size, pud_size, pmd_size, pte_size; 689 unsigned long total; 690 691 /* 692 * Perform a relatively simplistic calculation of the pagetable 693 * entries that are needed. Those mappings will be covered mostly 694 * by 2MB PMD entries so we can conservatively calculate the required 695 * number of P4D, PUD and PMD structures needed to perform the 696 * mappings. For mappings that are not 2MB aligned, PTE mappings 697 * would be needed for the start and end portion of the address range 698 * that fall outside of the 2MB alignment. This results in, at most, 699 * two extra pages to hold PTE entries for each range that is mapped. 700 * Incrementing the count for each covers the case where the addresses 701 * cross entries. 702 */ 703 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 704 p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; 705 p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; 706 pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1; 707 pud_size *= sizeof(pud_t) * PTRS_PER_PUD; 708 } else { 709 p4d_size = 0; 710 pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; 711 pud_size *= sizeof(pud_t) * PTRS_PER_PUD; 712 } 713 pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1; 714 pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; 715 pte_size = 2 * sizeof(pte_t) * PTRS_PER_PTE; 716 717 total = p4d_size + pud_size + pmd_size + pte_size; 718 719 /* 720 * Now calculate the added pagetable structures needed to populate 721 * the new pagetables. 722 */ 723 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 724 p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; 725 p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; 726 pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE; 727 pud_size *= sizeof(pud_t) * PTRS_PER_PUD; 728 } else { 729 p4d_size = 0; 730 pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; 731 pud_size *= sizeof(pud_t) * PTRS_PER_PUD; 732 } 733 pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE; 734 pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; 735 736 total += p4d_size + pud_size + pmd_size; 737 738 return total; 739 } 740 741 void __init __nostackprotector sme_encrypt_kernel(struct boot_params *bp) 742 { 743 unsigned long workarea_start, workarea_end, workarea_len; 744 unsigned long execute_start, execute_end, execute_len; 745 unsigned long kernel_start, kernel_end, kernel_len; 746 unsigned long initrd_start, initrd_end, initrd_len; 747 struct sme_populate_pgd_data ppd; 748 unsigned long pgtable_area_len; 749 unsigned long decrypted_base; 750 751 if (!sme_active()) 752 return; 753 754 /* 755 * Prepare for encrypting the kernel and initrd by building new 756 * pagetables with the necessary attributes needed to encrypt the 757 * kernel in place. 758 * 759 * One range of virtual addresses will map the memory occupied 760 * by the kernel and initrd as encrypted. 761 * 762 * Another range of virtual addresses will map the memory occupied 763 * by the kernel and initrd as decrypted and write-protected. 764 * 765 * The use of write-protect attribute will prevent any of the 766 * memory from being cached. 767 */ 768 769 /* Physical addresses gives us the identity mapped virtual addresses */ 770 kernel_start = __pa_symbol(_text); 771 kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); 772 kernel_len = kernel_end - kernel_start; 773 774 initrd_start = 0; 775 initrd_end = 0; 776 initrd_len = 0; 777 #ifdef CONFIG_BLK_DEV_INITRD 778 initrd_len = (unsigned long)bp->hdr.ramdisk_size | 779 ((unsigned long)bp->ext_ramdisk_size << 32); 780 if (initrd_len) { 781 initrd_start = (unsigned long)bp->hdr.ramdisk_image | 782 ((unsigned long)bp->ext_ramdisk_image << 32); 783 initrd_end = PAGE_ALIGN(initrd_start + initrd_len); 784 initrd_len = initrd_end - initrd_start; 785 } 786 #endif 787 788 /* Set the encryption workarea to be immediately after the kernel */ 789 workarea_start = kernel_end; 790 791 /* 792 * Calculate required number of workarea bytes needed: 793 * executable encryption area size: 794 * stack page (PAGE_SIZE) 795 * encryption routine page (PAGE_SIZE) 796 * intermediate copy buffer (PMD_PAGE_SIZE) 797 * pagetable structures for the encryption of the kernel 798 * pagetable structures for workarea (in case not currently mapped) 799 */ 800 execute_start = workarea_start; 801 execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE; 802 execute_len = execute_end - execute_start; 803 804 /* 805 * One PGD for both encrypted and decrypted mappings and a set of 806 * PUDs and PMDs for each of the encrypted and decrypted mappings. 807 */ 808 pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; 809 pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; 810 if (initrd_len) 811 pgtable_area_len += sme_pgtable_calc(initrd_len) * 2; 812 813 /* PUDs and PMDs needed in the current pagetables for the workarea */ 814 pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); 815 816 /* 817 * The total workarea includes the executable encryption area and 818 * the pagetable area. The start of the workarea is already 2MB 819 * aligned, align the end of the workarea on a 2MB boundary so that 820 * we don't try to create/allocate PTE entries from the workarea 821 * before it is mapped. 822 */ 823 workarea_len = execute_len + pgtable_area_len; 824 workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE); 825 826 /* 827 * Set the address to the start of where newly created pagetable 828 * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable 829 * structures are created when the workarea is added to the current 830 * pagetables and when the new encrypted and decrypted kernel 831 * mappings are populated. 832 */ 833 ppd.pgtable_area = (void *)execute_end; 834 835 /* 836 * Make sure the current pagetable structure has entries for 837 * addressing the workarea. 838 */ 839 ppd.pgd = (pgd_t *)native_read_cr3_pa(); 840 ppd.paddr = workarea_start; 841 ppd.vaddr = workarea_start; 842 ppd.vaddr_end = workarea_end; 843 sme_map_range_decrypted(&ppd); 844 845 /* Flush the TLB - no globals so cr3 is enough */ 846 native_write_cr3(__native_read_cr3()); 847 848 /* 849 * A new pagetable structure is being built to allow for the kernel 850 * and initrd to be encrypted. It starts with an empty PGD that will 851 * then be populated with new PUDs and PMDs as the encrypted and 852 * decrypted kernel mappings are created. 853 */ 854 ppd.pgd = ppd.pgtable_area; 855 memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD); 856 ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD; 857 858 /* 859 * A different PGD index/entry must be used to get different 860 * pagetable entries for the decrypted mapping. Choose the next 861 * PGD index and convert it to a virtual address to be used as 862 * the base of the mapping. 863 */ 864 decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); 865 if (initrd_len) { 866 unsigned long check_base; 867 868 check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1); 869 decrypted_base = max(decrypted_base, check_base); 870 } 871 decrypted_base <<= PGDIR_SHIFT; 872 873 /* Add encrypted kernel (identity) mappings */ 874 ppd.paddr = kernel_start; 875 ppd.vaddr = kernel_start; 876 ppd.vaddr_end = kernel_end; 877 sme_map_range_encrypted(&ppd); 878 879 /* Add decrypted, write-protected kernel (non-identity) mappings */ 880 ppd.paddr = kernel_start; 881 ppd.vaddr = kernel_start + decrypted_base; 882 ppd.vaddr_end = kernel_end + decrypted_base; 883 sme_map_range_decrypted_wp(&ppd); 884 885 if (initrd_len) { 886 /* Add encrypted initrd (identity) mappings */ 887 ppd.paddr = initrd_start; 888 ppd.vaddr = initrd_start; 889 ppd.vaddr_end = initrd_end; 890 sme_map_range_encrypted(&ppd); 891 /* 892 * Add decrypted, write-protected initrd (non-identity) mappings 893 */ 894 ppd.paddr = initrd_start; 895 ppd.vaddr = initrd_start + decrypted_base; 896 ppd.vaddr_end = initrd_end + decrypted_base; 897 sme_map_range_decrypted_wp(&ppd); 898 } 899 900 /* Add decrypted workarea mappings to both kernel mappings */ 901 ppd.paddr = workarea_start; 902 ppd.vaddr = workarea_start; 903 ppd.vaddr_end = workarea_end; 904 sme_map_range_decrypted(&ppd); 905 906 ppd.paddr = workarea_start; 907 ppd.vaddr = workarea_start + decrypted_base; 908 ppd.vaddr_end = workarea_end + decrypted_base; 909 sme_map_range_decrypted(&ppd); 910 911 /* Perform the encryption */ 912 sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, 913 kernel_len, workarea_start, (unsigned long)ppd.pgd); 914 915 if (initrd_len) 916 sme_encrypt_execute(initrd_start, initrd_start + decrypted_base, 917 initrd_len, workarea_start, 918 (unsigned long)ppd.pgd); 919 920 /* 921 * At this point we are running encrypted. Remove the mappings for 922 * the decrypted areas - all that is needed for this is to remove 923 * the PGD entry/entries. 924 */ 925 ppd.vaddr = kernel_start + decrypted_base; 926 ppd.vaddr_end = kernel_end + decrypted_base; 927 sme_clear_pgd(&ppd); 928 929 if (initrd_len) { 930 ppd.vaddr = initrd_start + decrypted_base; 931 ppd.vaddr_end = initrd_end + decrypted_base; 932 sme_clear_pgd(&ppd); 933 } 934 935 ppd.vaddr = workarea_start + decrypted_base; 936 ppd.vaddr_end = workarea_end + decrypted_base; 937 sme_clear_pgd(&ppd); 938 939 /* Flush the TLB - no globals so cr3 is enough */ 940 native_write_cr3(__native_read_cr3()); 941 } 942 943 void __init __nostackprotector sme_enable(struct boot_params *bp) 944 { 945 const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; 946 unsigned int eax, ebx, ecx, edx; 947 unsigned long feature_mask; 948 bool active_by_default; 949 unsigned long me_mask; 950 char buffer[16]; 951 u64 msr; 952 953 /* Check for the SME/SEV support leaf */ 954 eax = 0x80000000; 955 ecx = 0; 956 native_cpuid(&eax, &ebx, &ecx, &edx); 957 if (eax < 0x8000001f) 958 return; 959 960 #define AMD_SME_BIT BIT(0) 961 #define AMD_SEV_BIT BIT(1) 962 /* 963 * Set the feature mask (SME or SEV) based on whether we are 964 * running under a hypervisor. 965 */ 966 eax = 1; 967 ecx = 0; 968 native_cpuid(&eax, &ebx, &ecx, &edx); 969 feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT; 970 971 /* 972 * Check for the SME/SEV feature: 973 * CPUID Fn8000_001F[EAX] 974 * - Bit 0 - Secure Memory Encryption support 975 * - Bit 1 - Secure Encrypted Virtualization support 976 * CPUID Fn8000_001F[EBX] 977 * - Bits 5:0 - Pagetable bit position used to indicate encryption 978 */ 979 eax = 0x8000001f; 980 ecx = 0; 981 native_cpuid(&eax, &ebx, &ecx, &edx); 982 if (!(eax & feature_mask)) 983 return; 984 985 me_mask = 1UL << (ebx & 0x3f); 986 987 /* Check if memory encryption is enabled */ 988 if (feature_mask == AMD_SME_BIT) { 989 /* For SME, check the SYSCFG MSR */ 990 msr = __rdmsr(MSR_K8_SYSCFG); 991 if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) 992 return; 993 } else { 994 /* For SEV, check the SEV MSR */ 995 msr = __rdmsr(MSR_AMD64_SEV); 996 if (!(msr & MSR_AMD64_SEV_ENABLED)) 997 return; 998 999 /* SEV state cannot be controlled by a command line option */ 1000 sme_me_mask = me_mask; 1001 sev_enabled = true; 1002 return; 1003 } 1004 1005 /* 1006 * Fixups have not been applied to phys_base yet and we're running 1007 * identity mapped, so we must obtain the address to the SME command 1008 * line argument data using rip-relative addressing. 1009 */ 1010 asm ("lea sme_cmdline_arg(%%rip), %0" 1011 : "=r" (cmdline_arg) 1012 : "p" (sme_cmdline_arg)); 1013 asm ("lea sme_cmdline_on(%%rip), %0" 1014 : "=r" (cmdline_on) 1015 : "p" (sme_cmdline_on)); 1016 asm ("lea sme_cmdline_off(%%rip), %0" 1017 : "=r" (cmdline_off) 1018 : "p" (sme_cmdline_off)); 1019 1020 if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT)) 1021 active_by_default = true; 1022 else 1023 active_by_default = false; 1024 1025 cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | 1026 ((u64)bp->ext_cmd_line_ptr << 32)); 1027 1028 cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)); 1029 1030 if (!strncmp(buffer, cmdline_on, sizeof(buffer))) 1031 sme_me_mask = me_mask; 1032 else if (!strncmp(buffer, cmdline_off, sizeof(buffer))) 1033 sme_me_mask = 0; 1034 else 1035 sme_me_mask = active_by_default ? me_mask : 0; 1036 } 1037