// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-20 Intel Corporation. */

#include <linux/lockdep.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/shmem_fs.h>
#include <linux/suspend.h>
#include <linux/sched/mm.h>
#include <asm/sgx.h>
#include "encl.h"
#include "encls.h"
#include "sgx.h"

#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd))
/*
 * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to
 * determine the page index associated with the first PCMD entry
 * within a PCMD page.
 */
#define PCMD_FIRST_MASK GENMASK(4, 0)

/**
 * reclaimer_writing_to_pcmd() - Query if any enclave page associated with
 *                               a PCMD page is in the process of being reclaimed.
 * @encl:        Enclave to which PCMD page belongs
 * @start_addr:  Address of enclave page using first entry within the PCMD page
 *
 * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is
 * stored. The PCMD data of a reclaimed enclave page contains enough
 * information for the processor to verify the page at the time
 * it is loaded back into the Enclave Page Cache (EPC).
 *
 * The backing storage to which enclave pages are reclaimed is laid out as
 * follows:
 * Encrypted enclave pages:SECS page:PCMD pages
 *
 * Each PCMD page contains the PCMD metadata of
 * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages.
 *
 * A PCMD page can only be truncated if it is (a) empty, and (b) not in the
 * process of getting data (and thus soon being non-empty). (b) is tested with
 * a check if an enclave page sharing the PCMD page is in the process of being
 * reclaimed.
 *
 * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it
 * intends to reclaim that enclave page - it means that the PCMD page
 * associated with that enclave page is about to get some data and thus
 * even if the PCMD page is empty, it should not be truncated.
 *
 * Context: Enclave mutex (&sgx_encl->lock) must be held.
 * Return: 1 if the reclaimer is about to write to the PCMD page
 *         0 if the reclaimer has no intention to write to the PCMD page
 */
static int reclaimer_writing_to_pcmd(struct sgx_encl *encl,
				     unsigned long start_addr)
{
	int reclaimed = 0;
	int i;

	/*
	 * PCMD_FIRST_MASK is based on number of PCMD entries within
	 * PCMD page being 32.
	 */
	BUILD_BUG_ON(PCMDS_PER_PAGE != 32);

	for (i = 0; i < PCMDS_PER_PAGE; i++) {
		struct sgx_encl_page *entry;
		unsigned long addr;

		addr = start_addr + i * PAGE_SIZE;

		/*
		 * Stop when reaching the SECS page - it does not
		 * have a page_array entry and its reclaim is
		 * started and completed with enclave mutex held so
		 * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED
		 * flag.
		 */
		if (addr == encl->base + encl->size)
			break;

		entry = xa_load(&encl->page_array, PFN_DOWN(addr));
		if (!entry)
			continue;

		/*
		 * VA page slot ID uses same bit as the flag so it is important
		 * to ensure that the page is not already in backing store.
		 */
		if (entry->epc_page &&
		    (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) {
			reclaimed = 1;
			break;
		}
	}

	return reclaimed;
}
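
/*
 * Worked example of the layout the check above relies on (illustrative
 * numbers, assuming 4K pages and the 32-entries-per-PCMD-page layout
 * enforced by the BUILD_BUG_ON() above): the enclave page at page index
 * 37 stores its PCMD in the same PCMD page as indices 32..63, so the
 * "first entry" address passed as @start_addr would be
 * encl->base + 32 * PAGE_SIZE, and the loop above probes those 32
 * sibling pages (stopping early at the SECS boundary) for the
 * SGX_ENCL_PAGE_BEING_RECLAIMED flag.
 */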

/*
 * Calculate byte offset of a PCMD struct associated with an enclave page. PCMDs
 * follow right after the EPC data in the backing storage. In addition to the
 * visible enclave pages, there's one extra page slot for SECS, before PCMD
 * structs.
 */
static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl,
							    unsigned long page_index)
{
	pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs);

	return epc_end_off + page_index * sizeof(struct sgx_pcmd);
}
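
/*
 * Illustrative example of the backing store layout computed above,
 * assuming a 4K PAGE_SIZE, a 4096-byte struct sgx_secs and a 128-byte
 * struct sgx_pcmd: for an enclave with encl->size == 16 * PAGE_SIZE,
 * page_index 3 has its encrypted contents at byte offset 3 * PAGE_SIZE,
 * the SECS slot sits at offset 16 * PAGE_SIZE, and the PCMD for the page
 * lives at 17 * PAGE_SIZE + 3 * 128, i.e. within the first PCMD page at
 * pcmd_offset 384.
 */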

/*
 * Free a page from the backing storage at the given page index.
 */
static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index)
{
	struct inode *inode = file_inode(encl->backing);

	shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
}

/*
 * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
 * Pages" in the SDM.
 */
static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
			   struct sgx_epc_page *epc_page,
			   struct sgx_epc_page *secs_page)
{
	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
	struct sgx_encl *encl = encl_page->encl;
	pgoff_t page_index, page_pcmd_off;
	unsigned long pcmd_first_page;
	struct sgx_pageinfo pginfo;
	struct sgx_backing b;
	bool pcmd_page_empty;
	u8 *pcmd_page;
	int ret;

	if (secs_page)
		page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
	else
		page_index = PFN_DOWN(encl->size);

	/*
	 * Address of enclave page using the first entry within the PCMD page.
	 */
	pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base;

	page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);

	ret = sgx_encl_lookup_backing(encl, page_index, &b);
	if (ret)
		return ret;

	pginfo.addr = encl_page->desc & PAGE_MASK;
	pginfo.contents = (unsigned long)kmap_atomic(b.contents);
	pcmd_page = kmap_atomic(b.pcmd);
	pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;

	if (secs_page)
		pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
	else
		pginfo.secs = 0;

	ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page),
		     sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset);
	if (ret) {
		if (encls_failed(ret))
			ENCLS_WARN(ret, "ELDU");

		ret = -EFAULT;
	}

	memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
	set_page_dirty(b.pcmd);

	/*
	 * The area for the PCMD in the page was zeroed above. Check if the
	 * whole page is now empty, meaning that all PCMDs have been zeroed:
	 */
	pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);

	kunmap_atomic(pcmd_page);
	kunmap_atomic((void *)(unsigned long)pginfo.contents);

	get_page(b.pcmd);
	sgx_encl_put_backing(&b);

	sgx_encl_truncate_backing_page(encl, page_index);

	if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
		sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
		pcmd_page = kmap_atomic(b.pcmd);
		if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
			pr_warn("PCMD page not empty after truncate.\n");
		kunmap_atomic(pcmd_page);
	}

	put_page(b.pcmd);

	return ret;
}

static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
					  struct sgx_epc_page *secs_page)
{
	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_epc_page *epc_page;
	int ret;

	epc_page = sgx_alloc_epc_page(encl_page, false);
	if (IS_ERR(epc_page))
		return epc_page;

	ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
	if (ret) {
		sgx_encl_free_epc_page(epc_page);
		return ERR_PTR(ret);
	}

	sgx_free_va_slot(encl_page->va_page, va_offset);
	list_move(&encl_page->va_page->list, &encl->va_pages);
	encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK;
	encl_page->epc_page = epc_page;

	return epc_page;
}

static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
						  struct sgx_encl_page *entry)
{
	struct sgx_epc_page *epc_page;

	/* Entry successfully located. */
	if (entry->epc_page) {
		if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)
			return ERR_PTR(-EBUSY);

		return entry;
	}

	if (!(encl->secs.epc_page)) {
		epc_page = sgx_encl_eldu(&encl->secs, NULL);
		if (IS_ERR(epc_page))
			return ERR_CAST(epc_page);
	}

	epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
	if (IS_ERR(epc_page))
		return ERR_CAST(epc_page);

	encl->secs_child_cnt++;
	sgx_mark_page_reclaimable(entry->epc_page);

	return entry;
}
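
/*
 * Illustrative fault sequence for the helper above: if every page of an
 * idle enclave, including its SECS, has been reclaimed, the next access
 * first triggers sgx_encl_eldu(&encl->secs, NULL) - note the NULL parent
 * and pginfo.secs == 0 in __sgx_encl_eldu() - and only then loads the
 * faulting page with the freshly restored SECS as its parent.
 */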

static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl,
						       unsigned long addr,
						       unsigned long vm_flags)
{
	unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
	struct sgx_encl_page *entry;

	entry = xa_load(&encl->page_array, PFN_DOWN(addr));
	if (!entry)
		return ERR_PTR(-EFAULT);

	/*
	 * Verify that the page has equal or higher build time
	 * permissions than the VMA permissions (i.e. the subset of {VM_READ,
	 * VM_WRITE, VM_EXEC} in vma->vm_flags).
	 */
	if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits)
		return ERR_PTR(-EFAULT);

	return __sgx_encl_load_page(encl, entry);
}

struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
					 unsigned long addr)
{
	struct sgx_encl_page *entry;

	entry = xa_load(&encl->page_array, PFN_DOWN(addr));
	if (!entry)
		return ERR_PTR(-EFAULT);

	return __sgx_encl_load_page(encl, entry);
}
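
/*
 * Example of the vm_max_prot_bits check in sgx_encl_load_page_in_vma()
 * (illustrative): a page added with read/write SECINFO permissions has
 * vm_max_prot_bits == VM_READ | VM_WRITE. A VMA faulting it with VM_READ
 * passes, while a VMA with VM_READ | VM_EXEC fails the
 * "(vm_max_prot_bits & vm_prot_bits) != vm_prot_bits" test and the fault
 * is refused with -EFAULT.
 */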

/**
 * sgx_encl_eaug_page() - Dynamically add a page to an initialized enclave
 * @vma:	VMA obtained from fault info from where the page is accessed
 * @encl:	enclave accessing the page
 * @addr:	address that triggered the page fault
 *
 * When an initialized enclave accesses a page with no backing EPC page
 * on an SGX2 system then an EPC page can be added dynamically via the
 * SGX2 ENCLS[EAUG] instruction.
 *
 * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when the PTE was installed
 * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise.
 */
static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
				     struct sgx_encl *encl, unsigned long addr)
{
	vm_fault_t vmret = VM_FAULT_SIGBUS;
	struct sgx_pageinfo pginfo = {0};
	struct sgx_encl_page *encl_page;
	struct sgx_epc_page *epc_page;
	struct sgx_va_page *va_page;
	unsigned long phys_addr;
	u64 secinfo_flags;
	int ret;

	if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
		return VM_FAULT_SIGBUS;

	/*
	 * Ignore internal permission checking for dynamically added pages.
	 * They matter only for data added during the pre-initialization
	 * phase. The enclave decides the permissions by means of
	 * EACCEPT, EACCEPTCOPY and EMODPE.
	 */
	secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X;
	encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags);
	if (IS_ERR(encl_page))
		return VM_FAULT_OOM;

	mutex_lock(&encl->lock);

	epc_page = sgx_alloc_epc_page(encl_page, false);
	if (IS_ERR(epc_page)) {
		if (PTR_ERR(epc_page) == -EBUSY)
			vmret = VM_FAULT_NOPAGE;
		goto err_out_unlock;
	}

	va_page = sgx_encl_grow(encl, false);
	if (IS_ERR(va_page)) {
		if (PTR_ERR(va_page) == -EBUSY)
			vmret = VM_FAULT_NOPAGE;
		goto err_out_epc;
	}

	if (va_page)
		list_add(&va_page->list, &encl->va_pages);

	ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc),
			encl_page, GFP_KERNEL);
	/*
	 * If ret == -EBUSY then the page was created in another flow while
	 * running without encl->lock.
	 */
	if (ret)
		goto err_out_shrink;

	pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
	pginfo.addr = encl_page->desc & PAGE_MASK;
	pginfo.metadata = 0;

	ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page));
	if (ret)
		goto err_out;

	encl_page->encl = encl;
	encl_page->epc_page = epc_page;
	encl_page->type = SGX_PAGE_TYPE_REG;
	encl->secs_child_cnt++;

	sgx_mark_page_reclaimable(encl_page->epc_page);

	phys_addr = sgx_get_epc_phys_addr(epc_page);
	/*
	 * Do not undo everything when creating the PTE entry fails - the next
	 * #PF would find the page ready for a PTE.
	 */
	vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
	if (vmret != VM_FAULT_NOPAGE) {
		mutex_unlock(&encl->lock);
		return VM_FAULT_SIGBUS;
	}
	mutex_unlock(&encl->lock);
	return VM_FAULT_NOPAGE;

err_out:
	xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));

err_out_shrink:
	sgx_encl_shrink(encl, va_page);
err_out_epc:
	sgx_encl_free_epc_page(epc_page);
err_out_unlock:
	mutex_unlock(&encl->lock);
	kfree(encl_page);

	return vmret;
}
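
/*
 * Sketch of the user-space side of the EAUG flow above (illustrative,
 * not part of the kernel interface): after the #PF handler EAUGs the page
 * in the pending state, the enclave is expected to accept it from inside,
 * typically with ENCLU[EACCEPT] on a SECINFO of
 * SGX_SECINFO_REG | SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_PENDING,
 * before the page becomes usable.
 */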

static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
{
	unsigned long addr = (unsigned long)vmf->address;
	struct vm_area_struct *vma = vmf->vma;
	struct sgx_encl_page *entry;
	unsigned long phys_addr;
	struct sgx_encl *encl;
	vm_fault_t ret;

	encl = vma->vm_private_data;

	/*
	 * It's very unlikely but possible that allocating memory for the
	 * mm_list entry of a forked process failed in sgx_vma_open(). When
	 * this happens, vm_private_data is set to NULL.
	 */
	if (unlikely(!encl))
		return VM_FAULT_SIGBUS;

	/*
	 * The page_array keeps track of all enclave pages, whether they
	 * are swapped out or not. If there is no entry for this page and
	 * the system supports SGX2 then it is possible to dynamically add
	 * a new enclave page. This is only possible for an initialized
	 * enclave, which is checked right away.
	 */
	if (cpu_feature_enabled(X86_FEATURE_SGX2) &&
	    (!xa_load(&encl->page_array, PFN_DOWN(addr))))
		return sgx_encl_eaug_page(vma, encl, addr);

	mutex_lock(&encl->lock);

	entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags);
	if (IS_ERR(entry)) {
		mutex_unlock(&encl->lock);

		if (PTR_ERR(entry) == -EBUSY)
			return VM_FAULT_NOPAGE;

		return VM_FAULT_SIGBUS;
	}

	phys_addr = sgx_get_epc_phys_addr(entry->epc_page);

	ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
	if (ret != VM_FAULT_NOPAGE) {
		mutex_unlock(&encl->lock);

		return VM_FAULT_SIGBUS;
	}

	sgx_encl_test_and_clear_young(vma->vm_mm, entry);
	mutex_unlock(&encl->lock);

	return VM_FAULT_NOPAGE;
}

static void sgx_vma_open(struct vm_area_struct *vma)
{
	struct sgx_encl *encl = vma->vm_private_data;

	/*
	 * It's possible but unlikely that vm_private_data is NULL. This can
	 * happen in a grandchild of a process, when sgx_encl_mm_add() failed
	 * to allocate memory in this callback.
	 */
	if (unlikely(!encl))
		return;

	if (sgx_encl_mm_add(encl, vma->vm_mm))
		vma->vm_private_data = NULL;
}

/**
 * sgx_encl_may_map() - Check if a requested VMA mapping is allowed
 * @encl:	an enclave pointer
 * @start:	lower bound of the address range, inclusive
 * @end:	upper bound of the address range, exclusive
 * @vm_flags:	VMA flags
 *
 * Iterate through the enclave pages contained within [@start, @end) to verify
 * that the permissions requested by @vm_flags (the subset of {VM_READ,
 * VM_WRITE, VM_EXEC}) do not exceed the build time permissions of any of the
 * enclave pages within the given address range.
 *
 * An enclave creator must declare the strongest permissions that will be
 * needed for each enclave page. This ensures that mappings have identical or
 * weaker permissions than the earlier declared permissions.
 *
 * Return: 0 on success, -EACCES otherwise
 */
int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
		     unsigned long end, unsigned long vm_flags)
{
	unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
	struct sgx_encl_page *page;
	unsigned long count = 0;
	int ret = 0;

	XA_STATE(xas, &encl->page_array, PFN_DOWN(start));

	/* Disallow mapping outside enclave's address range. */
	if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) &&
	    (start < encl->base || end > encl->base + encl->size))
		return -EACCES;

	/*
	 * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
	 * conflict with the enclave page permissions.
	 */
	if (current->personality & READ_IMPLIES_EXEC)
		return -EACCES;

	mutex_lock(&encl->lock);
	xas_lock(&xas);
	xas_for_each(&xas, page, PFN_DOWN(end - 1)) {
		if (~page->vm_max_prot_bits & vm_prot_bits) {
			ret = -EACCES;
			break;
		}

		/* Reschedule on every XA_CHECK_SCHED iteration. */
		if (!(++count % XA_CHECK_SCHED)) {
			xas_pause(&xas);
			xas_unlock(&xas);
			mutex_unlock(&encl->lock);

			cond_resched();

			mutex_lock(&encl->lock);
			xas_lock(&xas);
		}
	}
	xas_unlock(&xas);
	mutex_unlock(&encl->lock);

	return ret;
}
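
/*
 * Example of the check above (illustrative): for an enclave page built
 * with RW permissions (vm_max_prot_bits == VM_READ | VM_WRITE),
 * mmap()/mprotect() requests of PROT_READ or PROT_READ | PROT_WRITE are
 * allowed, while PROT_EXEC trips "~vm_max_prot_bits & vm_prot_bits" and
 * the whole range is rejected with -EACCES.
 */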

static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end, unsigned long newflags)
{
	return sgx_encl_may_map(vma->vm_private_data, start, end, newflags);
}

static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page,
			       unsigned long addr, void *data)
{
	unsigned long offset = addr & ~PAGE_MASK;
	int ret;

	ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
	if (ret)
		return -EIO;

	return 0;
}

static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page,
				unsigned long addr, void *data)
{
	unsigned long offset = addr & ~PAGE_MASK;
	int ret;

	ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
	if (ret)
		return -EIO;

	return 0;
}

/*
 * Load an enclave page to EPC if required, and take encl->lock.
 */
static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl,
						   unsigned long addr,
						   unsigned long vm_flags)
{
	struct sgx_encl_page *entry;

	for ( ; ; ) {
		mutex_lock(&encl->lock);

		entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags);
		if (PTR_ERR(entry) != -EBUSY)
			break;

		mutex_unlock(&encl->lock);
	}

	if (IS_ERR(entry))
		mutex_unlock(&encl->lock);

	return entry;
}

static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr,
			  void *buf, int len, int write)
{
	struct sgx_encl *encl = vma->vm_private_data;
	struct sgx_encl_page *entry = NULL;
	char data[sizeof(unsigned long)];
	unsigned long align;
	int offset;
	int cnt;
	int ret = 0;
	int i;

	/*
	 * If the process was forked, the VMA is still there but
	 * vm_private_data is set to NULL.
	 */
	if (!encl)
		return -EFAULT;

	if (!test_bit(SGX_ENCL_DEBUG, &encl->flags))
		return -EFAULT;

	for (i = 0; i < len; i += cnt) {
		entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK,
					      vma->vm_flags);
		if (IS_ERR(entry)) {
			ret = PTR_ERR(entry);
			break;
		}

		align = ALIGN_DOWN(addr + i, sizeof(unsigned long));
		offset = (addr + i) & (sizeof(unsigned long) - 1);
		cnt = sizeof(unsigned long) - offset;
		cnt = min(cnt, len - i);

		ret = sgx_encl_debug_read(encl, entry, align, data);
		if (ret)
			goto out;

		if (write) {
			memcpy(data + offset, buf + i, cnt);
			ret = sgx_encl_debug_write(encl, entry, align, data);
			if (ret)
				goto out;
		} else {
			memcpy(buf + i, data + offset, cnt);
		}

out:
		mutex_unlock(&encl->lock);

		if (ret)
			break;
	}

	return ret < 0 ? ret : i;
}
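
/*
 * Worked example of the chunking above (illustrative): a 10-byte debug
 * access whose start address lies at byte 5 of an 8-byte word is split
 * into a 3-byte chunk followed by a 7-byte chunk, each one serviced by an
 * aligned 8-byte __edbgrd() (and __edbgwr() for writes) of the word that
 * contains it.
 */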

const struct vm_operations_struct sgx_vm_ops = {
	.fault = sgx_vma_fault,
	.mprotect = sgx_vma_mprotect,
	.open = sgx_vma_open,
	.access = sgx_vma_access,
};

/**
 * sgx_encl_release - Destroy an enclave instance
 * @ref:	address of a kref inside &sgx_encl
 *
 * Used together with kref_put(). Frees all the resources associated with the
 * enclave and the instance itself.
 */
void sgx_encl_release(struct kref *ref)
{
	struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
	struct sgx_va_page *va_page;
	struct sgx_encl_page *entry;
	unsigned long index;

	xa_for_each(&encl->page_array, index, entry) {
		if (entry->epc_page) {
			/*
			 * The page and its page_array entry cannot be freed
			 * if the page is being held by the reclaimer.
			 */
			if (sgx_unmark_page_reclaimable(entry->epc_page))
				continue;

			sgx_encl_free_epc_page(entry->epc_page);
			encl->secs_child_cnt--;
			entry->epc_page = NULL;
		}

		kfree(entry);
		/* Invoke scheduler to prevent soft lockups. */
		cond_resched();
	}

	xa_destroy(&encl->page_array);

	if (!encl->secs_child_cnt && encl->secs.epc_page) {
		sgx_encl_free_epc_page(encl->secs.epc_page);
		encl->secs.epc_page = NULL;
	}

	while (!list_empty(&encl->va_pages)) {
		va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
					   list);
		list_del(&va_page->list);
		sgx_encl_free_epc_page(va_page->epc_page);
		kfree(va_page);
	}

	if (encl->backing)
		fput(encl->backing);

	cleanup_srcu_struct(&encl->srcu);

	WARN_ON_ONCE(!list_empty(&encl->mm_list));

	/* Detect EPC page leaks. */
	WARN_ON_ONCE(encl->secs_child_cnt);
	WARN_ON_ONCE(encl->secs.epc_page);

	kfree(encl);
}

/*
 * 'mm' is exiting and no longer needs mmu notifications.
 */
static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
	struct sgx_encl_mm *tmp = NULL;

	/*
	 * The enclave itself can remove encl_mm. Note, objects can't be moved
	 * off an RCU protected list, but deletion is ok.
	 */
	spin_lock(&encl_mm->encl->mm_lock);
	list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) {
		if (tmp == encl_mm) {
			list_del_rcu(&encl_mm->list);
			break;
		}
	}
	spin_unlock(&encl_mm->encl->mm_lock);

	if (tmp == encl_mm) {
		synchronize_srcu(&encl_mm->encl->srcu);
		mmu_notifier_put(mn);
	}
}

static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
{
	struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);

	/* 'encl_mm' is going away, put encl_mm->encl reference: */
	kref_put(&encl_mm->encl->refcount, sgx_encl_release);

	kfree(encl_mm);
}

static const struct mmu_notifier_ops sgx_mmu_notifier_ops = {
	.release = sgx_mmu_notifier_release,
	.free_notifier = sgx_mmu_notifier_free,
};

static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl,
					    struct mm_struct *mm)
{
	struct sgx_encl_mm *encl_mm = NULL;
	struct sgx_encl_mm *tmp;
	int idx;

	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(tmp, &encl->mm_list, list) {
		if (tmp->mm == mm) {
			encl_mm = tmp;
			break;
		}
	}

	srcu_read_unlock(&encl->srcu, idx);

	return encl_mm;
}

int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
{
	struct sgx_encl_mm *encl_mm;
	int ret;

	/*
	 * Even though a single enclave may be mapped into an mm more than once,
	 * each 'mm' only appears once on encl->mm_list. This is guaranteed by
	 * holding the mm's mmap lock for write before an mm can be added to or
	 * removed from an encl->mm_list.
	 */
	mmap_assert_write_locked(mm);

	/*
	 * It's possible that an entry already exists in the mm_list, because it
	 * is removed only on VFS release or process exit.
	 */
	if (sgx_encl_find_mm(encl, mm))
		return 0;

	encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL);
	if (!encl_mm)
		return -ENOMEM;

	/* Grab a refcount for the encl_mm->encl reference: */
	kref_get(&encl->refcount);
	encl_mm->encl = encl;
	encl_mm->mm = mm;
	encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;

	ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm);
	if (ret) {
		kfree(encl_mm);
		return ret;
	}

	spin_lock(&encl->mm_lock);
	list_add_rcu(&encl_mm->list, &encl->mm_list);
	/* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). */
	smp_wmb();
	encl->mm_list_version++;
	spin_unlock(&encl->mm_lock);

	return 0;
}
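
/*
 * Sketch of the ordering contract above (illustrative): the writer does
 * list_add_rcu(), smp_wmb(), mm_list_version++; the reader in
 * sgx_zap_enclave_ptes() samples mm_list_version, does smp_rmb(), walks
 * the list, and retries if the version changed - so an mm added while the
 * list walk was in flight is never missed.
 */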

/**
 * sgx_encl_cpumask() - Query which CPUs might be accessing the enclave
 * @encl:	the enclave
 *
 * Some SGX functions require that no cached linear-to-physical address
 * mappings are present before they can succeed. For example, ENCLS[EWB]
 * copies a page from the enclave page cache to regular main memory but
 * it fails if it cannot ensure that there are no cached
 * linear-to-physical address mappings referring to the page.
 *
 * SGX hardware flushes all cached linear-to-physical mappings on a CPU
 * when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave
 * Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical
 * address mappings are cleared but coordination with the tracking done within
 * the SGX hardware is needed to support the SGX functions that depend on this
 * cache clearing.
 *
 * When the ENCLS[ETRACK] function is issued on an enclave, the hardware
 * tracks threads operating inside the enclave at that time. The SGX
 * hardware tracking requires that all the identified threads have exited
 * the enclave, and thus flushed their mappings, before a function such
 * as ENCLS[EWB] is permitted.
 *
 * The following flow is used to support SGX functions that require that
 * no cached linear-to-physical address mappings are present:
 * 1) Execute ENCLS[ETRACK] to initiate hardware tracking.
 * 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be
 *    accessing the enclave.
 * 3) Send IPI to identified CPUs, kicking them out of the enclave and
 *    thus flushing all locally cached linear-to-physical address mappings.
 * 4) Execute SGX function.
 *
 * Context: It is required to call this function after ENCLS[ETRACK].
 *          This will ensure that if any new mm appears (racing with
 *          sgx_encl_mm_add()) then the new mm will enter into the
 *          enclave with fresh linear-to-physical address mappings.
 *
 *          It is required that all IPIs are completed before a new
 *          ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3
 *          of the above flow with the enclave's mutex.
 *
 * Return: cpumask of CPUs that might be accessing @encl
 */
const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl)
{
	cpumask_t *cpumask = &encl->cpumask;
	struct sgx_encl_mm *encl_mm;
	int idx;

	cpumask_clear(cpumask);

	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
		if (!mmget_not_zero(encl_mm->mm))
			continue;

		cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm));

		mmput_async(encl_mm->mm);
	}

	srcu_read_unlock(&encl->srcu, idx);

	return cpumask;
}
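
/*
 * Minimal sketch of the flow documented above, as a caller might use it
 * (illustrative only; the reclaimer in main.c implements the real thing):
 *
 *	mutex_lock(&encl->lock);
 *	__etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));	// 1)
 *	on_each_cpu_mask(sgx_encl_cpumask(encl),		// 2) + 3)
 *			 ipi_callback, NULL, 1);
 *	// 4) now e.g. ENCLS[EWB] can succeed
 *	mutex_unlock(&encl->lock);
 *
 * where ipi_callback is a stand-in for an empty SMP callback whose only
 * purpose is to force an enclave exit on the targeted CPUs.
 */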

static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
					      pgoff_t index)
{
	struct inode *inode = encl->backing->f_path.dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	gfp_t gfpmask = mapping_gfp_mask(mapping);

	return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
}

/**
 * sgx_encl_get_backing() - Pin the backing storage
 * @encl:	an enclave pointer
 * @page_index:	enclave page index
 * @backing:	data for accessing backing storage for the page
 *
 * Pin the backing storage pages for storing the encrypted contents and Paging
 * Crypto MetaData (PCMD) of an enclave page.
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
				struct sgx_backing *backing)
{
	pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
	struct page *contents;
	struct page *pcmd;

	contents = sgx_encl_get_backing_page(encl, page_index);
	if (IS_ERR(contents))
		return PTR_ERR(contents);

	pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off));
	if (IS_ERR(pcmd)) {
		put_page(contents);
		return PTR_ERR(pcmd);
	}

	backing->contents = contents;
	backing->pcmd = pcmd;
	backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);

	return 0;
}

/*
 * When called from ksgxd, returns the mem_cgroup of a struct mm stored
 * in the enclave's mm_list. When not called from ksgxd, just returns
 * the mem_cgroup of the current task.
 */
static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
{
	struct mem_cgroup *memcg = NULL;
	struct sgx_encl_mm *encl_mm;
	int idx;

	/*
	 * If called from normal task context, return the mem_cgroup
	 * of the current task's mm. The remainder of the handling is for
	 * ksgxd.
	 */
	if (!current_is_ksgxd())
		return get_mem_cgroup_from_mm(current->mm);

	/*
	 * Search the enclave's mm_list to find an mm associated with
	 * this enclave to charge the allocation to.
	 */
	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
		if (!mmget_not_zero(encl_mm->mm))
			continue;

		memcg = get_mem_cgroup_from_mm(encl_mm->mm);

		mmput_async(encl_mm->mm);

		break;
	}

	srcu_read_unlock(&encl->srcu, idx);

	/*
	 * In the rare case that there isn't an mm associated with
	 * the enclave, set memcg to the current active mem_cgroup.
	 * This will be the root mem_cgroup if there is no active
	 * mem_cgroup.
	 */
	if (!memcg)
		return get_mem_cgroup_from_mm(NULL);

	return memcg;
}

/**
 * sgx_encl_alloc_backing() - allocate a new backing storage page
 * @encl:	an enclave pointer
 * @page_index:	enclave page index
 * @backing:	data for accessing backing storage for the page
 *
 * When called from ksgxd, sets the active memcg from one of the
 * mms in the enclave's mm_list prior to any backing page allocation,
 * in order to ensure that shmem page allocations are charged to the
 * enclave.
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
			   struct sgx_backing *backing)
{
	struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl);
	struct mem_cgroup *memcg = set_active_memcg(encl_memcg);
	int ret;

	ret = sgx_encl_get_backing(encl, page_index, backing);

	set_active_memcg(memcg);
	mem_cgroup_put(encl_memcg);

	return ret;
}
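
/*
 * Usage note (illustrative): the ELDU path earlier in this file obtains
 * its pages via sgx_encl_lookup_backing() because the backing pages were
 * already allocated - and charged - when the reclaimer (in main.c) wrote
 * them out with sgx_encl_alloc_backing(). Only the allocation side needs
 * the memcg juggling done here.
 */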

/**
 * sgx_encl_lookup_backing() - retrieve an existing backing storage page
 * @encl:	an enclave pointer
 * @page_index:	enclave page index
 * @backing:	data for accessing backing storage for the page
 *
 * Retrieve a backing page for loading data back into an EPC page with ELDU.
 * It is the caller's responsibility to ensure that it is appropriate to use
 * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is
 * not used correctly, this will cause an allocation which is not accounted for.
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
			    struct sgx_backing *backing)
{
	return sgx_encl_get_backing(encl, page_index, backing);
}

/**
 * sgx_encl_put_backing() - Unpin the backing storage
 * @backing:	data for accessing backing storage for the page
 */
void sgx_encl_put_backing(struct sgx_backing *backing)
{
	put_page(backing->pcmd);
	put_page(backing->contents);
}

static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr,
					    void *data)
{
	pte_t pte;
	int ret;

	ret = pte_young(*ptep);
	if (ret) {
		pte = pte_mkold(*ptep);
		set_pte_at((struct mm_struct *)data, addr, ptep, pte);
	}

	return ret;
}

/**
 * sgx_encl_test_and_clear_young() - Test and reset the accessed bit
 * @mm:		mm_struct that is checked
 * @page:	enclave page to be tested for recent access
 *
 * Checks the Access (A) bit from the PTE corresponding to the enclave page and
 * clears it.
 *
 * Return: 1 if the page has been recently accessed and 0 if not.
 */
int sgx_encl_test_and_clear_young(struct mm_struct *mm,
				  struct sgx_encl_page *page)
{
	unsigned long addr = page->desc & PAGE_MASK;
	struct sgx_encl *encl = page->encl;
	struct vm_area_struct *vma;
	int ret;

	ret = sgx_encl_find(mm, addr, &vma);
	if (ret)
		return 0;

	if (encl != vma->vm_private_data)
		return 0;

	ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE,
				  sgx_encl_test_and_clear_young_cb, vma->vm_mm);
	if (ret < 0)
		return 0;

	return ret;
}

struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
					  unsigned long offset,
					  u64 secinfo_flags)
{
	struct sgx_encl_page *encl_page;
	unsigned long prot;

	encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL);
	if (!encl_page)
		return ERR_PTR(-ENOMEM);

	encl_page->desc = encl->base + offset;
	encl_page->encl = encl;

	prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ)  |
	       _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) |
	       _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC);

	/*
	 * TCS pages must always have RW set for CPU access while the SECINFO
	 * permissions are *always* zero - the CPU ignores the user provided
	 * values and silently overwrites them with zero permissions.
	 */
	if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS)
		prot |= PROT_READ | PROT_WRITE;

	/* Calculate maximum of the VM flags for the page. */
	encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);

	return encl_page;
}
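
/*
 * Example of the prot calculation above (illustrative): a regular page
 * added with SGX_SECINFO_R | SGX_SECINFO_X yields PROT_READ | PROT_EXEC
 * and hence vm_max_prot_bits == VM_READ | VM_EXEC, while a TCS page
 * always ends up with at least VM_READ | VM_WRITE regardless of the
 * (ignored) SECINFO permission bits.
 */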

/**
 * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave
 * @encl:	the enclave
 * @addr:	page aligned pointer to a single page for which PTEs will be removed
 *
 * Multiple VMAs may have an enclave page mapped. Remove the PTE mapping for
 * @addr from each VMA. Ensure that the page fault handler is ready to handle
 * new mappings of @addr before calling this function.
 */
void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr)
{
	unsigned long mm_list_version;
	struct sgx_encl_mm *encl_mm;
	struct vm_area_struct *vma;
	int idx, ret;

	do {
		mm_list_version = encl->mm_list_version;

		/* Pairs with smp_wmb() in sgx_encl_mm_add(). */
		smp_rmb();

		idx = srcu_read_lock(&encl->srcu);

		list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
			if (!mmget_not_zero(encl_mm->mm))
				continue;

			mmap_read_lock(encl_mm->mm);

			ret = sgx_encl_find(encl_mm->mm, addr, &vma);
			if (!ret && encl == vma->vm_private_data)
				zap_vma_ptes(vma, addr, PAGE_SIZE);

			mmap_read_unlock(encl_mm->mm);

			mmput_async(encl_mm->mm);
		}

		srcu_read_unlock(&encl->srcu, idx);
	} while (unlikely(encl->mm_list_version != mm_list_version));
}

/**
 * sgx_alloc_va_page() - Allocate a Version Array (VA) page
 * @reclaim:	Reclaim EPC pages directly if none available. Enclave
 *		mutex should not be held if this is set.
 *
 * Allocate a free EPC page and convert it to a Version Array (VA) page.
 *
 * Return:
 *   a VA page on success,
 *   -errno otherwise
 */
struct sgx_epc_page *sgx_alloc_va_page(bool reclaim)
{
	struct sgx_epc_page *epc_page;
	int ret;

	epc_page = sgx_alloc_epc_page(NULL, reclaim);
	if (IS_ERR(epc_page))
		return ERR_CAST(epc_page);

	ret = __epa(sgx_get_epc_virt_addr(epc_page));
	if (ret) {
		WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
		sgx_encl_free_epc_page(epc_page);
		return ERR_PTR(-EFAULT);
	}

	return epc_page;
}

/**
 * sgx_alloc_va_slot - allocate a VA slot
 * @va_page:	a &struct sgx_va_page instance
 *
 * Allocates a slot from a &struct sgx_va_page instance.
 *
 * Return: offset of the slot inside the VA page
 */
unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page)
{
	int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);

	if (slot < SGX_VA_SLOT_COUNT)
		set_bit(slot, va_page->slots);

	return slot << 3;
}

/**
 * sgx_free_va_slot - free a VA slot
 * @va_page:	a &struct sgx_va_page instance
 * @offset:	offset of the slot inside the VA page
 *
 * Frees a slot from a &struct sgx_va_page instance.
 */
void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset)
{
	clear_bit(offset >> 3, va_page->slots);
}

/**
 * sgx_va_page_full - is the VA page full?
 * @va_page:	a &struct sgx_va_page instance
 *
 * Return: true if all slots have been taken
 */
bool sgx_va_page_full(struct sgx_va_page *va_page)
{
	int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);

	return slot == SGX_VA_SLOT_COUNT;
}
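
/*
 * Worked example for the VA slot helpers above (illustrative, assuming
 * 8-byte version slots so that a 4K VA page holds SGX_VA_SLOT_COUNT
 * slots): sgx_alloc_va_slot() returning slot 5 hands back byte offset
 * 5 << 3 == 40, which is later fed to ELDU/EWB as the offset of the
 * version number within the VA page and converted back with
 * "offset >> 3" in sgx_free_va_slot().
 */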

/**
 * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
 * @page:	EPC page to be freed
 *
 * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
 * only upon success, it puts the page back on the free page list. Otherwise, it
 * gives a WARNING to indicate that the page is leaked.
 */
void sgx_encl_free_epc_page(struct sgx_epc_page *page)
{
	int ret;

	WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);

	ret = __eremove(sgx_get_epc_virt_addr(page));
	if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
		return;

	sgx_free_epc_page(page);
}