// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-20 Intel Corporation. */

#include <linux/lockdep.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/shmem_fs.h>
#include <linux/suspend.h>
#include <linux/sched/mm.h>
#include <asm/sgx.h>
#include "encl.h"
#include "encls.h"
#include "sgx.h"

#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd))
/*
 * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to
 * determine the page index associated with the first PCMD entry
 * within a PCMD page.
 */
#define PCMD_FIRST_MASK GENMASK(4, 0)

/**
 * reclaimer_writing_to_pcmd() - Query if any enclave page associated with
 *                               a PCMD page is in the process of being reclaimed.
 * @encl:	Enclave to which PCMD page belongs
 * @start_addr:	Address of enclave page using first entry within the PCMD page
 *
 * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is
 * stored. The PCMD data of a reclaimed enclave page contains enough
 * information for the processor to verify the page at the time
 * it is loaded back into the Enclave Page Cache (EPC).
 *
 * The backing storage to which enclave pages are reclaimed is laid out as
 * follows:
 * Encrypted enclave pages:SECS page:PCMD pages
 *
 * Each PCMD page contains the PCMD metadata of
 * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages.
 *
 * A PCMD page can only be truncated if it is (a) empty, and (b) not in the
 * process of getting data (and thus soon being non-empty). (b) is tested by
 * checking whether an enclave page sharing the PCMD page is in the process of
 * being reclaimed.
 *
 * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it
 * intends to reclaim that enclave page - it means that the PCMD page
 * associated with that enclave page is about to get some data and thus
 * even if the PCMD page is empty, it should not be truncated.
 *
 * Context: Enclave mutex (&sgx_encl->lock) must be held.
 * Return: 1 if the reclaimer is about to write to the PCMD page
 *         0 if the reclaimer has no intention to write to the PCMD page
 */
static int reclaimer_writing_to_pcmd(struct sgx_encl *encl,
				     unsigned long start_addr)
{
	int reclaimed = 0;
	int i;

	/*
	 * PCMD_FIRST_MASK is based on number of PCMD entries within
	 * PCMD page being 32.
	 */
	BUILD_BUG_ON(PCMDS_PER_PAGE != 32);

	for (i = 0; i < PCMDS_PER_PAGE; i++) {
		struct sgx_encl_page *entry;
		unsigned long addr;

		addr = start_addr + i * PAGE_SIZE;

		/*
		 * Stop when reaching the SECS page - it does not
		 * have a page_array entry and its reclaim is
		 * started and completed with enclave mutex held so
		 * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED
		 * flag.
		 */
		if (addr == encl->base + encl->size)
			break;

		entry = xa_load(&encl->page_array, PFN_DOWN(addr));
		if (!entry)
			continue;

		/*
		 * VA page slot ID uses the same bit as the flag so it is
		 * important to ensure that the page is not already in backing
		 * store.
		 */
		if (entry->epc_page &&
		    (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) {
			reclaimed = 1;
			break;
		}
	}

	return reclaimed;
}

/*
 * Calculate byte offset of a PCMD struct associated with an enclave page. PCMDs
 * follow right after the EPC data in the backing storage. In addition to the
 * visible enclave pages, there's one extra page slot for SECS, before PCMD
 * structs.
 */
static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl,
							     unsigned long page_index)
{
	pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs);

	return epc_end_off + page_index * sizeof(struct sgx_pcmd);
}

/*
 * Free a page from the backing storage at the given page index.
 */
static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index)
{
	struct inode *inode = file_inode(encl->backing);

	shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
}

/*
 * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
 * Pages" in the SDM.
 */
static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
			   struct sgx_epc_page *epc_page,
			   struct sgx_epc_page *secs_page)
{
	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
	struct sgx_encl *encl = encl_page->encl;
	pgoff_t page_index, page_pcmd_off;
	unsigned long pcmd_first_page;
	struct sgx_pageinfo pginfo;
	struct sgx_backing b;
	bool pcmd_page_empty;
	u8 *pcmd_page;
	int ret;

	if (secs_page)
		page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
	else
		page_index = PFN_DOWN(encl->size);

	/*
	 * Address of enclave page using the first entry within the PCMD page.
	 */
	pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base;

	page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);

	ret = sgx_encl_lookup_backing(encl, page_index, &b);
	if (ret)
		return ret;

	pginfo.addr = encl_page->desc & PAGE_MASK;
	pginfo.contents = (unsigned long)kmap_atomic(b.contents);
	pcmd_page = kmap_atomic(b.pcmd);
	pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;

	if (secs_page)
		pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
	else
		pginfo.secs = 0;

	ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page),
		     sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset);
	if (ret) {
		if (encls_failed(ret))
			ENCLS_WARN(ret, "ELDU");

		ret = -EFAULT;
	}

	memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
	set_page_dirty(b.pcmd);

	/*
	 * The area for the PCMD in the page was zeroed above.  Check if the
	 * whole page is now empty meaning that all PCMDs have been zeroed:
	 */
	pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);

	kunmap_atomic(pcmd_page);
	kunmap_atomic((void *)(unsigned long)pginfo.contents);

	get_page(b.pcmd);
	sgx_encl_put_backing(&b);

	sgx_encl_truncate_backing_page(encl, page_index);

	if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
		sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
		pcmd_page = kmap_atomic(b.pcmd);
		if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
			pr_warn("PCMD page not empty after truncate.\n");
		kunmap_atomic(pcmd_page);
	}

	put_page(b.pcmd);

	return ret;
}
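
/*
 * Allocate a free EPC page, load the swapped-out contents of @encl_page back
 * into it with ELDU and release the VA slot that held the page's version
 * while it was swapped out.
 */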
static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
					  struct sgx_epc_page *secs_page)
{
	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_epc_page *epc_page;
	int ret;

	epc_page = sgx_alloc_epc_page(encl_page, false);
	if (IS_ERR(epc_page))
		return epc_page;

	ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
	if (ret) {
		sgx_encl_free_epc_page(epc_page);
		return ERR_PTR(ret);
	}

	sgx_free_va_slot(encl_page->va_page, va_offset);
	list_move(&encl_page->va_page->list, &encl->va_pages);
	encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK;
	encl_page->epc_page = epc_page;

	return epc_page;
}

static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
						  struct sgx_encl_page *entry)
{
	struct sgx_epc_page *epc_page;

	/* Entry successfully located. */
	if (entry->epc_page) {
		if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)
			return ERR_PTR(-EBUSY);

		return entry;
	}

	if (!(encl->secs.epc_page)) {
		epc_page = sgx_encl_eldu(&encl->secs, NULL);
		if (IS_ERR(epc_page))
			return ERR_CAST(epc_page);
	}

	epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
	if (IS_ERR(epc_page))
		return ERR_CAST(epc_page);

	encl->secs_child_cnt++;
	sgx_mark_page_reclaimable(entry->epc_page);

	return entry;
}

static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl,
						       unsigned long addr,
						       unsigned long vm_flags)
{
	unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
	struct sgx_encl_page *entry;

	entry = xa_load(&encl->page_array, PFN_DOWN(addr));
	if (!entry)
		return ERR_PTR(-EFAULT);

	/*
	 * Verify that the page has equal or higher build time
	 * permissions than the VMA permissions (i.e. the subset of {VM_READ,
	 * VM_WRITE, VM_EXEC} in vma->vm_flags).
	 */
	if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits)
		return ERR_PTR(-EFAULT);

	return __sgx_encl_load_page(encl, entry);
}
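
/*
 * Load the enclave page at @addr back into the EPC if it has been swapped
 * out. Unlike sgx_encl_load_page_in_vma(), no VMA permission check is done.
 */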
struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
					 unsigned long addr)
{
	struct sgx_encl_page *entry;

	entry = xa_load(&encl->page_array, PFN_DOWN(addr));
	if (!entry)
		return ERR_PTR(-EFAULT);

	return __sgx_encl_load_page(encl, entry);
}

/**
 * sgx_encl_eaug_page() - Dynamically add page to initialized enclave
 * @vma:	VMA obtained from fault info from where the page is accessed
 * @encl:	enclave accessing the page
 * @addr:	address that triggered the page fault
 *
 * When an initialized enclave accesses a page with no backing EPC page
 * on an SGX2 system then an EPC page can be added dynamically via the SGX2
 * ENCLS[EAUG] instruction.
 *
 * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed
 * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise.
 */
static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
				     struct sgx_encl *encl, unsigned long addr)
{
	vm_fault_t vmret = VM_FAULT_SIGBUS;
	struct sgx_pageinfo pginfo = {0};
	struct sgx_encl_page *encl_page;
	struct sgx_epc_page *epc_page;
	struct sgx_va_page *va_page;
	unsigned long phys_addr;
	u64 secinfo_flags;
	int ret;

	if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
		return VM_FAULT_SIGBUS;

	/*
	 * Ignore internal permission checking for dynamically added pages.
	 * They matter only for data added during the pre-initialization
	 * phase. The enclave decides the permissions by the means of
	 * EACCEPT, EACCEPTCOPY and EMODPE.
	 */
	secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X;
	encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags);
	if (IS_ERR(encl_page))
		return VM_FAULT_OOM;

	mutex_lock(&encl->lock);

	epc_page = sgx_alloc_epc_page(encl_page, false);
	if (IS_ERR(epc_page)) {
		if (PTR_ERR(epc_page) == -EBUSY)
			vmret = VM_FAULT_NOPAGE;
		goto err_out_unlock;
	}

	va_page = sgx_encl_grow(encl, false);
	if (IS_ERR(va_page))
		goto err_out_epc;

	if (va_page)
		list_add(&va_page->list, &encl->va_pages);

	ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc),
			encl_page, GFP_KERNEL);
	/*
	 * If ret == -EBUSY then the page was created in another flow while
	 * running without encl->lock.
	 */
	if (ret)
		goto err_out_shrink;

	pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
	pginfo.addr = encl_page->desc & PAGE_MASK;
	pginfo.metadata = 0;

	ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page));
	if (ret)
		goto err_out;

	encl_page->encl = encl;
	encl_page->epc_page = epc_page;
	encl_page->type = SGX_PAGE_TYPE_REG;
	encl->secs_child_cnt++;

	sgx_mark_page_reclaimable(encl_page->epc_page);

	phys_addr = sgx_get_epc_phys_addr(epc_page);
	/*
	 * Do not undo everything when creating the PTE entry fails - the next
	 * #PF would find the page ready for a PTE.
	 */
	vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
	if (vmret != VM_FAULT_NOPAGE) {
		mutex_unlock(&encl->lock);
		return VM_FAULT_SIGBUS;
	}
	mutex_unlock(&encl->lock);
	return VM_FAULT_NOPAGE;

err_out:
	xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));

err_out_shrink:
	sgx_encl_shrink(encl, va_page);
err_out_epc:
	sgx_encl_free_epc_page(epc_page);
err_out_unlock:
	mutex_unlock(&encl->lock);
	kfree(encl_page);

	return vmret;
}
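
/*
 * Page fault handler for enclave VMAs: load the faulting page back into the
 * EPC, or dynamically add it with EAUG on SGX2, and install the PTE.
 */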
static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
{
	unsigned long addr = (unsigned long)vmf->address;
	struct vm_area_struct *vma = vmf->vma;
	struct sgx_encl_page *entry;
	unsigned long phys_addr;
	struct sgx_encl *encl;
	vm_fault_t ret;

	encl = vma->vm_private_data;

	/*
	 * It's very unlikely but possible that allocating memory for the
	 * mm_list entry of a forked process failed in sgx_vma_open(). When
	 * this happens, vm_private_data is set to NULL.
	 */
	if (unlikely(!encl))
		return VM_FAULT_SIGBUS;

	/*
	 * The page_array keeps track of all enclave pages, whether they
	 * are swapped out or not. If there is no entry for this page and
	 * the system supports SGX2 then it is possible to dynamically add
	 * a new enclave page. This is only possible for an initialized
	 * enclave, which is checked for right away.
	 */
	if (cpu_feature_enabled(X86_FEATURE_SGX2) &&
	    (!xa_load(&encl->page_array, PFN_DOWN(addr))))
		return sgx_encl_eaug_page(vma, encl, addr);

	mutex_lock(&encl->lock);

	entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags);
	if (IS_ERR(entry)) {
		mutex_unlock(&encl->lock);

		if (PTR_ERR(entry) == -EBUSY)
			return VM_FAULT_NOPAGE;

		return VM_FAULT_SIGBUS;
	}

	phys_addr = sgx_get_epc_phys_addr(entry->epc_page);

	ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
	if (ret != VM_FAULT_NOPAGE) {
		mutex_unlock(&encl->lock);

		return VM_FAULT_SIGBUS;
	}

	sgx_encl_test_and_clear_young(vma->vm_mm, entry);
	mutex_unlock(&encl->lock);

	return VM_FAULT_NOPAGE;
}

static void sgx_vma_open(struct vm_area_struct *vma)
{
	struct sgx_encl *encl = vma->vm_private_data;

	/*
	 * It's possible but unlikely that vm_private_data is NULL. This can
	 * happen in a grandchild of a process, when sgx_encl_mm_add() had
	 * failed to allocate memory in this callback.
	 */
	if (unlikely(!encl))
		return;

	if (sgx_encl_mm_add(encl, vma->vm_mm))
		vma->vm_private_data = NULL;
}

/**
 * sgx_encl_may_map() - Check if a requested VMA mapping is allowed
 * @encl:	an enclave pointer
 * @start:	lower bound of the address range, inclusive
 * @end:	upper bound of the address range, exclusive
 * @vm_flags:	VMA flags
 *
 * Iterate through the enclave pages contained within [@start, @end) to verify
 * that the permissions requested by a subset of {VM_READ, VM_WRITE, VM_EXEC}
 * do not contain any permissions that are not contained in the build time
 * permissions of any of the enclave pages within the given address range.
 *
 * An enclave creator must declare the strongest permissions that will be
 * needed for each enclave page. This ensures that mappings have identical or
 * weaker permissions than the earlier declared permissions.
 *
 * Return: 0 on success, -EACCES otherwise
 */
int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
		     unsigned long end, unsigned long vm_flags)
{
	unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
	struct sgx_encl_page *page;
	unsigned long count = 0;
	int ret = 0;

	XA_STATE(xas, &encl->page_array, PFN_DOWN(start));

	/* Disallow mapping outside enclave's address range. */
	if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) &&
	    (start < encl->base || end > encl->base + encl->size))
		return -EACCES;

	/*
	 * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
	 * conflict with the enclave page permissions.
	 */
	if (current->personality & READ_IMPLIES_EXEC)
		return -EACCES;

	mutex_lock(&encl->lock);
	xas_lock(&xas);
	xas_for_each(&xas, page, PFN_DOWN(end - 1)) {
		if (~page->vm_max_prot_bits & vm_prot_bits) {
			ret = -EACCES;
			break;
		}

		/* Reschedule on every XA_CHECK_SCHED iteration. */
		if (!(++count % XA_CHECK_SCHED)) {
			xas_pause(&xas);
			xas_unlock(&xas);
			mutex_unlock(&encl->lock);

			cond_resched();

			mutex_lock(&encl->lock);
			xas_lock(&xas);
		}
	}
	xas_unlock(&xas);
	mutex_unlock(&encl->lock);

	return ret;
}

static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end, unsigned long newflags)
{
	return sgx_encl_may_map(vma->vm_private_data, start, end, newflags);
}

static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page,
			       unsigned long addr, void *data)
{
	unsigned long offset = addr & ~PAGE_MASK;
	int ret;

	ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
	if (ret)
		return -EIO;

	return 0;
}

static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page,
				unsigned long addr, void *data)
{
	unsigned long offset = addr & ~PAGE_MASK;
	int ret;

	ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
	if (ret)
		return -EIO;

	return 0;
}

/*
 * Load an enclave page to EPC if required, and take encl->lock.
 */
static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl,
						   unsigned long addr,
						   unsigned long vm_flags)
{
	struct sgx_encl_page *entry;

	for ( ; ; ) {
		mutex_lock(&encl->lock);

		entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags);
		if (PTR_ERR(entry) != -EBUSY)
			break;

		mutex_unlock(&encl->lock);
	}

	if (IS_ERR(entry))
		mutex_unlock(&encl->lock);

	return entry;
}

static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr,
			  void *buf, int len, int write)
{
	struct sgx_encl *encl = vma->vm_private_data;
	struct sgx_encl_page *entry = NULL;
	char data[sizeof(unsigned long)];
	unsigned long align;
	int offset;
	int cnt;
	int ret = 0;
	int i;

	/*
	 * If the process was forked, the VMA is still there but
	 * vm_private_data is set to NULL.
	 */
	if (!encl)
		return -EFAULT;

	if (!test_bit(SGX_ENCL_DEBUG, &encl->flags))
		return -EFAULT;

	for (i = 0; i < len; i += cnt) {
		entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK,
					      vma->vm_flags);
		if (IS_ERR(entry)) {
			ret = PTR_ERR(entry);
			break;
		}

		align = ALIGN_DOWN(addr + i, sizeof(unsigned long));
		offset = (addr + i) & (sizeof(unsigned long) - 1);
		cnt = sizeof(unsigned long) - offset;
		cnt = min(cnt, len - i);

		ret = sgx_encl_debug_read(encl, entry, align, data);
		if (ret)
			goto out;

		if (write) {
			memcpy(data + offset, buf + i, cnt);
			ret = sgx_encl_debug_write(encl, entry, align, data);
			if (ret)
				goto out;
		} else {
			memcpy(buf + i, data + offset, cnt);
		}

out:
		mutex_unlock(&encl->lock);

		if (ret)
			break;
	}

	return ret < 0 ? ret : i;
}

const struct vm_operations_struct sgx_vm_ops = {
	.fault = sgx_vma_fault,
	.mprotect = sgx_vma_mprotect,
	.open = sgx_vma_open,
	.access = sgx_vma_access,
};

/**
 * sgx_encl_release - Destroy an enclave instance
 * @ref:	address of a kref inside &sgx_encl
 *
 * Used together with kref_put(). Frees all the resources associated with the
 * enclave and the instance itself.
 */
void sgx_encl_release(struct kref *ref)
{
	struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
	struct sgx_va_page *va_page;
	struct sgx_encl_page *entry;
	unsigned long index;

	xa_for_each(&encl->page_array, index, entry) {
		if (entry->epc_page) {
			/*
			 * The page and its radix tree entry cannot be freed
			 * if the page is being held by the reclaimer.
			 */
			if (sgx_unmark_page_reclaimable(entry->epc_page))
				continue;

			sgx_encl_free_epc_page(entry->epc_page);
			encl->secs_child_cnt--;
			entry->epc_page = NULL;
		}

		kfree(entry);
		/* Invoke scheduler to prevent soft lockups. */
		cond_resched();
	}

	xa_destroy(&encl->page_array);

	if (!encl->secs_child_cnt && encl->secs.epc_page) {
		sgx_encl_free_epc_page(encl->secs.epc_page);
		encl->secs.epc_page = NULL;
	}

	while (!list_empty(&encl->va_pages)) {
		va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
					   list);
		list_del(&va_page->list);
		sgx_encl_free_epc_page(va_page->epc_page);
		kfree(va_page);
	}

	if (encl->backing)
		fput(encl->backing);

	cleanup_srcu_struct(&encl->srcu);

	WARN_ON_ONCE(!list_empty(&encl->mm_list));

	/* Detect EPC page leaks. */
	WARN_ON_ONCE(encl->secs_child_cnt);
	WARN_ON_ONCE(encl->secs.epc_page);

	kfree(encl);
}

/*
 * 'mm' is exiting and no longer needs mmu notifications.
 */
static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
	struct sgx_encl_mm *tmp = NULL;

	/*
	 * The enclave itself can remove encl_mm. Note, objects can't be moved
	 * off an RCU protected list, but deletion is ok.
	 */
	spin_lock(&encl_mm->encl->mm_lock);
	list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) {
		if (tmp == encl_mm) {
			list_del_rcu(&encl_mm->list);
			break;
		}
	}
	spin_unlock(&encl_mm->encl->mm_lock);

	if (tmp == encl_mm) {
		synchronize_srcu(&encl_mm->encl->srcu);
		mmu_notifier_put(mn);
	}
}

static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
{
	struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);

	/* 'encl_mm' is going away, put encl_mm->encl reference: */
	kref_put(&encl_mm->encl->refcount, sgx_encl_release);

	kfree(encl_mm);
}

static const struct mmu_notifier_ops sgx_mmu_notifier_ops = {
	.release = sgx_mmu_notifier_release,
	.free_notifier = sgx_mmu_notifier_free,
};
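
/*
 * Return the sgx_encl_mm that tracks @mm for this enclave, or NULL if @mm is
 * not on the enclave's mm_list.
 */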
static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl,
					    struct mm_struct *mm)
{
	struct sgx_encl_mm *encl_mm = NULL;
	struct sgx_encl_mm *tmp;
	int idx;

	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(tmp, &encl->mm_list, list) {
		if (tmp->mm == mm) {
			encl_mm = tmp;
			break;
		}
	}

	srcu_read_unlock(&encl->srcu, idx);

	return encl_mm;
}

int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
{
	struct sgx_encl_mm *encl_mm;
	int ret;

	/*
	 * Even though a single enclave may be mapped into an mm more than once,
	 * each 'mm' only appears once on encl->mm_list. This is guaranteed by
	 * holding the mm's mmap lock for write before an mm can be added to or
	 * removed from an encl->mm_list.
	 */
	mmap_assert_write_locked(mm);

	/*
	 * It's possible that an entry already exists in the mm_list, because it
	 * is removed only on VFS release or process exit.
	 */
	if (sgx_encl_find_mm(encl, mm))
		return 0;

	encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL);
	if (!encl_mm)
		return -ENOMEM;

	/* Grab a refcount for the encl_mm->encl reference: */
	kref_get(&encl->refcount);
	encl_mm->encl = encl;
	encl_mm->mm = mm;
	encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;

	ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm);
	if (ret) {
		kfree(encl_mm);
		return ret;
	}

	spin_lock(&encl->mm_lock);
	list_add_rcu(&encl_mm->list, &encl->mm_list);
	/* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). */
	smp_wmb();
	encl->mm_list_version++;
	spin_unlock(&encl->mm_lock);

	return 0;
}

/**
 * sgx_encl_cpumask() - Query which CPUs might be accessing the enclave
 * @encl: the enclave
 *
 * Some SGX functions require that no cached linear-to-physical address
 * mappings are present before they can succeed. For example, ENCLS[EWB]
 * copies a page from the enclave page cache to regular main memory but
 * it fails if it cannot ensure that there are no cached
 * linear-to-physical address mappings referring to the page.
 *
 * SGX hardware flushes all cached linear-to-physical mappings on a CPU
 * when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave
 * Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical
 * address mappings are cleared but coordination with the tracking done within
 * the SGX hardware is needed to support the SGX functions that depend on this
 * cache clearing.
 *
 * When the ENCLS[ETRACK] function is issued on an enclave the hardware
 * tracks threads operating inside the enclave at that time. The SGX
 * hardware tracking requires that all the identified threads must have
 * exited the enclave in order to flush the mappings before a function such
 * as ENCLS[EWB] will be permitted.
 *
 * The following flow is used to support SGX functions that require that
 * no cached linear-to-physical address mappings are present:
 * 1) Execute ENCLS[ETRACK] to initiate hardware tracking.
 * 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be
 *    accessing the enclave.
 * 3) Send IPI to identified CPUs, kicking them out of the enclave and
 *    thus flushing all locally cached linear-to-physical address mappings.
 * 4) Execute SGX function.
 *
 * Context: It is required to call this function after ENCLS[ETRACK].
 *          This will ensure that if any new mm appears (racing with
 *          sgx_encl_mm_add()) then the new mm will enter into the
 *          enclave with fresh linear-to-physical address mappings.
 *
 *          It is required that all IPIs are completed before a new
 *          ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3
 *          of the above flow with the enclave's mutex.
 *
 * Return: cpumask of CPUs that might be accessing @encl
 */
const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl)
{
	cpumask_t *cpumask = &encl->cpumask;
	struct sgx_encl_mm *encl_mm;
	int idx;

	cpumask_clear(cpumask);

	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
		if (!mmget_not_zero(encl_mm->mm))
			continue;

		cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm));

		mmput_async(encl_mm->mm);
	}

	srcu_read_unlock(&encl->srcu, idx);

	return cpumask;
}
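
/*
 * Pin the shmem page backing the enclave at @index, allocating it if it does
 * not exist yet.
 */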
static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
					      pgoff_t index)
{
	struct inode *inode = encl->backing->f_path.dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	gfp_t gfpmask = mapping_gfp_mask(mapping);

	return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
}

/**
 * sgx_encl_get_backing() - Pin the backing storage
 * @encl:	an enclave pointer
 * @page_index:	enclave page index
 * @backing:	data for accessing backing storage for the page
 *
 * Pin the backing storage pages for storing the encrypted contents and Paging
 * Crypto MetaData (PCMD) of an enclave page.
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
				struct sgx_backing *backing)
{
	pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
	struct page *contents;
	struct page *pcmd;

	contents = sgx_encl_get_backing_page(encl, page_index);
	if (IS_ERR(contents))
		return PTR_ERR(contents);

	pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off));
	if (IS_ERR(pcmd)) {
		put_page(contents);
		return PTR_ERR(pcmd);
	}

	backing->contents = contents;
	backing->pcmd = pcmd;
	backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);

	return 0;
}

/*
 * When called from ksgxd, returns the mem_cgroup of a struct mm stored
 * in the enclave's mm_list. When not called from ksgxd, just returns
 * the mem_cgroup of the current task.
 */
static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
{
	struct mem_cgroup *memcg = NULL;
	struct sgx_encl_mm *encl_mm;
	int idx;

	/*
	 * If called from normal task context, return the mem_cgroup
	 * of the current task's mm. The remainder of the handling is for
	 * ksgxd.
	 */
	if (!current_is_ksgxd())
		return get_mem_cgroup_from_mm(current->mm);

	/*
	 * Search the enclave's mm_list to find an mm associated with
	 * this enclave to charge the allocation to.
	 */
	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
		if (!mmget_not_zero(encl_mm->mm))
			continue;

		memcg = get_mem_cgroup_from_mm(encl_mm->mm);

		mmput_async(encl_mm->mm);

		break;
	}

	srcu_read_unlock(&encl->srcu, idx);

	/*
	 * In the rare case that there isn't an mm associated with
	 * the enclave, set memcg to the current active mem_cgroup.
	 * This will be the root mem_cgroup if there is no active
	 * mem_cgroup.
	 */
	if (!memcg)
		return get_mem_cgroup_from_mm(NULL);

	return memcg;
}

/**
 * sgx_encl_alloc_backing() - allocate a new backing storage page
 * @encl:	an enclave pointer
 * @page_index:	enclave page index
 * @backing:	data for accessing backing storage for the page
 *
 * When called from ksgxd, sets the active memcg from one of the
 * mms in the enclave's mm_list prior to any backing page allocation,
 * in order to ensure that shmem page allocations are charged to the
 * enclave.
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
			   struct sgx_backing *backing)
{
	struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl);
	struct mem_cgroup *memcg = set_active_memcg(encl_memcg);
	int ret;

	ret = sgx_encl_get_backing(encl, page_index, backing);

	set_active_memcg(memcg);
	mem_cgroup_put(encl_memcg);

	return ret;
}

/**
 * sgx_encl_lookup_backing() - retrieve an existing backing storage page
 * @encl:	an enclave pointer
 * @page_index:	enclave page index
 * @backing:	data for accessing backing storage for the page
 *
 * Retrieve a backing page for loading data back into an EPC page with ELDU.
 * It is the caller's responsibility to ensure that it is appropriate to use
 * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is
 * not used correctly, this will cause an allocation which is not accounted for.
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
			    struct sgx_backing *backing)
{
	return sgx_encl_get_backing(encl, page_index, backing);
}

/**
 * sgx_encl_put_backing() - Unpin the backing storage
 * @backing:	data for accessing backing storage for the page
 */
void sgx_encl_put_backing(struct sgx_backing *backing)
{
	put_page(backing->pcmd);
	put_page(backing->contents);
}

static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr,
					    void *data)
{
	pte_t pte;
	int ret;

	ret = pte_young(*ptep);
	if (ret) {
		pte = pte_mkold(*ptep);
		set_pte_at((struct mm_struct *)data, addr, ptep, pte);
	}

	return ret;
}

/**
 * sgx_encl_test_and_clear_young() - Test and reset the accessed bit
 * @mm:		mm_struct that is checked
 * @page:	enclave page to be tested for recent access
 *
 * Checks the Access (A) bit from the PTE corresponding to the enclave page and
 * clears it.
 *
 * Return: 1 if the page has been recently accessed and 0 if not.
 */
int sgx_encl_test_and_clear_young(struct mm_struct *mm,
				  struct sgx_encl_page *page)
{
	unsigned long addr = page->desc & PAGE_MASK;
	struct sgx_encl *encl = page->encl;
	struct vm_area_struct *vma;
	int ret;

	ret = sgx_encl_find(mm, addr, &vma);
	if (ret)
		return 0;

	if (encl != vma->vm_private_data)
		return 0;

	ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE,
				  sgx_encl_test_and_clear_young_cb, vma->vm_mm);
	if (ret < 0)
		return 0;

	return ret;
}

struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
					  unsigned long offset,
					  u64 secinfo_flags)
{
	struct sgx_encl_page *encl_page;
	unsigned long prot;

	encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL);
	if (!encl_page)
		return ERR_PTR(-ENOMEM);

	encl_page->desc = encl->base + offset;
	encl_page->encl = encl;

	prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) |
	       _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) |
	       _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC);

	/*
	 * TCS pages must always have RW set for CPU access while the SECINFO
	 * permissions are *always* zero - the CPU ignores the user provided
	 * values and silently overwrites them with zero permissions.
	 */
	if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS)
		prot |= PROT_READ | PROT_WRITE;

	/* Calculate maximum of the VM flags for the page. */
	encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);

	return encl_page;
}

/**
 * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave
 * @encl: the enclave
 * @addr: page aligned pointer to single page for which PTEs will be removed
 *
 * Multiple VMAs may have an enclave page mapped. Remove the PTE mapping
 * for @addr from each VMA. Ensure that the page fault handler is ready to
 * handle new mappings of @addr before calling this function.
 */
void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr)
{
	unsigned long mm_list_version;
	struct sgx_encl_mm *encl_mm;
	struct vm_area_struct *vma;
	int idx, ret;

	do {
		mm_list_version = encl->mm_list_version;

		/* Pairs with smp_wmb() in sgx_encl_mm_add(). */
		smp_rmb();

		idx = srcu_read_lock(&encl->srcu);

		list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
			if (!mmget_not_zero(encl_mm->mm))
				continue;

			mmap_read_lock(encl_mm->mm);

			ret = sgx_encl_find(encl_mm->mm, addr, &vma);
			if (!ret && encl == vma->vm_private_data)
				zap_vma_ptes(vma, addr, PAGE_SIZE);

			mmap_read_unlock(encl_mm->mm);

			mmput_async(encl_mm->mm);
		}

		srcu_read_unlock(&encl->srcu, idx);
	} while (unlikely(encl->mm_list_version != mm_list_version));
}

/**
 * sgx_alloc_va_page() - Allocate a Version Array (VA) page
 * @reclaim: Reclaim EPC pages directly if none available. Enclave
 *           mutex should not be held if this is set.
 *
 * Allocate a free EPC page and convert it to a Version Array (VA) page.
 *
 * Return:
 *   a VA page,
 *   -errno otherwise
 */
struct sgx_epc_page *sgx_alloc_va_page(bool reclaim)
{
	struct sgx_epc_page *epc_page;
	int ret;

	epc_page = sgx_alloc_epc_page(NULL, reclaim);
	if (IS_ERR(epc_page))
		return ERR_CAST(epc_page);

	ret = __epa(sgx_get_epc_virt_addr(epc_page));
	if (ret) {
		WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
		sgx_encl_free_epc_page(epc_page);
		return ERR_PTR(-EFAULT);
	}

	return epc_page;
}

/**
 * sgx_alloc_va_slot - allocate a VA slot
 * @va_page:	a &struct sgx_va_page instance
 *
 * Allocates a slot from a &struct sgx_va_page instance.
 *
 * Return: offset of the slot inside the VA page
 */
unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page)
{
	int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);

	if (slot < SGX_VA_SLOT_COUNT)
		set_bit(slot, va_page->slots);

	return slot << 3;
}

/**
 * sgx_free_va_slot - free a VA slot
 * @va_page:	a &struct sgx_va_page instance
 * @offset:	offset of the slot inside the VA page
 *
 * Frees a slot from a &struct sgx_va_page instance.
 */
void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset)
{
	clear_bit(offset >> 3, va_page->slots);
}

/**
 * sgx_va_page_full - is the VA page full?
 * @va_page:	a &struct sgx_va_page instance
 *
 * Return: true if all slots have been taken
 */
bool sgx_va_page_full(struct sgx_va_page *va_page)
{
	int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);

	return slot == SGX_VA_SLOT_COUNT;
}

/**
 * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
 * @page:	EPC page to be freed
 *
 * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
 * only upon success, it puts the page back on the free page list. Otherwise,
 * it gives a WARNING to indicate the page is leaked.
 */
void sgx_encl_free_epc_page(struct sgx_epc_page *page)
{
	int ret;

	WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);

	ret = __eremove(sgx_get_epc_virt_addr(page));
	if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
		return;

	sgx_free_epc_page(page);
}