// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-20 Intel Corporation. */

#include <linux/lockdep.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/shmem_fs.h>
#include <linux/suspend.h>
#include <linux/sched/mm.h>
#include <asm/sgx.h>
#include "encl.h"
#include "encls.h"
#include "sgx.h"

#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd))
/*
 * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to
 * determine the page index associated with the first PCMD entry
 * within a PCMD page.
 */
#define PCMD_FIRST_MASK GENMASK(4, 0)

/**
 * reclaimer_writing_to_pcmd() - Query if any enclave page associated with
 *                               a PCMD page is in the process of being reclaimed.
 * @encl: Enclave to which PCMD page belongs
 * @start_addr: Address of enclave page using first entry within the PCMD page
 *
 * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is
 * stored. The PCMD data of a reclaimed enclave page contains enough
 * information for the processor to verify the page at the time
 * it is loaded back into the Enclave Page Cache (EPC).
 *
 * The backing storage to which enclave pages are reclaimed is laid out as
 * follows:
 * Encrypted enclave pages:SECS page:PCMD pages
 *
 * Each PCMD page contains the PCMD metadata of
 * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages.
 *
 * A PCMD page can only be truncated if it is (a) empty, and (b) not in the
 * process of getting data (and thus soon being non-empty). (b) is tested with
 * a check if an enclave page sharing the PCMD page is in the process of being
 * reclaimed.
 *
 * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it
 * intends to reclaim that enclave page - it means that the PCMD page
 * associated with that enclave page is about to get some data and thus
 * even if the PCMD page is empty, it should not be truncated.
 *
 * Context: Enclave mutex (&sgx_encl->lock) must be held.
 * Return: 1 if the reclaimer is about to write to the PCMD page
 *         0 if the reclaimer has no intention to write to the PCMD page
 */
static int reclaimer_writing_to_pcmd(struct sgx_encl *encl,
				     unsigned long start_addr)
{
	int reclaimed = 0;
	int i;

	/*
	 * PCMD_FIRST_MASK is based on number of PCMD entries within
	 * PCMD page being 32.
	 */
	BUILD_BUG_ON(PCMDS_PER_PAGE != 32);

	for (i = 0; i < PCMDS_PER_PAGE; i++) {
		struct sgx_encl_page *entry;
		unsigned long addr;

		addr = start_addr + i * PAGE_SIZE;

		/*
		 * Stop when reaching the SECS page - it does not
		 * have a page_array entry and its reclaim is
		 * started and completed with enclave mutex held so
		 * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED
		 * flag.
		 */
		if (addr == encl->base + encl->size)
			break;

		entry = xa_load(&encl->page_array, PFN_DOWN(addr));
		if (!entry)
			continue;

		/*
		 * VA page slot ID uses same bit as the flag so it is important
		 * to ensure that the page is not already in backing store.
		 */
		if (entry->epc_page &&
		    (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) {
			reclaimed = 1;
			break;
		}
	}

	return reclaimed;
}

/*
 * Calculate byte offset of a PCMD struct associated with an enclave page. PCMDs
 * follow right after the EPC data in the backing storage. In addition to the
 * visible enclave pages, there's one extra page slot for SECS, before PCMD
 * structs.
 */
static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl,
							    unsigned long page_index)
{
	pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs);

	return epc_end_off + page_index * sizeof(struct sgx_pcmd);
}
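
/*
 * For example, with 4K pages and an enclave with encl->size == 0x8000:
 * backing offsets [0, 0x8000) hold the encrypted page contents, the SECS
 * slot occupies [0x8000, 0x9000), and the PCMD structs start at 0x9000.
 * The PCMD of page_index 5 (PAGE_SIZE / PCMDS_PER_PAGE == 128 bytes each)
 * is then at byte offset 0x9280, i.e. in backing page PFN_DOWN(0x9280) == 9
 * at pcmd_offset 0x280. The first 32 page_index values (PCMD_FIRST_MASK)
 * share that PCMD page.
 */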

/*
 * Free a page from the backing storage at the given page index.
 */
static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index)
{
	struct inode *inode = file_inode(encl->backing);

	shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
}

/*
 * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
 * Pages" in the SDM.
 */
static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
			   struct sgx_epc_page *epc_page,
			   struct sgx_epc_page *secs_page)
{
	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
	struct sgx_encl *encl = encl_page->encl;
	pgoff_t page_index, page_pcmd_off;
	unsigned long pcmd_first_page;
	struct sgx_pageinfo pginfo;
	struct sgx_backing b;
	bool pcmd_page_empty;
	u8 *pcmd_page;
	int ret;

	if (secs_page)
		page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
	else
		page_index = PFN_DOWN(encl->size);

	/*
	 * Address of enclave page using the first entry within the PCMD page.
	 */
	pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base;

	page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);

	ret = sgx_encl_lookup_backing(encl, page_index, &b);
	if (ret)
		return ret;

	pginfo.addr = encl_page->desc & PAGE_MASK;
	pginfo.contents = (unsigned long)kmap_atomic(b.contents);
	pcmd_page = kmap_atomic(b.pcmd);
	pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;

	if (secs_page)
		pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
	else
		pginfo.secs = 0;

	ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page),
		     sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset);
	if (ret) {
		if (encls_failed(ret))
			ENCLS_WARN(ret, "ELDU");

		ret = -EFAULT;
	}

	memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
	set_page_dirty(b.pcmd);

	/*
	 * The area for the PCMD in the page was zeroed above. Check if the
	 * whole page is now empty meaning that all PCMDs have been zeroed:
	 */
	pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);

	kunmap_atomic(pcmd_page);
	kunmap_atomic((void *)(unsigned long)pginfo.contents);
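
	/*
	 * Take an extra reference on the PCMD backing page: it is still
	 * needed below, after sgx_encl_put_backing() has dropped its
	 * references, to verify that an empty PCMD page was indeed
	 * truncated.
	 */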
	get_page(b.pcmd);
	sgx_encl_put_backing(&b);

	sgx_encl_truncate_backing_page(encl, page_index);

	if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
		sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
		pcmd_page = kmap_atomic(b.pcmd);
		if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
			pr_warn("PCMD page not empty after truncate.\n");
		kunmap_atomic(pcmd_page);
	}

	put_page(b.pcmd);

	return ret;
}

static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
					  struct sgx_epc_page *secs_page)
{
	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_epc_page *epc_page;
	int ret;

	epc_page = sgx_alloc_epc_page(encl_page, false);
	if (IS_ERR(epc_page))
		return epc_page;

	ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
	if (ret) {
		sgx_encl_free_epc_page(epc_page);
		return ERR_PTR(ret);
	}

	sgx_free_va_slot(encl_page->va_page, va_offset);
	list_move(&encl_page->va_page->list, &encl->va_pages);
	encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK;
	encl_page->epc_page = epc_page;

	return epc_page;
}
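
/*
 * Example of the vm_max_prot_bits check below: a page added with a build
 * time maximum of RW can be faulted into a VMA mapped with PROT_READ or
 * PROT_READ|PROT_WRITE, but a VMA with VM_EXEC set is refused because
 * VM_EXEC is not contained in the page's vm_max_prot_bits.
 */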
static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
						unsigned long addr,
						unsigned long vm_flags)
{
	unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
	struct sgx_epc_page *epc_page;
	struct sgx_encl_page *entry;

	entry = xa_load(&encl->page_array, PFN_DOWN(addr));
	if (!entry)
		return ERR_PTR(-EFAULT);

	/*
	 * Verify that the faulted page has equal or higher build time
	 * permissions than the VMA permissions (i.e. the subset of {VM_READ,
	 * VM_WRITE, VM_EXEC} in vma->vm_flags).
	 */
	if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits)
		return ERR_PTR(-EFAULT);

	/* Entry successfully located. */
	if (entry->epc_page) {
		if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)
			return ERR_PTR(-EBUSY);

		return entry;
	}

	if (!(encl->secs.epc_page)) {
		epc_page = sgx_encl_eldu(&encl->secs, NULL);
		if (IS_ERR(epc_page))
			return ERR_CAST(epc_page);
	}

	epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
	if (IS_ERR(epc_page))
		return ERR_CAST(epc_page);

	encl->secs_child_cnt++;
	sgx_mark_page_reclaimable(entry->epc_page);

	return entry;
}

static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
{
	unsigned long addr = (unsigned long)vmf->address;
	struct vm_area_struct *vma = vmf->vma;
	struct sgx_encl_page *entry;
	unsigned long phys_addr;
	struct sgx_encl *encl;
	vm_fault_t ret;

	encl = vma->vm_private_data;

	/*
	 * It's very unlikely but possible that allocating memory for the
	 * mm_list entry of a forked process failed in sgx_vma_open(). When
	 * this happens, vm_private_data is set to NULL.
	 */
	if (unlikely(!encl))
		return VM_FAULT_SIGBUS;

	mutex_lock(&encl->lock);

	entry = sgx_encl_load_page(encl, addr, vma->vm_flags);
	if (IS_ERR(entry)) {
		mutex_unlock(&encl->lock);

		if (PTR_ERR(entry) == -EBUSY)
			return VM_FAULT_NOPAGE;

		return VM_FAULT_SIGBUS;
	}

	phys_addr = sgx_get_epc_phys_addr(entry->epc_page);

	ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
	if (ret != VM_FAULT_NOPAGE) {
		mutex_unlock(&encl->lock);

		return VM_FAULT_SIGBUS;
	}

	sgx_encl_test_and_clear_young(vma->vm_mm, entry);
	mutex_unlock(&encl->lock);

	return VM_FAULT_NOPAGE;
}

static void sgx_vma_open(struct vm_area_struct *vma)
{
	struct sgx_encl *encl = vma->vm_private_data;

	/*
	 * It's possible but unlikely that vm_private_data is NULL. This can
	 * happen in a grandchild of a process, when sgx_encl_mm_add() had
	 * failed to allocate memory in this callback.
	 */
	if (unlikely(!encl))
		return;

	if (sgx_encl_mm_add(encl, vma->vm_mm))
		vma->vm_private_data = NULL;
}
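
/*
 * Note: sgx_encl_may_map() is used both for mprotect() on an enclave VMA,
 * via sgx_vma_mprotect() below, and by the driver's mmap handler when an
 * enclave range is first mapped.
 */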

/**
 * sgx_encl_may_map() - Check if a requested VMA mapping is allowed
 * @encl: an enclave pointer
 * @start: lower bound of the address range, inclusive
 * @end: upper bound of the address range, exclusive
 * @vm_flags: VMA flags
 *
 * Iterate through the enclave pages contained within [@start, @end) to verify
 * that the requested permissions (the subset of {VM_READ, VM_WRITE, VM_EXEC}
 * in @vm_flags) do not exceed the build time permissions of any enclave page
 * within the given address range.
 *
 * An enclave creator must declare the strongest permissions that will be
 * needed for each enclave page. This ensures that mappings have permissions
 * identical to or weaker than the declared permissions.
 *
 * Return: 0 on success, -EACCES otherwise
 */
int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
		     unsigned long end, unsigned long vm_flags)
{
	unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
	struct sgx_encl_page *page;
	unsigned long count = 0;
	int ret = 0;

	XA_STATE(xas, &encl->page_array, PFN_DOWN(start));

	/*
	 * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
	 * conflict with the enclave page permissions.
	 */
	if (current->personality & READ_IMPLIES_EXEC)
		return -EACCES;

	mutex_lock(&encl->lock);
	xas_lock(&xas);
	xas_for_each(&xas, page, PFN_DOWN(end - 1)) {
		if (~page->vm_max_prot_bits & vm_prot_bits) {
			ret = -EACCES;
			break;
		}

		/* Reschedule on every XA_CHECK_SCHED iteration. */
		if (!(++count % XA_CHECK_SCHED)) {
			xas_pause(&xas);
			xas_unlock(&xas);
			mutex_unlock(&encl->lock);

			cond_resched();

			mutex_lock(&encl->lock);
			xas_lock(&xas);
		}
	}
	xas_unlock(&xas);
	mutex_unlock(&encl->lock);

	return ret;
}

static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end, unsigned long newflags)
{
	return sgx_encl_may_map(vma->vm_private_data, start, end, newflags);
}

static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page,
			       unsigned long addr, void *data)
{
	unsigned long offset = addr & ~PAGE_MASK;
	int ret;

	ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
	if (ret)
		return -EIO;

	return 0;
}

static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page,
				unsigned long addr, void *data)
{
	unsigned long offset = addr & ~PAGE_MASK;
	int ret;

	ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
	if (ret)
		return -EIO;

	return 0;
}

/*
 * Load an enclave page to EPC if required, and take encl->lock.
 */
static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl,
						   unsigned long addr,
						   unsigned long vm_flags)
{
	struct sgx_encl_page *entry;

	for ( ; ; ) {
		mutex_lock(&encl->lock);

		entry = sgx_encl_load_page(encl, addr, vm_flags);
		if (PTR_ERR(entry) != -EBUSY)
			break;

		mutex_unlock(&encl->lock);
	}

	if (IS_ERR(entry))
		mutex_unlock(&encl->lock);

	return entry;
}

static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr,
			  void *buf, int len, int write)
{
	struct sgx_encl *encl = vma->vm_private_data;
	struct sgx_encl_page *entry = NULL;
	char data[sizeof(unsigned long)];
	unsigned long align;
	int offset;
	int cnt;
	int ret = 0;
	int i;

	/*
	 * If process was forked, VMA is still there but vm_private_data is set
	 * to NULL.
	 */
	if (!encl)
		return -EFAULT;

	if (!test_bit(SGX_ENCL_DEBUG, &encl->flags))
		return -EFAULT;

	for (i = 0; i < len; i += cnt) {
		entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK,
					      vma->vm_flags);
		if (IS_ERR(entry)) {
			ret = PTR_ERR(entry);
			break;
		}
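
		/*
		 * Access the page one word at a time via EDBGRD/EDBGWR:
		 * e.g. for addr + i == 0x...05 the aligned address is
		 * 0x...00, the offset is 5 and at most
		 * sizeof(unsigned long) - 5 == 3 bytes are copied in this
		 * iteration.
		 */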
		align = ALIGN_DOWN(addr + i, sizeof(unsigned long));
		offset = (addr + i) & (sizeof(unsigned long) - 1);
		cnt = sizeof(unsigned long) - offset;
		cnt = min(cnt, len - i);

		ret = sgx_encl_debug_read(encl, entry, align, data);
		if (ret)
			goto out;

		if (write) {
			memcpy(data + offset, buf + i, cnt);
			ret = sgx_encl_debug_write(encl, entry, align, data);
			if (ret)
				goto out;
		} else {
			memcpy(buf + i, data + offset, cnt);
		}

out:
		mutex_unlock(&encl->lock);

		if (ret)
			break;
	}

	return ret < 0 ? ret : i;
}

const struct vm_operations_struct sgx_vm_ops = {
	.fault = sgx_vma_fault,
	.mprotect = sgx_vma_mprotect,
	.open = sgx_vma_open,
	.access = sgx_vma_access,
};

/**
 * sgx_encl_release - Destroy an enclave instance
 * @ref: address of a kref inside &sgx_encl
 *
 * Used together with kref_put(). Frees all the resources associated with the
 * enclave and the instance itself.
 */
void sgx_encl_release(struct kref *ref)
{
	struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
	struct sgx_va_page *va_page;
	struct sgx_encl_page *entry;
	unsigned long index;

	xa_for_each(&encl->page_array, index, entry) {
		if (entry->epc_page) {
			/*
			 * The page and its radix tree entry cannot be freed
			 * if the page is being held by the reclaimer.
			 */
			if (sgx_unmark_page_reclaimable(entry->epc_page))
				continue;

			sgx_encl_free_epc_page(entry->epc_page);
			encl->secs_child_cnt--;
			entry->epc_page = NULL;
		}

		kfree(entry);
		/* Invoke scheduler to prevent soft lockups. */
		cond_resched();
	}

	xa_destroy(&encl->page_array);

	if (!encl->secs_child_cnt && encl->secs.epc_page) {
		sgx_encl_free_epc_page(encl->secs.epc_page);
		encl->secs.epc_page = NULL;
	}

	while (!list_empty(&encl->va_pages)) {
		va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
					   list);
		list_del(&va_page->list);
		sgx_encl_free_epc_page(va_page->epc_page);
		kfree(va_page);
	}

	if (encl->backing)
		fput(encl->backing);

	cleanup_srcu_struct(&encl->srcu);

	WARN_ON_ONCE(!list_empty(&encl->mm_list));

	/* Detect EPC page leaks. */
	WARN_ON_ONCE(encl->secs_child_cnt);
	WARN_ON_ONCE(encl->secs.epc_page);

	kfree(encl);
}

/*
 * 'mm' is exiting and no longer needs mmu notifications.
 */
static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
	struct sgx_encl_mm *tmp = NULL;

	/*
	 * The enclave itself can remove encl_mm. Note, objects can't be moved
	 * off an RCU protected list, but deletion is ok.
	 */
	spin_lock(&encl_mm->encl->mm_lock);
	list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) {
		if (tmp == encl_mm) {
			list_del_rcu(&encl_mm->list);
			break;
		}
	}
	spin_unlock(&encl_mm->encl->mm_lock);

	if (tmp == encl_mm) {
		synchronize_srcu(&encl_mm->encl->srcu);
		mmu_notifier_put(mn);
	}
}

static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
{
	struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);

	/* 'encl_mm' is going away, put encl_mm->encl reference: */
	kref_put(&encl_mm->encl->refcount, sgx_encl_release);

	kfree(encl_mm);
}

static const struct mmu_notifier_ops sgx_mmu_notifier_ops = {
	.release = sgx_mmu_notifier_release,
	.free_notifier = sgx_mmu_notifier_free,
};
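
/*
 * encl->mm_list is an RCU list: lookups such as sgx_encl_find_mm() below walk
 * it under encl->srcu, while removal in sgx_mmu_notifier_release() above is
 * done under encl->mm_lock and followed by synchronize_srcu() before the
 * notifier reference is dropped.
 */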
static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl,
					    struct mm_struct *mm)
{
	struct sgx_encl_mm *encl_mm = NULL;
	struct sgx_encl_mm *tmp;
	int idx;

	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(tmp, &encl->mm_list, list) {
		if (tmp->mm == mm) {
			encl_mm = tmp;
			break;
		}
	}

	srcu_read_unlock(&encl->srcu, idx);

	return encl_mm;
}

int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
{
	struct sgx_encl_mm *encl_mm;
	int ret;

	/*
	 * Even though a single enclave may be mapped into an mm more than once,
	 * each 'mm' only appears once on encl->mm_list. This is guaranteed by
	 * holding the mm's mmap lock for write before an mm can be added to or
	 * removed from encl->mm_list.
	 */
	mmap_assert_write_locked(mm);

	/*
	 * It's possible that an entry already exists in the mm_list, because it
	 * is removed only on VFS release or process exit.
	 */
	if (sgx_encl_find_mm(encl, mm))
		return 0;

	encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL);
	if (!encl_mm)
		return -ENOMEM;

	/* Grab a refcount for the encl_mm->encl reference: */
	kref_get(&encl->refcount);
	encl_mm->encl = encl;
	encl_mm->mm = mm;
	encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;

	ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm);
	if (ret) {
		kfree(encl_mm);
		return ret;
	}

	spin_lock(&encl->mm_lock);
	list_add_rcu(&encl_mm->list, &encl->mm_list);
	/* Pairs with smp_rmb() in sgx_reclaimer_block(). */
	smp_wmb();
	encl->mm_list_version++;
	spin_unlock(&encl->mm_lock);

	return 0;
}

static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
					      pgoff_t index)
{
	struct inode *inode = encl->backing->f_path.dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	gfp_t gfpmask = mapping_gfp_mask(mapping);

	return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
}

/**
 * sgx_encl_get_backing() - Pin the backing storage
 * @encl: an enclave pointer
 * @page_index: enclave page index
 * @backing: data for accessing backing storage for the page
 *
 * Pin the backing storage pages for storing the encrypted contents and Paging
 * Crypto MetaData (PCMD) of an enclave page.
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
				struct sgx_backing *backing)
{
	pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
	struct page *contents;
	struct page *pcmd;

	contents = sgx_encl_get_backing_page(encl, page_index);
	if (IS_ERR(contents))
		return PTR_ERR(contents);

	pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off));
	if (IS_ERR(pcmd)) {
		put_page(contents);
		return PTR_ERR(pcmd);
	}

	backing->page_index = page_index;
	backing->contents = contents;
	backing->pcmd = pcmd;
	backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);

	return 0;
}
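
/*
 * Both pages pinned by sgx_encl_get_backing() - the contents page and the
 * PCMD page - hold a reference that the caller drops with
 * sgx_encl_put_backing().
 */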

/*
 * When called from ksgxd, returns the mem_cgroup of a struct mm stored
 * in the enclave's mm_list. When not called from ksgxd, just returns
 * the mem_cgroup of the current task.
 */
static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
{
	struct mem_cgroup *memcg = NULL;
	struct sgx_encl_mm *encl_mm;
	int idx;

	/*
	 * If called from normal task context, return the mem_cgroup
	 * of the current task's mm. The remainder of the handling is for
	 * ksgxd.
	 */
	if (!current_is_ksgxd())
		return get_mem_cgroup_from_mm(current->mm);

	/*
	 * Search the enclave's mm_list to find an mm associated with
	 * this enclave to charge the allocation to.
	 */
	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
		if (!mmget_not_zero(encl_mm->mm))
			continue;

		memcg = get_mem_cgroup_from_mm(encl_mm->mm);

		mmput_async(encl_mm->mm);

		break;
	}

	srcu_read_unlock(&encl->srcu, idx);

	/*
	 * In the rare case that there isn't an mm associated with
	 * the enclave, set memcg to the current active mem_cgroup.
	 * This will be the root mem_cgroup if there is no active
	 * mem_cgroup.
	 */
	if (!memcg)
		return get_mem_cgroup_from_mm(NULL);

	return memcg;
}

/**
 * sgx_encl_alloc_backing() - allocate a new backing storage page
 * @encl: an enclave pointer
 * @page_index: enclave page index
 * @backing: data for accessing backing storage for the page
 *
 * When called from ksgxd, sets the active memcg from one of the
 * mms in the enclave's mm_list prior to any backing page allocation,
 * in order to ensure that shmem page allocations are charged to the
 * enclave.
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
			   struct sgx_backing *backing)
{
	struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl);
	struct mem_cgroup *memcg = set_active_memcg(encl_memcg);
	int ret;

	ret = sgx_encl_get_backing(encl, page_index, backing);

	set_active_memcg(memcg);
	mem_cgroup_put(encl_memcg);

	return ret;
}

/**
 * sgx_encl_lookup_backing() - retrieve an existing backing storage page
 * @encl: an enclave pointer
 * @page_index: enclave page index
 * @backing: data for accessing backing storage for the page
 *
 * Retrieve a backing page for loading data back into an EPC page with ELDU.
 * It is the caller's responsibility to ensure that it is appropriate to use
 * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is
 * not used correctly, this will cause an allocation which is not accounted for.
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
			    struct sgx_backing *backing)
{
	return sgx_encl_get_backing(encl, page_index, backing);
}

/**
 * sgx_encl_put_backing() - Unpin the backing storage
 * @backing: data for accessing backing storage for the page
 */
void sgx_encl_put_backing(struct sgx_backing *backing)
{
	put_page(backing->pcmd);
	put_page(backing->contents);
}

static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr,
					    void *data)
{
	pte_t pte;
	int ret;

	ret = pte_young(*ptep);
	if (ret) {
		pte = pte_mkold(*ptep);
		set_pte_at((struct mm_struct *)data, addr, ptep, pte);
	}

	return ret;
}

/**
 * sgx_encl_test_and_clear_young() - Test and reset the accessed bit
 * @mm: mm_struct that is checked
 * @page: enclave page to be tested for recent access
 *
 * Checks the Access (A) bit from the PTE corresponding to the enclave page and
 * clears it.
 *
 * Return: 1 if the page has been recently accessed and 0 if not.
 */
int sgx_encl_test_and_clear_young(struct mm_struct *mm,
				  struct sgx_encl_page *page)
{
	unsigned long addr = page->desc & PAGE_MASK;
	struct sgx_encl *encl = page->encl;
	struct vm_area_struct *vma;
	int ret;

	ret = sgx_encl_find(mm, addr, &vma);
	if (ret)
		return 0;

	if (encl != vma->vm_private_data)
		return 0;

	ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE,
				  sgx_encl_test_and_clear_young_cb, vma->vm_mm);
	if (ret < 0)
		return 0;

	return ret;
}

/**
 * sgx_alloc_va_page() - Allocate a Version Array (VA) page
 *
 * Allocate a free EPC page and convert it to a Version Array (VA) page.
 *
 * Return:
 *   a VA page,
 *   -errno otherwise
 */
struct sgx_epc_page *sgx_alloc_va_page(void)
{
	struct sgx_epc_page *epc_page;
	int ret;

	epc_page = sgx_alloc_epc_page(NULL, true);
	if (IS_ERR(epc_page))
		return ERR_CAST(epc_page);

	ret = __epa(sgx_get_epc_virt_addr(epc_page));
	if (ret) {
		WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
		sgx_encl_free_epc_page(epc_page);
		return ERR_PTR(-EFAULT);
	}

	return epc_page;
}

/**
 * sgx_alloc_va_slot - allocate a VA slot
 * @va_page: a &struct sgx_va_page instance
 *
 * Allocates a slot from a &struct sgx_va_page instance.
 *
 * Return: offset of the slot inside the VA page
 */
unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page)
{
	int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);

	if (slot < SGX_VA_SLOT_COUNT)
		set_bit(slot, va_page->slots);

	return slot << 3;
}

/**
 * sgx_free_va_slot - free a VA slot
 * @va_page: a &struct sgx_va_page instance
 * @offset: offset of the slot inside the VA page
 *
 * Frees a slot from a &struct sgx_va_page instance.
 */
void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset)
{
	clear_bit(offset >> 3, va_page->slots);
}

/**
 * sgx_va_page_full - is the VA page full?
 * @va_page: a &struct sgx_va_page instance
 *
 * Return: true if all slots have been taken
 */
bool sgx_va_page_full(struct sgx_va_page *va_page)
{
	int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);

	return slot == SGX_VA_SLOT_COUNT;
}

/**
 * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
 * @page: EPC page to be freed
 *
 * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
 * only upon success puts the page back on the free page list. Otherwise, it
 * issues a WARNING to indicate the page is leaked.
 */
void sgx_encl_free_epc_page(struct sgx_epc_page *page)
{
	int ret;

	WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);

	ret = __eremove(sgx_get_epc_virt_addr(page));
	if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
		return;

	sgx_free_epc_page(page);
}