/*
 * Copyright 2013 Red Hat Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/mm.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/jump_label.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)

#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
/*
 * Device private memory: see HMM (Documentation/vm/hmm.txt) or hmm.h
 */
DEFINE_STATIC_KEY_FALSE(device_private_key);
EXPORT_SYMBOL(device_private_key);
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */


#if IS_ENABLED(CONFIG_HMM_MIRROR)
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;

/*
 * struct hmm - HMM per mm struct
 *
 * @mm: mm struct this HMM struct is bound to
 * @lock: lock protecting ranges list
 * @sequence: we track updates to the CPU page table with a sequence number
 * @ranges: list of ranges being snapshotted
 * @mirrors: list of mirrors for this mm
 * @mmu_notifier: mmu notifier to track updates to CPU page table
 * @mirrors_sem: read/write semaphore protecting the mirrors list
 */
struct hmm {
	struct mm_struct	*mm;
	spinlock_t		lock;
	atomic_t		sequence;
	struct list_head	ranges;
	struct list_head	mirrors;
	struct mmu_notifier	mmu_notifier;
	struct rw_semaphore	mirrors_sem;
};

/*
 * hmm_register - register HMM against an mm (HMM internal)
 *
 * @mm: mm struct to attach to
 *
 * This is not intended to be used directly by device drivers. It allocates an
 * HMM struct if mm does not have one, and initializes it.
 */
static struct hmm *hmm_register(struct mm_struct *mm)
{
	struct hmm *hmm = READ_ONCE(mm->hmm);
	bool cleanup = false;

	/*
	 * The hmm struct can only be freed once the mm_struct goes away,
	 * hence we should always have pre-allocated a new hmm struct
	 * above.
	 */
	if (hmm)
		return hmm;

	hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
	if (!hmm)
		return NULL;
	INIT_LIST_HEAD(&hmm->mirrors);
	init_rwsem(&hmm->mirrors_sem);
	atomic_set(&hmm->sequence, 0);
	hmm->mmu_notifier.ops = NULL;
	INIT_LIST_HEAD(&hmm->ranges);
	spin_lock_init(&hmm->lock);
	hmm->mm = mm;

	/*
	 * We should only get here if we hold the mmap_sem in write mode,
	 * i.e. on registration of the first mirror through
	 * hmm_mirror_register().
	 */
	hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
	if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
		kfree(hmm);
		return NULL;
	}

	spin_lock(&mm->page_table_lock);
	if (!mm->hmm)
		mm->hmm = hmm;
	else
		cleanup = true;
	spin_unlock(&mm->page_table_lock);

	if (cleanup) {
		mmu_notifier_unregister(&hmm->mmu_notifier, mm);
		kfree(hmm);
	}

	return mm->hmm;
}

void hmm_mm_destroy(struct mm_struct *mm)
{
	kfree(mm->hmm);
}

static void hmm_invalidate_range(struct hmm *hmm,
				 enum hmm_update_type action,
				 unsigned long start,
				 unsigned long end)
{
	struct hmm_mirror *mirror;
	struct hmm_range *range;

	spin_lock(&hmm->lock);
	list_for_each_entry(range, &hmm->ranges, list) {
		unsigned long addr, idx, npages;

		if (end < range->start || start >= range->end)
			continue;

		range->valid = false;
		addr = max(start, range->start);
		idx = (addr - range->start) >> PAGE_SHIFT;
		npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
		memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
	}
	spin_unlock(&hmm->lock);

	down_read(&hmm->mirrors_sem);
	list_for_each_entry(mirror, &hmm->mirrors, list)
		mirror->ops->sync_cpu_device_pagetables(mirror, action,
							start, end);
	up_read(&hmm->mirrors_sem);
}

static void hmm_invalidate_range_start(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long start,
				       unsigned long end)
{
	struct hmm *hmm = mm->hmm;

	VM_BUG_ON(!hmm);

	atomic_inc(&hmm->sequence);
}

static void hmm_invalidate_range_end(struct mmu_notifier *mn,
				     struct mm_struct *mm,
				     unsigned long start,
				     unsigned long end)
{
	struct hmm *hmm = mm->hmm;

	VM_BUG_ON(!hmm);

	hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
}

static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
	.invalidate_range_start	= hmm_invalidate_range_start,
	.invalidate_range_end	= hmm_invalidate_range_end,
};

/*
 * hmm_mirror_register() - register a mirror against an mm
 *
 * @mirror: new mirror struct to register
 * @mm: mm to register against
 *
 * To start mirroring a process address space, the device driver must register
 * an HMM mirror struct.
 *
 * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
 */
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
	/* Sanity check */
	if (!mm || !mirror || !mirror->ops)
		return -EINVAL;

	mirror->hmm = hmm_register(mm);
	if (!mirror->hmm)
		return -ENOMEM;

	down_write(&mirror->hmm->mirrors_sem);
	list_add(&mirror->list, &mirror->hmm->mirrors);
	up_write(&mirror->hmm->mirrors_sem);

	return 0;
}
EXPORT_SYMBOL(hmm_mirror_register);
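
/*
 * Illustrative sketch only, not part of HMM: roughly what the driver side of
 * mirror registration could look like. The my_* names are invented for this
 * example; the callback signature is derived from the
 * sync_cpu_device_pagetables() call in hmm_invalidate_range() above.
 *
 *   static void my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
 *                                             enum hmm_update_type update,
 *                                             unsigned long start,
 *                                             unsigned long end)
 *   {
 *       // invalidate the device page table for [start, end)
 *   }
 *
 *   static const struct hmm_mirror_ops my_mirror_ops = {
 *       .sync_cpu_device_pagetables = my_sync_cpu_device_pagetables,
 *   };
 *
 *   // in the driver, with a struct hmm_mirror embedded in its private data:
 *   down_write(&mm->mmap_sem);
 *   my_data->mirror.ops = &my_mirror_ops;
 *   ret = hmm_mirror_register(&my_data->mirror, mm);
 *   up_write(&mm->mmap_sem);
 */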

/*
 * hmm_mirror_unregister() - unregister a mirror
 *
 * @mirror: mirror struct to unregister
 *
 * Stop mirroring a process address space, and cleanup.
 */
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
	struct hmm *hmm = mirror->hmm;

	down_write(&hmm->mirrors_sem);
	list_del(&mirror->list);
	up_write(&hmm->mirrors_sem);
}
EXPORT_SYMBOL(hmm_mirror_unregister);

struct hmm_vma_walk {
	struct hmm_range	*range;
	unsigned long		last;
	bool			fault;
	bool			block;
	bool			write;
};

static int hmm_vma_do_fault(struct mm_walk *walk,
			    unsigned long addr,
			    hmm_pfn_t *pfn)
{
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct vm_area_struct *vma = walk->vma;
	int r;

	flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
	flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0;
	r = handle_mm_fault(vma, addr, flags);
	if (r & VM_FAULT_RETRY)
		return -EBUSY;
	if (r & VM_FAULT_ERROR) {
		*pfn = HMM_PFN_ERROR;
		return -EFAULT;
	}

	return -EAGAIN;
}

static void hmm_pfns_special(hmm_pfn_t *pfns,
			     unsigned long addr,
			     unsigned long end)
{
	for (; addr < end; addr += PAGE_SIZE, pfns++)
		*pfns = HMM_PFN_SPECIAL;
}

static int hmm_pfns_bad(unsigned long addr,
			unsigned long end,
			struct mm_walk *walk)
{
	/*
	 * walk->private points to the hmm_vma_walk structure, not directly
	 * to the range, so fetch the range through it.
	 */
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	hmm_pfn_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = HMM_PFN_ERROR;

	return 0;
}

static void hmm_pfns_clear(hmm_pfn_t *pfns,
			   unsigned long addr,
			   unsigned long end)
{
	for (; addr < end; addr += PAGE_SIZE, pfns++)
		*pfns = 0;
}

static int hmm_vma_walk_hole(unsigned long addr,
			     unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	hmm_pfn_t *pfns = range->pfns;
	unsigned long i;

	hmm_vma_walk->last = addr;
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++) {
		pfns[i] = HMM_PFN_EMPTY;
		if (hmm_vma_walk->fault) {
			int ret;

			ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
			if (ret != -EAGAIN)
				return ret;
		}
	}

	return hmm_vma_walk->fault ? -EAGAIN : 0;
}

static int hmm_vma_walk_clear(unsigned long addr,
			      unsigned long end,
			      struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	hmm_pfn_t *pfns = range->pfns;
	unsigned long i;

	hmm_vma_walk->last = addr;
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++) {
		pfns[i] = 0;
		if (hmm_vma_walk->fault) {
			int ret;

			ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
			if (ret != -EAGAIN)
				return ret;
		}
	}

	return hmm_vma_walk->fault ? -EAGAIN : 0;
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	hmm_pfn_t *pfns = range->pfns;
	unsigned long addr = start, i;
	bool write_fault;
	hmm_pfn_t flag;
	pte_t *ptep;

	i = (addr - range->start) >> PAGE_SHIFT;
	flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0;
	write_fault = hmm_vma_walk->fault & hmm_vma_walk->write;

again:
	if (pmd_none(*pmdp))
		return hmm_vma_walk_hole(start, end, walk);

	if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB)
		return hmm_pfns_bad(start, end, walk);

	if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
		unsigned long pfn;
		pmd_t pmd;

		/*
		 * No need to take the pmd_lock here: even if some other
		 * thread is splitting the huge pmd we will get that event
		 * through the mmu_notifier callback.
		 *
		 * So just read the pmd value, check again that it is a
		 * transparent huge or device mapping, and compute the
		 * corresponding pfn values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;
		if (pmd_protnone(pmd))
			return hmm_vma_walk_clear(start, end, walk);

		if (write_fault && !pmd_write(pmd))
			return hmm_vma_walk_clear(start, end, walk);

		pfn = pmd_pfn(pmd) + pte_index(addr);
		flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0;
		for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
			pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag;
		return 0;
	}

	if (pmd_bad(*pmdp))
		return hmm_pfns_bad(start, end, walk);

	ptep = pte_offset_map(pmdp, addr);
	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
		pte_t pte = *ptep;

		pfns[i] = 0;

		if (pte_none(pte)) {
			pfns[i] = HMM_PFN_EMPTY;
			if (hmm_vma_walk->fault)
				goto fault;
			continue;
		}

		if (!pte_present(pte)) {
			swp_entry_t entry = pte_to_swp_entry(pte);

			if (!non_swap_entry(entry)) {
				if (hmm_vma_walk->fault)
					goto fault;
				continue;
			}

			/*
			 * This is a special swap entry: ignore migration, use
			 * device and report anything else as error.
			 */
			if (is_device_private_entry(entry)) {
				pfns[i] = hmm_pfn_t_from_pfn(swp_offset(entry));
				if (is_write_device_private_entry(entry)) {
					pfns[i] |= HMM_PFN_WRITE;
				} else if (write_fault)
					goto fault;
				pfns[i] |= HMM_PFN_DEVICE_UNADDRESSABLE;
				pfns[i] |= flag;
			} else if (is_migration_entry(entry)) {
				if (hmm_vma_walk->fault) {
					pte_unmap(ptep);
					hmm_vma_walk->last = addr;
					migration_entry_wait(vma->vm_mm,
							     pmdp, addr);
					return -EAGAIN;
				}
				continue;
			} else {
				/* Report error for everything else */
				pfns[i] = HMM_PFN_ERROR;
			}
			continue;
		}

		if (write_fault && !pte_write(pte))
			goto fault;

		pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag;
		pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0;
		continue;

fault:
		pte_unmap(ptep);
		/* Fault all pages in range */
		return hmm_vma_walk_clear(start, end, walk);
	}
	pte_unmap(ptep - 1);

	return 0;
}

/*
 * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
 * @vma: virtual memory area containing the virtual address range
 * @range: used to track snapshot validity
 * @start: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @pfns: array of hmm_pfn_t provided by the caller, filled in by this function
 * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success
 *
 * This snapshots the CPU page table for a range of virtual addresses. Snapshot
 * validity is tracked by the range struct. See hmm_vma_range_done() for
 * further information.
 *
 * The range struct is initialized here.
 * It tracks the CPU page table, but only if the function returns success (0),
 * in which case the caller must then call hmm_vma_range_done() to stop CPU
 * page table update tracking on this range.
 *
 * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
 * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
 */
int hmm_vma_get_pfns(struct vm_area_struct *vma,
		     struct hmm_range *range,
		     unsigned long start,
		     unsigned long end,
		     hmm_pfn_t *pfns)
{
	struct hmm_vma_walk hmm_vma_walk;
	struct mm_walk mm_walk;
	struct hmm *hmm;

	/* FIXME support hugetlb fs */
	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
		hmm_pfns_special(pfns, start, end);
		return -EINVAL;
	}

	/* Sanity check, this really should not happen ! */
	if (start < vma->vm_start || start >= vma->vm_end)
		return -EINVAL;
	if (end < vma->vm_start || end > vma->vm_end)
		return -EINVAL;

	hmm = hmm_register(vma->vm_mm);
	if (!hmm)
		return -ENOMEM;
	/* Caller must have registered a mirror, via hmm_mirror_register() ! */
	if (!hmm->mmu_notifier.ops)
		return -EINVAL;

	/* Initialize range to track CPU page table update */
	range->start = start;
	range->pfns = pfns;
	range->end = end;
	spin_lock(&hmm->lock);
	range->valid = true;
	list_add_rcu(&range->list, &hmm->ranges);
	spin_unlock(&hmm->lock);

	hmm_vma_walk.fault = false;
	hmm_vma_walk.range = range;
	mm_walk.private = &hmm_vma_walk;

	mm_walk.vma = vma;
	mm_walk.mm = vma->vm_mm;
	mm_walk.pte_entry = NULL;
	mm_walk.test_walk = NULL;
	mm_walk.hugetlb_entry = NULL;
	mm_walk.pmd_entry = hmm_vma_walk_pmd;
	mm_walk.pte_hole = hmm_vma_walk_hole;

	walk_page_range(start, end, &mm_walk);
	return 0;
}
EXPORT_SYMBOL(hmm_vma_get_pfns);
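
/*
 * Illustrative sketch only, not part of HMM: one way a driver could snapshot
 * a range with hmm_vma_get_pfns() and validate it with hmm_vma_range_done().
 * Allocation, locking and retry policy are the driver's business; everything
 * except the two HMM calls is an assumption made for the example.
 *
 *   unsigned long npages = (end - start) >> PAGE_SHIFT;
 *   struct hmm_range range;
 *   hmm_pfn_t *pfns;
 *   int ret;
 *
 *   pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
 *   if (!pfns)
 *       return -ENOMEM;
 *
 *   down_read(&mm->mmap_sem);
 *   ret = hmm_vma_get_pfns(vma, &range, start, end, pfns);
 *   if (!ret) {
 *       // prepare the device page table update from pfns[], then take the
 *       // driver lock that serializes device page table updates and:
 *       if (hmm_vma_range_done(vma, &range)) {
 *           // commit the update
 *       } else {
 *           // snapshot was invalidated, discard and retry
 *       }
 *   }
 *   up_read(&mm->mmap_sem);
 *   kvfree(pfns);
 */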

/*
 * hmm_vma_range_done() - stop tracking change to CPU page table over a range
 * @vma: virtual memory area containing the virtual address range
 * @range: range being tracked
 * Returns: false if range data has been invalidated, true otherwise
 *
 * Range struct is used to track updates to the CPU page table after a call to
 * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
 * using the data, or wants to lock updates to the data it got from those
 * functions, it must call the hmm_vma_range_done() function, which will then
 * stop tracking CPU page table updates.
 *
 * Note that the device driver must still implement general CPU page table
 * update tracking either by using hmm_mirror (see hmm_mirror_register()) or
 * by using the mmu_notifier API directly.
 *
 * CPU page table update tracking done through hmm_range is only temporary and
 * to be used while trying to duplicate CPU page table contents for a range of
 * virtual addresses.
 *
 * There are two ways to use this:
 * again:
 *   hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
 *   trans = device_build_page_table_update_transaction(pfns);
 *   device_page_table_lock();
 *   if (!hmm_vma_range_done(vma, range)) {
 *     device_page_table_unlock();
 *     goto again;
 *   }
 *   device_commit_transaction(trans);
 *   device_page_table_unlock();
 *
 * Or:
 *   hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
 *   device_page_table_lock();
 *   hmm_vma_range_done(vma, range);
 *   device_update_page_table(pfns);
 *   device_page_table_unlock();
 */
bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range)
{
	unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
	struct hmm *hmm;

	if (range->end <= range->start) {
		BUG();
		return false;
	}

	hmm = hmm_register(vma->vm_mm);
	if (!hmm) {
		memset(range->pfns, 0, sizeof(*range->pfns) * npages);
		return false;
	}

	spin_lock(&hmm->lock);
	list_del_rcu(&range->list);
	spin_unlock(&hmm->lock);

	return range->valid;
}
EXPORT_SYMBOL(hmm_vma_range_done);

/*
 * hmm_vma_fault() - try to fault some address in a virtual address range
 * @vma: virtual memory area containing the virtual address range
 * @range: used to track pfns array content validity
 * @start: fault range virtual start address (inclusive)
 * @end: fault range virtual end address (exclusive)
 * @pfns: array of hmm_pfn_t, only entries with the fault flag set are faulted
 * @write: is it a write fault
 * @block: allow blocking on fault (if true it sleeps and does not drop the
 *         mmap_sem)
 * Returns: 0 on success, error otherwise (-EAGAIN means the mmap_sem has been
 *          dropped)
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs.
 *
 * On error, for one virtual address in the range, the function will set the
 * hmm_pfn_t error flag for the corresponding pfn entry.
 *
 * Expected use pattern:
 * retry:
 *   down_read(&mm->mmap_sem);
 *   // Find vma and address device wants to fault, initialize hmm_pfn_t
 *   // array accordingly
 *   ret = hmm_vma_fault(vma, start, end, pfns, allow_retry);
 *   switch (ret) {
 *   case -EAGAIN:
 *     hmm_vma_range_done(vma, range);
 *     // You might want to rate limit or yield to play nicely, you may
 *     // also commit any valid pfn in the array assuming that you are
 *     // getting true from hmm_vma_range_done()
 *     goto retry;
 *   case 0:
 *     break;
 *   default:
 *     // Handle error !
 *     up_read(&mm->mmap_sem);
 *     return;
 *   }
 *   // Take device driver lock that serializes device page table update
 *   driver_lock_device_page_table_update();
 *   hmm_vma_range_done(vma, range);
 *   // Commit pfns we got from hmm_vma_fault()
 *   driver_unlock_device_page_table_update();
 *   up_read(&mm->mmap_sem);
 *
 * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURNS SUCCESS (0)
 * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
 *
 * YOU HAVE BEEN WARNED !
 */
int hmm_vma_fault(struct vm_area_struct *vma,
		  struct hmm_range *range,
		  unsigned long start,
		  unsigned long end,
		  hmm_pfn_t *pfns,
		  bool write,
		  bool block)
{
	struct hmm_vma_walk hmm_vma_walk;
	struct mm_walk mm_walk;
	struct hmm *hmm;
	int ret;

	/* Sanity check, this really should not happen ! */
	if (start < vma->vm_start || start >= vma->vm_end)
		return -EINVAL;
	if (end < vma->vm_start || end > vma->vm_end)
		return -EINVAL;

	hmm = hmm_register(vma->vm_mm);
	if (!hmm) {
		hmm_pfns_clear(pfns, start, end);
		return -ENOMEM;
	}
	/* Caller must have registered a mirror using hmm_mirror_register() */
	if (!hmm->mmu_notifier.ops)
		return -EINVAL;

	/* Initialize range to track CPU page table update */
	range->start = start;
	range->pfns = pfns;
	range->end = end;
	spin_lock(&hmm->lock);
	range->valid = true;
	list_add_rcu(&range->list, &hmm->ranges);
	spin_unlock(&hmm->lock);

	/* FIXME support hugetlb fs */
	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
		hmm_pfns_special(pfns, start, end);
		return 0;
	}

	hmm_vma_walk.fault = true;
	hmm_vma_walk.write = write;
	hmm_vma_walk.block = block;
	hmm_vma_walk.range = range;
	mm_walk.private = &hmm_vma_walk;
	hmm_vma_walk.last = range->start;

	mm_walk.vma = vma;
	mm_walk.mm = vma->vm_mm;
	mm_walk.pte_entry = NULL;
	mm_walk.test_walk = NULL;
	mm_walk.hugetlb_entry = NULL;
	mm_walk.pmd_entry = hmm_vma_walk_pmd;
	mm_walk.pte_hole = hmm_vma_walk_hole;

	do {
		ret = walk_page_range(start, end, &mm_walk);
		start = hmm_vma_walk.last;
	} while (ret == -EAGAIN);

	if (ret) {
		unsigned long i;

		i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
		hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end);
		hmm_vma_range_done(vma, range);
	}
	return ret;
}
EXPORT_SYMBOL(hmm_vma_fault);
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */


#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
				       unsigned long addr)
{
	struct page *page;

	page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
	if (!page)
		return NULL;
	lock_page(page);
	return page;
}
EXPORT_SYMBOL(hmm_vma_alloc_locked_page);


static void hmm_devmem_ref_release(struct percpu_ref *ref)
{
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	complete(&devmem->completion);
}

static void hmm_devmem_ref_exit(void *data)
{
	struct percpu_ref *ref = data;
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	percpu_ref_exit(ref);
	devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data);
}

static void hmm_devmem_ref_kill(void *data)
{
	struct percpu_ref *ref = data;
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	percpu_ref_kill(ref);
	wait_for_completion(&devmem->completion);
	devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data);
}

static int hmm_devmem_fault(struct vm_area_struct *vma,
			    unsigned long addr,
			    const struct page *page,
			    unsigned int flags,
			    pmd_t *pmdp)
{
	struct hmm_devmem *devmem = page->pgmap->data;

	return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
}

static void hmm_devmem_free(struct page *page, void *data)
{
	struct hmm_devmem *devmem = data;

	devmem->ops->free(devmem, page);
}

static DEFINE_MUTEX(hmm_devmem_lock);
static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);

static void hmm_devmem_radix_release(struct resource *resource)
{
	resource_size_t key, align_start, align_size;

	align_start = resource->start & ~(PA_SECTION_SIZE - 1);
	align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE);

	mutex_lock(&hmm_devmem_lock);
	for (key = resource->start;
	     key <= resource->end;
	     key += PA_SECTION_SIZE)
		radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT);
	mutex_unlock(&hmm_devmem_lock);
}

static void hmm_devmem_release(struct device *dev, void *data)
{
	struct hmm_devmem *devmem = data;
	struct resource *resource = devmem->resource;
	unsigned long start_pfn, npages;
	struct zone *zone;
	struct page *page;

	if (percpu_ref_tryget_live(&devmem->ref)) {
		dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
		percpu_ref_put(&devmem->ref);
	}

	/* pages are dead and unused, undo the arch mapping */
	start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
	npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;

	page = pfn_to_page(start_pfn);
	zone = page_zone(page);

	mem_hotplug_begin();
	if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
		__remove_pages(zone, start_pfn, npages, NULL);
	else
		arch_remove_memory(start_pfn << PAGE_SHIFT,
				   npages << PAGE_SHIFT, NULL);
	mem_hotplug_done();

	hmm_devmem_radix_release(resource);
}

static struct hmm_devmem *hmm_devmem_find(resource_size_t phys)
{
	WARN_ON_ONCE(!rcu_read_lock_held());

	return radix_tree_lookup(&hmm_devmem_radix, phys >> PA_SECTION_SHIFT);
}

static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
{
	resource_size_t key, align_start, align_size, align_end;
	struct device *device = devmem->device;
	int ret, nid, is_ram;
	unsigned long pfn;

	align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
	align_size = ALIGN(devmem->resource->start +
			   resource_size(devmem->resource),
			   PA_SECTION_SIZE) - align_start;

	is_ram = region_intersects(align_start, align_size,
				   IORESOURCE_SYSTEM_RAM,
				   IORES_DESC_NONE);
	if (is_ram == REGION_MIXED) {
		WARN_ONCE(1, "%s attempted on mixed region %pr\n",
			  __func__, devmem->resource);
		return -ENXIO;
	}
	if (is_ram == REGION_INTERSECTS)
		return -ENXIO;

	if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
		devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
	else
		devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;

	devmem->pagemap.res = *devmem->resource;
	devmem->pagemap.page_fault = hmm_devmem_fault;
	devmem->pagemap.page_free = hmm_devmem_free;
	devmem->pagemap.dev = devmem->device;
	devmem->pagemap.ref = &devmem->ref;
	devmem->pagemap.data = devmem;

	mutex_lock(&hmm_devmem_lock);
	align_end = align_start + align_size - 1;
	for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
		struct hmm_devmem *dup;

		rcu_read_lock();
		dup = hmm_devmem_find(key);
		rcu_read_unlock();
		if (dup) {
			dev_err(device, "%s: collides with mapping for %s\n",
				__func__, dev_name(dup->device));
			mutex_unlock(&hmm_devmem_lock);
			ret = -EBUSY;
			goto error;
		}
		ret = radix_tree_insert(&hmm_devmem_radix,
					key >> PA_SECTION_SHIFT,
					devmem);
		if (ret) {
			dev_err(device, "%s: failed: %d\n", __func__, ret);
			mutex_unlock(&hmm_devmem_lock);
			goto error_radix;
		}
	}
	mutex_unlock(&hmm_devmem_lock);

	nid = dev_to_node(device);
	if (nid < 0)
		nid = numa_mem_id();

	mem_hotplug_begin();
	/*
	 * For device private memory we call add_pages() as we only need to
	 * allocate and initialize struct page for the device memory. Moreover,
	 * the device memory is not accessible by the CPU, so we do not want
	 * to create a linear mapping for it the way arch_add_memory() would.
	 *
	 * For device public memory, which is accessible by the CPU, we do
	 * want the linear mapping and thus use arch_add_memory().
	 */
	if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
		ret = arch_add_memory(nid, align_start, align_size, NULL,
				      false);
	else
		ret = add_pages(nid, align_start >> PAGE_SHIFT,
				align_size >> PAGE_SHIFT, NULL, false);
	if (ret) {
		mem_hotplug_done();
		goto error_add_memory;
	}
	move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
			       align_start >> PAGE_SHIFT,
			       align_size >> PAGE_SHIFT, NULL);
	mem_hotplug_done();

	for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
		struct page *page = pfn_to_page(pfn);

		page->pgmap = &devmem->pagemap;
	}
	return 0;

error_add_memory:
	untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
error_radix:
	hmm_devmem_radix_release(devmem->resource);
error:
	return ret;
}

static int hmm_devmem_match(struct device *dev, void *data, void *match_data)
{
	struct hmm_devmem *devmem = data;

	return devmem->resource == match_data;
}

static void hmm_devmem_pages_remove(struct hmm_devmem *devmem)
{
	devres_release(devmem->device, &hmm_devmem_release,
		       &hmm_devmem_match, devmem->resource);
}

/*
 * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
 *
 * @ops: memory event device driver callback (see struct hmm_devmem_ops)
 * @device: device struct to bind the resource to
 * @size: size in bytes of the device memory to add
 * Returns: pointer to new hmm_devmem struct, ERR_PTR otherwise
 *
 * This function first finds an empty range of physical address big enough to
 * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
 * in turn allocates struct pages. It does not do anything beyond that; all
 * events affecting the memory will go through the various callbacks provided
 * by the hmm_devmem_ops struct.
 *
 * The device driver should call this function during device initialization
 * and is then responsible for the memory management. HMM only provides
 * helpers.
 */
struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
				  struct device *device,
				  unsigned long size)
{
	struct hmm_devmem *devmem;
	resource_size_t addr;
	int ret;

	static_branch_enable(&device_private_key);

	devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
				   GFP_KERNEL, dev_to_node(device));
	if (!devmem)
		return ERR_PTR(-ENOMEM);

	init_completion(&devmem->completion);
	devmem->pfn_first = -1UL;
	devmem->pfn_last = -1UL;
	devmem->resource = NULL;
	devmem->device = device;
	devmem->ops = ops;

	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
			      0, GFP_KERNEL);
	if (ret)
		goto error_percpu_ref;

	ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
	if (ret)
		goto error_devm_add_action;

	size = ALIGN(size, PA_SECTION_SIZE);
	addr = min((unsigned long)iomem_resource.end,
		   (1UL << MAX_PHYSMEM_BITS) - 1);
	addr = addr - size + 1UL;

	/*
	 * FIXME add a new helper to quickly walk resource tree and find free
	 * range
	 *
	 * FIXME what about ioport_resource resource ?
	 */
	for (; addr > size && addr >= iomem_resource.start; addr -= size) {
		ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
		if (ret != REGION_DISJOINT)
			continue;

		devmem->resource = devm_request_mem_region(device, addr, size,
							   dev_name(device));
		if (!devmem->resource) {
			ret = -ENOMEM;
			goto error_no_resource;
		}
		break;
	}
	if (!devmem->resource) {
		ret = -ERANGE;
		goto error_no_resource;
	}

	devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
	devmem->pfn_last = devmem->pfn_first +
			   (resource_size(devmem->resource) >> PAGE_SHIFT);

	ret = hmm_devmem_pages_create(devmem);
	if (ret)
		goto error_pages;

	devres_add(device, devmem);

	ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
	if (ret) {
		hmm_devmem_remove(devmem);
		return ERR_PTR(ret);
	}

	return devmem;

error_pages:
	devm_release_mem_region(device, devmem->resource->start,
				resource_size(devmem->resource));
error_no_resource:
error_devm_add_action:
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
error_percpu_ref:
	devres_free(devmem);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(hmm_devmem_add);
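
/*
 * Illustrative sketch only, not part of HMM: how a driver might hotplug its
 * device memory. All my_* names (and pdev/MY_DEVMEM_SIZE) are invented; the
 * hmm_devmem_ops callback signatures are derived from the
 * hmm_devmem_fault()/hmm_devmem_free() dispatchers above.
 *
 *   static void my_devmem_free(struct hmm_devmem *devmem, struct page *page)
 *   {
 *       // reclaim the device memory backing this struct page
 *   }
 *
 *   static int my_devmem_fault(struct hmm_devmem *devmem,
 *                              struct vm_area_struct *vma,
 *                              unsigned long addr,
 *                              const struct page *page,
 *                              unsigned int flags,
 *                              pmd_t *pmdp)
 *   {
 *       // a real driver would migrate the page back to system memory here;
 *       // this sketch simply fails the CPU fault
 *       return VM_FAULT_SIGBUS;
 *   }
 *
 *   static const struct hmm_devmem_ops my_devmem_ops = {
 *       .free = my_devmem_free,
 *       .fault = my_devmem_fault,
 *   };
 *
 *   devmem = hmm_devmem_add(&my_devmem_ops, &pdev->dev, MY_DEVMEM_SIZE);
 *   if (IS_ERR(devmem))
 *       return PTR_ERR(devmem);
 */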

struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
					   struct device *device,
					   struct resource *res)
{
	struct hmm_devmem *devmem;
	int ret;

	if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
		return ERR_PTR(-EINVAL);

	static_branch_enable(&device_private_key);

	devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
				   GFP_KERNEL, dev_to_node(device));
	if (!devmem)
		return ERR_PTR(-ENOMEM);

	init_completion(&devmem->completion);
	devmem->pfn_first = -1UL;
	devmem->pfn_last = -1UL;
	devmem->resource = res;
	devmem->device = device;
	devmem->ops = ops;

	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
			      0, GFP_KERNEL);
	if (ret)
		goto error_percpu_ref;

	ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
	if (ret)
		goto error_devm_add_action;

	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
	devmem->pfn_last = devmem->pfn_first +
			   (resource_size(devmem->resource) >> PAGE_SHIFT);

	ret = hmm_devmem_pages_create(devmem);
	if (ret)
		goto error_devm_add_action;

	devres_add(device, devmem);

	ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
	if (ret) {
		hmm_devmem_remove(devmem);
		return ERR_PTR(ret);
	}

	return devmem;

error_devm_add_action:
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
error_percpu_ref:
	devres_free(devmem);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(hmm_devmem_add_resource);

/*
 * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
 *
 * @devmem: hmm_devmem struct used to track and manage the ZONE_DEVICE memory
 *
 * This will hot-unplug memory that was hotplugged by hmm_devmem_add() on
 * behalf of the device driver. It will free struct page and remove the
 * resource that reserved the physical address range for this device memory.
 */
void hmm_devmem_remove(struct hmm_devmem *devmem)
{
	resource_size_t start, size;
	struct device *device;
	bool cdm = false;

	if (!devmem)
		return;

	device = devmem->device;
	start = devmem->resource->start;
	size = resource_size(devmem->resource);

	cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY;
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
	hmm_devmem_pages_remove(devmem);

	if (!cdm)
		devm_release_mem_region(device, start, size);
}
EXPORT_SYMBOL(hmm_devmem_remove);

/*
 * A device driver that wants to handle multiple devices' memory through a
 * single fake device can use hmm_device to do so. This is purely a helper;
 * it is not required in order to use any other HMM functionality.
 */
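/*
 * Illustrative sketch only: obtaining and releasing such a fake device. The
 * drvdata pointer (my_driver_private here) is whatever the driver wants to
 * attach to it.
 *
 *   struct hmm_device *hmm_device;
 *
 *   hmm_device = hmm_device_new(my_driver_private);
 *   if (IS_ERR(hmm_device))
 *       return PTR_ERR(hmm_device);
 *   ...
 *   hmm_device_put(hmm_device);
 */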
#define HMM_DEVICE_MAX 256

static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
static DEFINE_SPINLOCK(hmm_device_lock);
static struct class *hmm_device_class;
static dev_t hmm_device_devt;

static void hmm_device_release(struct device *device)
{
	struct hmm_device *hmm_device;

	hmm_device = container_of(device, struct hmm_device, device);
	spin_lock(&hmm_device_lock);
	clear_bit(hmm_device->minor, hmm_device_mask);
	spin_unlock(&hmm_device_lock);

	kfree(hmm_device);
}

struct hmm_device *hmm_device_new(void *drvdata)
{
	struct hmm_device *hmm_device;

	hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
	if (!hmm_device)
		return ERR_PTR(-ENOMEM);

	spin_lock(&hmm_device_lock);
	hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
	if (hmm_device->minor >= HMM_DEVICE_MAX) {
		spin_unlock(&hmm_device_lock);
		kfree(hmm_device);
		return ERR_PTR(-EBUSY);
	}
	set_bit(hmm_device->minor, hmm_device_mask);
	spin_unlock(&hmm_device_lock);

	dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
	hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
					hmm_device->minor);
	hmm_device->device.release = hmm_device_release;
	dev_set_drvdata(&hmm_device->device, drvdata);
	hmm_device->device.class = hmm_device_class;
	device_initialize(&hmm_device->device);

	return hmm_device;
}
EXPORT_SYMBOL(hmm_device_new);

void hmm_device_put(struct hmm_device *hmm_device)
{
	put_device(&hmm_device->device);
}
EXPORT_SYMBOL(hmm_device_put);

static int __init hmm_init(void)
{
	int ret;

	ret = alloc_chrdev_region(&hmm_device_devt, 0,
				  HMM_DEVICE_MAX,
				  "hmm_device");
	if (ret)
		return ret;

	hmm_device_class = class_create(THIS_MODULE, "hmm_device");
	if (IS_ERR(hmm_device_class)) {
		unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
		return PTR_ERR(hmm_device_class);
	}
	return 0;
}

device_initcall(hmm_init);
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */