/*
 * Copyright 2013 Red Hat Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/mm.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/jump_label.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)

#if IS_ENABLED(CONFIG_HMM_MIRROR)
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;

/*
 * struct hmm - HMM per mm struct
 *
 * @mm: mm struct this HMM struct is bound to
 * @lock: lock protecting ranges list
 * @ranges: list of ranges being snapshotted
 * @mirrors: list of mirrors for this mm
 * @mmu_notifier: mmu notifier to track updates to CPU page table
 * @mirrors_sem: read/write semaphore protecting the mirrors list
 */
struct hmm {
	struct mm_struct *mm;
	spinlock_t lock;
	struct list_head ranges;
	struct list_head mirrors;
	struct mmu_notifier mmu_notifier;
	struct rw_semaphore mirrors_sem;
};

/*
 * hmm_register - register HMM against an mm (HMM internal)
 *
 * @mm: mm struct to attach to
 *
 * This is not intended to be used directly by device drivers. It allocates an
 * HMM struct if mm does not have one, and initializes it.
 */
static struct hmm *hmm_register(struct mm_struct *mm)
{
	struct hmm *hmm = READ_ONCE(mm->hmm);
	bool cleanup = false;

	/*
	 * The hmm struct can only be freed once the mm_struct goes away,
	 * hence an hmm struct that is already registered for this mm can
	 * simply be reused.
	 */
	if (hmm)
		return hmm;

	hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
	if (!hmm)
		return NULL;
	INIT_LIST_HEAD(&hmm->mirrors);
	init_rwsem(&hmm->mirrors_sem);
	hmm->mmu_notifier.ops = NULL;
	INIT_LIST_HEAD(&hmm->ranges);
	spin_lock_init(&hmm->lock);
	hmm->mm = mm;

	spin_lock(&mm->page_table_lock);
	if (!mm->hmm)
		mm->hmm = hmm;
	else
		cleanup = true;
	spin_unlock(&mm->page_table_lock);

	if (cleanup)
		goto error;

	/*
	 * We should only get here if we hold the mmap_sem in write mode,
	 * i.e. on registration of the first mirror through
	 * hmm_mirror_register().
	 */
	hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
	if (__mmu_notifier_register(&hmm->mmu_notifier, mm))
		goto error_mm;

	return mm->hmm;

error_mm:
	spin_lock(&mm->page_table_lock);
	if (mm->hmm == hmm)
		mm->hmm = NULL;
	spin_unlock(&mm->page_table_lock);
error:
	kfree(hmm);
	return NULL;
}

void hmm_mm_destroy(struct mm_struct *mm)
{
	kfree(mm->hmm);
}

static int hmm_invalidate_range(struct hmm *hmm, bool device,
				const struct hmm_update *update)
{
	struct hmm_mirror *mirror;
	struct hmm_range *range;

	spin_lock(&hmm->lock);
	list_for_each_entry(range, &hmm->ranges, list) {
		unsigned long addr, idx, npages;

		if (update->end < range->start || update->start >= range->end)
			continue;

		range->valid = false;
		addr = max(update->start, range->start);
		idx = (addr - range->start) >> PAGE_SHIFT;
		npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT;
		memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
	}
	spin_unlock(&hmm->lock);

	if (!device)
		return 0;

	down_read(&hmm->mirrors_sem);
	list_for_each_entry(mirror, &hmm->mirrors, list) {
		int ret;

		ret = mirror->ops->sync_cpu_device_pagetables(mirror, update);
		if (!update->blockable && ret == -EAGAIN) {
			up_read(&hmm->mirrors_sem);
			return -EAGAIN;
		}
	}
	up_read(&hmm->mirrors_sem);

	return 0;
}

static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct hmm_mirror *mirror;
	struct hmm *hmm = mm->hmm;

	down_write(&hmm->mirrors_sem);
	mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
					  list);
	while (mirror) {
		list_del_init(&mirror->list);
		if (mirror->ops->release) {
			/*
			 * Drop mirrors_sem so the callback can wait on any
			 * pending work that might itself trigger an
			 * mmu_notifier callback and thus would deadlock with
			 * us.
			 */
			up_write(&hmm->mirrors_sem);
			mirror->ops->release(mirror);
			down_write(&hmm->mirrors_sem);
		}
		mirror = list_first_entry_or_null(&hmm->mirrors,
						  struct hmm_mirror, list);
	}
	up_write(&hmm->mirrors_sem);
}

static int hmm_invalidate_range_start(struct mmu_notifier *mn,
				      struct mm_struct *mm,
				      unsigned long start,
				      unsigned long end,
				      bool blockable)
{
	struct hmm_update update;
	struct hmm *hmm = mm->hmm;

	VM_BUG_ON(!hmm);

	update.start = start;
	update.end = end;
	update.event = HMM_UPDATE_INVALIDATE;
	update.blockable = blockable;
	return hmm_invalidate_range(hmm, true, &update);
}

static void hmm_invalidate_range_end(struct mmu_notifier *mn,
				     struct mm_struct *mm,
				     unsigned long start,
				     unsigned long end)
{
	struct hmm_update update;
	struct hmm *hmm = mm->hmm;

	VM_BUG_ON(!hmm);

	update.start = start;
	update.end = end;
	update.event = HMM_UPDATE_INVALIDATE;
	update.blockable = true;
	hmm_invalidate_range(hmm, false, &update);
}

static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
	.release = hmm_release,
	.invalidate_range_start = hmm_invalidate_range_start,
	.invalidate_range_end = hmm_invalidate_range_end,
};

/*
 * hmm_mirror_register() - register a mirror against an mm
 *
 * @mirror: new mirror struct to register
 * @mm: mm to register against
 *
 * To start mirroring a process address space, the device driver must register
 * an HMM mirror struct.
 *
 * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
 */
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
	/* Sanity check */
	if (!mm || !mirror || !mirror->ops)
		return -EINVAL;

again:
	mirror->hmm = hmm_register(mm);
	if (!mirror->hmm)
		return -ENOMEM;

	down_write(&mirror->hmm->mirrors_sem);
	if (mirror->hmm->mm == NULL) {
		/*
		 * A racing hmm_mirror_unregister() is about to destroy the hmm
		 * struct. Try again to allocate a new one.
		 */
		up_write(&mirror->hmm->mirrors_sem);
		mirror->hmm = NULL;
		goto again;
	} else {
		list_add(&mirror->list, &mirror->hmm->mirrors);
		up_write(&mirror->hmm->mirrors_sem);
	}

	return 0;
}
EXPORT_SYMBOL(hmm_mirror_register);

/*
 * hmm_mirror_unregister() - unregister a mirror
 *
 * @mirror: mirror struct to unregister
 *
 * Stop mirroring a process address space, and clean up.
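 *
 * A minimal usage sketch, assuming hypothetical driver state (my_drv, a
 * my_mirror_ops providing sync_cpu_device_pagetables(), and my_release());
 * none of these names are part of HMM:
 *
 *   down_write(&mm->mmap_sem);
 *   my_drv->mirror.ops = &my_mirror_ops;
 *   ret = hmm_mirror_register(&my_drv->mirror, mm);
 *   up_write(&mm->mmap_sem);
 *   ...
 *   // later, when the driver stops mirroring this mm (e.g. my_release()):
 *   hmm_mirror_unregister(&my_drv->mirror);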
 */
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
	bool should_unregister = false;
	struct mm_struct *mm;
	struct hmm *hmm;

	if (mirror->hmm == NULL)
		return;

	hmm = mirror->hmm;
	down_write(&hmm->mirrors_sem);
	list_del_init(&mirror->list);
	should_unregister = list_empty(&hmm->mirrors);
	mirror->hmm = NULL;
	mm = hmm->mm;
	hmm->mm = NULL;
	up_write(&hmm->mirrors_sem);

	if (!should_unregister || mm == NULL)
		return;

	mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);

	spin_lock(&mm->page_table_lock);
	if (mm->hmm == hmm)
		mm->hmm = NULL;
	spin_unlock(&mm->page_table_lock);

	kfree(hmm);
}
EXPORT_SYMBOL(hmm_mirror_unregister);

struct hmm_vma_walk {
	struct hmm_range *range;
	unsigned long last;
	bool fault;
	bool block;
};

static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
			    bool write_fault, uint64_t *pfn)
{
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	vm_fault_t ret;

	flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
	flags |= write_fault ? FAULT_FLAG_WRITE : 0;
	ret = handle_mm_fault(vma, addr, flags);
	if (ret & VM_FAULT_RETRY)
		return -EBUSY;
	if (ret & VM_FAULT_ERROR) {
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	}

	return -EAGAIN;
}

static int hmm_pfns_bad(unsigned long addr,
			unsigned long end,
			struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = range->values[HMM_PFN_ERROR];

	return 0;
}

/*
 * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s)
 * @start: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @fault: should we fault or not ?
 * @write_fault: write fault ?
 * @walk: mm_walk structure
 * Returns: 0 on success, -EAGAIN after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
			      bool fault, bool write_fault,
			      struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	hmm_vma_walk->last = addr;
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++) {
		pfns[i] = range->values[HMM_PFN_NONE];
		if (fault || write_fault) {
			int ret;

			ret = hmm_vma_do_fault(walk, addr, write_fault,
					       &pfns[i]);
			if (ret != -EAGAIN)
				return ret;
		}
	}

	return (fault || write_fault) ? -EAGAIN : 0;
}

static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				      uint64_t pfns, uint64_t cpu_flags,
				      bool *fault, bool *write_fault)
{
	struct hmm_range *range = hmm_vma_walk->range;

	*fault = *write_fault = false;
	if (!hmm_vma_walk->fault)
		return;

	/* We aren't asked to do anything ... */
	if (!(pfns & range->flags[HMM_PFN_VALID]))
		return;
	/* If this is device memory then only fault if explicitly requested */
	if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
		/* Do we fault on device memory ? */
		if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
			*write_fault = pfns & range->flags[HMM_PFN_WRITE];
			*fault = true;
		}
		return;
	}

	/* If CPU page table is not valid then we need to fault */
	*fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
	/* Need to write fault ? */
	if ((pfns & range->flags[HMM_PFN_WRITE]) &&
	    !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
		*write_fault = true;
		*fault = true;
	}
}

static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				 const uint64_t *pfns, unsigned long npages,
				 uint64_t cpu_flags, bool *fault,
				 bool *write_fault)
{
	unsigned long i;

	if (!hmm_vma_walk->fault) {
		*fault = *write_fault = false;
		return;
	}

	for (i = 0; i < npages; ++i) {
		hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
				   fault, write_fault);
		if ((*fault) || (*write_fault))
			return;
	}
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	unsigned long i, npages;
	uint64_t *pfns;

	i = (addr - range->start) >> PAGE_SHIFT;
	npages = (end - addr) >> PAGE_SHIFT;
	pfns = &range->pfns[i];
	hmm_range_need_fault(hmm_vma_walk, pfns, npages,
			     0, &fault, &write_fault);
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
	if (pmd_protnone(pmd))
		return 0;
	return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pmd(struct mm_walk *walk,
			      unsigned long addr,
			      unsigned long end,
			      uint64_t *pfns,
			      pmd_t pmd)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long pfn, npages, i;
	bool fault, write_fault;
	uint64_t cpu_flags;

	npages = (end - addr) >> PAGE_SHIFT;
	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
			     &fault, &write_fault);

	if (pmd_protnone(pmd) || fault || write_fault)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	pfn = pmd_pfn(pmd) + pte_index(addr);
	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
		pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
	hmm_vma_walk->last = end;
	return 0;
}

static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
	if (pte_none(pte) || !pte_present(pte))
		return 0;
	return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      uint64_t *pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	bool fault, write_fault;
	uint64_t cpu_flags;
	pte_t pte = *ptep;
	uint64_t orig_pfn = *pfn;

	*pfn = range->values[HMM_PFN_NONE];
	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
			   &fault, &write_fault);

	if (pte_none(pte)) {
		if (fault || write_fault)
			goto fault;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (!non_swap_entry(entry)) {
			if (fault || write_fault)
				goto fault;
			return 0;
		}

		/*
		 * This is a special swap entry: ignore migration entries, use
		 * device private entries, and report anything else as an
		 * error.
		 */
		if (is_device_private_entry(entry)) {
			cpu_flags = range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_DEVICE_PRIVATE];
			cpu_flags |= is_write_device_private_entry(entry) ?
				range->flags[HMM_PFN_WRITE] : 0;
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			*pfn = hmm_pfn_from_pfn(range, swp_offset(entry));
			*pfn |= cpu_flags;
			return 0;
		}

		if (is_migration_entry(entry)) {
			if (fault || write_fault) {
				pte_unmap(ptep);
				hmm_vma_walk->last = addr;
				migration_entry_wait(vma->vm_mm,
						     pmdp, addr);
				return -EAGAIN;
			}
			return 0;
		}

		/* Report error for everything else */
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	}

	if (fault || write_fault)
		goto fault;

	*pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
	return 0;

fault:
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	uint64_t *pfns = range->pfns;
	unsigned long addr = start, i;
	pte_t *ptep;
	pmd_t pmd;


again:
	pmd = READ_ONCE(*pmdp);
	if (pmd_none(pmd))
		return hmm_vma_walk_hole(start, end, walk);

	if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB))
		return hmm_pfns_bad(start, end, walk);

	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
		bool fault, write_fault;
		unsigned long npages;
		uint64_t *pfns;

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     0, &fault, &write_fault);
		if (fault || write_fault) {
			hmm_vma_walk->last = addr;
			pmd_migration_entry_wait(vma->vm_mm, pmdp);
			return -EAGAIN;
		}
		return 0;
	} else if (!pmd_present(pmd))
		return hmm_pfns_bad(start, end, walk);

	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
		/*
		 * No need to take pmd_lock here: even if some other thread is
		 * splitting the huge pmd we will get that event through the
		 * mmu_notifier callback.
		 *
		 * So just read the pmd value and check again that it is a
		 * transparent huge or device mapping one, and compute the
		 * corresponding pfn values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		i = (addr - range->start) >> PAGE_SHIFT;
		return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
	}

	/*
	 * We have handled all the valid cases above, i.e. either none,
	 * migration, huge or transparent huge. At this point either it is a
	 * valid pmd entry pointing to a pte directory or it is a bad pmd that
	 * will not recover.
	 */
	if (pmd_bad(pmd))
		return hmm_pfns_bad(start, end, walk);

	ptep = pte_offset_map(pmdp, addr);
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
		if (r) {
			/* hmm_vma_handle_pte() did unmap pte directory */
			hmm_vma_walk->last = addr;
			return r;
		}
	}
	pte_unmap(ptep - 1);

	hmm_vma_walk->last = addr;
	return 0;
}

static void hmm_pfns_clear(struct hmm_range *range,
			   uint64_t *pfns,
			   unsigned long addr,
			   unsigned long end)
{
	for (; addr < end; addr += PAGE_SIZE, pfns++)
		*pfns = range->values[HMM_PFN_NONE];
}

static void hmm_pfns_special(struct hmm_range *range)
{
	unsigned long addr = range->start, i = 0;

	for (; addr < range->end; addr += PAGE_SIZE, i++)
		range->pfns[i] = range->values[HMM_PFN_SPECIAL];
}

/*
 * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
 * @range: range being snapshotted
 * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
 * vma permission, 0 success
 *
 * This snapshots the CPU page table for a range of virtual addresses. Snapshot
 * validity is tracked by the range struct. See hmm_vma_range_done() for
 * further information.
 *
 * The range struct is initialized here. It tracks the CPU page table, but only
 * if the function returns success (0), in which case the caller must then call
 * hmm_vma_range_done() to stop CPU page table update tracking on this range.
 *
 * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
 * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
 */
int hmm_vma_get_pfns(struct hmm_range *range)
{
	struct vm_area_struct *vma = range->vma;
	struct hmm_vma_walk hmm_vma_walk;
	struct mm_walk mm_walk;
	struct hmm *hmm;

	/* Sanity check, this really should not happen ! */
	if (range->start < vma->vm_start || range->start >= vma->vm_end)
		return -EINVAL;
	if (range->end < vma->vm_start || range->end > vma->vm_end)
		return -EINVAL;

	hmm = hmm_register(vma->vm_mm);
	if (!hmm)
		return -ENOMEM;
	/* Caller must have registered a mirror, via hmm_mirror_register() ! */
	if (!hmm->mmu_notifier.ops)
		return -EINVAL;

	/* FIXME support hugetlb fs */
	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
	    vma_is_dax(vma)) {
		hmm_pfns_special(range);
		return -EINVAL;
	}

	if (!(vma->vm_flags & VM_READ)) {
		/*
		 * If the vma does not allow read access, then assume that it
		 * does not allow write access either. Architectures that allow
		 * write without read access are not supported by HMM, because
		 * operations such as atomic access would not work.
		 */
		hmm_pfns_clear(range, range->pfns, range->start, range->end);
		return -EPERM;
	}

	/* Initialize range to track CPU page table update */
	spin_lock(&hmm->lock);
	range->valid = true;
	list_add_rcu(&range->list, &hmm->ranges);
	spin_unlock(&hmm->lock);

	hmm_vma_walk.fault = false;
	hmm_vma_walk.range = range;
	mm_walk.private = &hmm_vma_walk;

	mm_walk.vma = vma;
	mm_walk.mm = vma->vm_mm;
	mm_walk.pte_entry = NULL;
	mm_walk.test_walk = NULL;
	mm_walk.hugetlb_entry = NULL;
	mm_walk.pmd_entry = hmm_vma_walk_pmd;
	mm_walk.pte_hole = hmm_vma_walk_hole;

	walk_page_range(range->start, range->end, &mm_walk);
	return 0;
}
EXPORT_SYMBOL(hmm_vma_get_pfns);

/*
 * hmm_vma_range_done() - stop tracking changes to CPU page table over a range
 * @range: range being tracked
 * Returns: false if range data has been invalidated, true otherwise
 *
 * The range struct is used to track updates to the CPU page table after a call
 * to either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is
 * done using the data, or wants to lock updates to the data it got from those
 * functions, it must call hmm_vma_range_done(), which will then stop tracking
 * CPU page table updates.
 *
 * Note that the device driver must still implement general CPU page table
 * update tracking either by using hmm_mirror (see hmm_mirror_register()) or by
 * using the mmu_notifier API directly.
 *
 * CPU page table update tracking done through hmm_range is only temporary and
 * to be used while trying to duplicate CPU page table contents for a range of
 * virtual addresses.
 *
 * There are two ways to use this:
 * again:
 *   hmm_vma_get_pfns(range); or hmm_vma_fault(...);
 *   trans = device_build_page_table_update_transaction(pfns);
 *   device_page_table_lock();
 *   if (!hmm_vma_range_done(range)) {
 *     device_page_table_unlock();
 *     goto again;
 *   }
 *   device_commit_transaction(trans);
 *   device_page_table_unlock();
 *
 * Or:
 *   hmm_vma_get_pfns(range); or hmm_vma_fault(...);
 *   device_page_table_lock();
 *   hmm_vma_range_done(range);
 *   device_update_page_table(range->pfns);
 *   device_page_table_unlock();
 */
bool hmm_vma_range_done(struct hmm_range *range)
{
	unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
	struct hmm *hmm;

	if (range->end <= range->start) {
		BUG();
		return false;
	}

	hmm = hmm_register(range->vma->vm_mm);
	if (!hmm) {
		memset(range->pfns, 0, sizeof(*range->pfns) * npages);
		return false;
	}

	spin_lock(&hmm->lock);
	list_del_rcu(&range->list);
	spin_unlock(&hmm->lock);

	return range->valid;
}
EXPORT_SYMBOL(hmm_vma_range_done);

/*
 * hmm_vma_fault() - try to fault some address in a virtual address range
 * @range: range being faulted
 * @block: allow blocking on fault (if true it sleeps and does not drop
 *         mmap_sem)
 * Returns: 0 on success, error otherwise (-EAGAIN means mmap_sem has been
 *          dropped)
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs.
 *
 * On error, for one virtual address in the range, the function will mark the
 * corresponding HMM pfn entry with an error flag.
 *
 * Expected use pattern:
 * retry:
 *   down_read(&mm->mmap_sem);
 *   // Find vma and address device wants to fault, initialize hmm_pfn_t
 *   // array accordingly
 *   ret = hmm_vma_fault(range, write, block);
 *   switch (ret) {
 *   case -EAGAIN:
 *     hmm_vma_range_done(range);
 *     // You might want to rate limit or yield to play nicely, you may
 *     // also commit any valid pfn in the array assuming that you are
 *     // getting true from hmm_vma_range_done()
 *     goto retry;
 *   case 0:
 *     break;
 *   case -ENOMEM:
 *   case -EINVAL:
 *   case -EPERM:
 *   default:
 *     // Handle error !
 *     up_read(&mm->mmap_sem)
 *     return;
 *   }
 *   // Take device driver lock that serializes device page table update
 *   driver_lock_device_page_table_update();
 *   hmm_vma_range_done(range);
 *   // Commit pfns we got from hmm_vma_fault()
 *   driver_unlock_device_page_table_update();
 *   up_read(&mm->mmap_sem)
 *
 * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURNS SUCCESS (0)
 * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
 *
 * YOU HAVE BEEN WARNED !
 */
int hmm_vma_fault(struct hmm_range *range, bool block)
{
	struct vm_area_struct *vma = range->vma;
	unsigned long start = range->start;
	struct hmm_vma_walk hmm_vma_walk;
	struct mm_walk mm_walk;
	struct hmm *hmm;
	int ret;

	/* Sanity check, this really should not happen ! */
	if (range->start < vma->vm_start || range->start >= vma->vm_end)
		return -EINVAL;
	if (range->end < vma->vm_start || range->end > vma->vm_end)
		return -EINVAL;

	hmm = hmm_register(vma->vm_mm);
	if (!hmm) {
		hmm_pfns_clear(range, range->pfns, range->start, range->end);
		return -ENOMEM;
	}
	/* Caller must have registered a mirror using hmm_mirror_register() */
	if (!hmm->mmu_notifier.ops)
		return -EINVAL;

	/* FIXME support hugetlb fs */
	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
	    vma_is_dax(vma)) {
		hmm_pfns_special(range);
		return -EINVAL;
	}

	if (!(vma->vm_flags & VM_READ)) {
		/*
		 * If the vma does not allow read access, then assume that it
		 * does not allow write access either. Architectures that allow
		 * write without read access are not supported by HMM, because
		 * operations such as atomic access would not work.
		 */
		hmm_pfns_clear(range, range->pfns, range->start, range->end);
		return -EPERM;
	}

	/* Initialize range to track CPU page table update */
	spin_lock(&hmm->lock);
	range->valid = true;
	list_add_rcu(&range->list, &hmm->ranges);
	spin_unlock(&hmm->lock);

	hmm_vma_walk.fault = true;
	hmm_vma_walk.block = block;
	hmm_vma_walk.range = range;
	mm_walk.private = &hmm_vma_walk;
	hmm_vma_walk.last = range->start;

	mm_walk.vma = vma;
	mm_walk.mm = vma->vm_mm;
	mm_walk.pte_entry = NULL;
	mm_walk.test_walk = NULL;
	mm_walk.hugetlb_entry = NULL;
	mm_walk.pmd_entry = hmm_vma_walk_pmd;
	mm_walk.pte_hole = hmm_vma_walk_hole;

	do {
		ret = walk_page_range(start, range->end, &mm_walk);
		start = hmm_vma_walk.last;
	} while (ret == -EAGAIN);

	if (ret) {
		unsigned long i;

		i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
		hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last,
			       range->end);
		hmm_vma_range_done(range);
	}
	return ret;
}
EXPORT_SYMBOL(hmm_vma_fault);
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */


#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
				       unsigned long addr)
{
	struct page *page;

	page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
	if (!page)
		return NULL;
	lock_page(page);
	return page;
}
EXPORT_SYMBOL(hmm_vma_alloc_locked_page);


static void hmm_devmem_ref_release(struct percpu_ref *ref)
{
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	complete(&devmem->completion);
}

static void hmm_devmem_ref_exit(void *data)
{
	struct percpu_ref *ref = data;
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	percpu_ref_exit(ref);
	devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data);
}

static void hmm_devmem_ref_kill(void *data)
{
	struct percpu_ref *ref = data;
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	percpu_ref_kill(ref);
	wait_for_completion(&devmem->completion);
	devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data);
}

static int hmm_devmem_fault(struct vm_area_struct *vma,
			    unsigned long addr,
			    const struct page *page,
			    unsigned int flags,
			    pmd_t *pmdp)
{
	struct hmm_devmem *devmem = page->pgmap->data;

	return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
}

static void hmm_devmem_free(struct page *page, void *data)
{
	struct hmm_devmem *devmem = data;

	page->mapping = NULL;

	devmem->ops->free(devmem, page);
}

static DEFINE_MUTEX(hmm_devmem_lock);
static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);

static void hmm_devmem_radix_release(struct resource *resource)
{
	resource_size_t key;

	mutex_lock(&hmm_devmem_lock);
	for (key = resource->start;
	     key <= resource->end;
	     key += PA_SECTION_SIZE)
		radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT);
	mutex_unlock(&hmm_devmem_lock);
}

static void hmm_devmem_release(struct device *dev, void *data)
{
	struct hmm_devmem *devmem = data;
	struct resource *resource = devmem->resource;
	unsigned long start_pfn, npages;
	struct zone *zone;
	struct page *page;

	if (percpu_ref_tryget_live(&devmem->ref)) {
		dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
		percpu_ref_put(&devmem->ref);
	}

	/* pages are dead and unused, undo the arch mapping */
	start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
	npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;

	page = pfn_to_page(start_pfn);
	zone = page_zone(page);

	mem_hotplug_begin();
	if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
		__remove_pages(zone, start_pfn, npages, NULL);
	else
		arch_remove_memory(start_pfn << PAGE_SHIFT,
				   npages << PAGE_SHIFT, NULL);
	mem_hotplug_done();

	hmm_devmem_radix_release(resource);
}

static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
{
	resource_size_t key, align_start, align_size, align_end;
	struct device *device = devmem->device;
	int ret, nid, is_ram;

	align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
	align_size = ALIGN(devmem->resource->start +
			   resource_size(devmem->resource),
			   PA_SECTION_SIZE) - align_start;

	is_ram = region_intersects(align_start, align_size,
				   IORESOURCE_SYSTEM_RAM,
				   IORES_DESC_NONE);
	if (is_ram == REGION_MIXED) {
		WARN_ONCE(1, "%s attempted on mixed region %pr\n",
			  __func__, devmem->resource);
		return -ENXIO;
	}
	if (is_ram == REGION_INTERSECTS)
		return -ENXIO;

	if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
		devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
	else
		devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;

	devmem->pagemap.res = *devmem->resource;
	devmem->pagemap.page_fault = hmm_devmem_fault;
	devmem->pagemap.page_free = hmm_devmem_free;
	devmem->pagemap.dev = devmem->device;
	devmem->pagemap.ref = &devmem->ref;
	devmem->pagemap.data = devmem;

	mutex_lock(&hmm_devmem_lock);
	align_end = align_start + align_size - 1;
	for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
		struct hmm_devmem *dup;

		dup = radix_tree_lookup(&hmm_devmem_radix,
					key >> PA_SECTION_SHIFT);
		if (dup) {
			dev_err(device, "%s: collides with mapping for %s\n",
				__func__, dev_name(dup->device));
			mutex_unlock(&hmm_devmem_lock);
			ret = -EBUSY;
			goto error;
		}
		ret = radix_tree_insert(&hmm_devmem_radix,
					key >> PA_SECTION_SHIFT,
					devmem);
		if (ret) {
			dev_err(device, "%s: failed: %d\n", __func__, ret);
			mutex_unlock(&hmm_devmem_lock);
			goto error_radix;
		}
	}
	mutex_unlock(&hmm_devmem_lock);

	nid = dev_to_node(device);
	if (nid < 0)
		nid = numa_mem_id();

	mem_hotplug_begin();
	/*
	 * For device private memory we call add_pages() as we only need to
	 * allocate and initialize struct page for the device memory. Moreover
	 * the device memory is inaccessible, so we do not want to create a
	 * linear mapping for it the way arch_add_memory() would.
	 *
	 * For device public memory, which is accessible by the CPU, we do
	 * want the linear mapping and thus use arch_add_memory().
	 */
	if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
		ret = arch_add_memory(nid, align_start, align_size, NULL,
				      false);
	else
		ret = add_pages(nid, align_start >> PAGE_SHIFT,
				align_size >> PAGE_SHIFT, NULL, false);
	if (ret) {
		mem_hotplug_done();
		goto error_add_memory;
	}
	move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
			       align_start >> PAGE_SHIFT,
			       align_size >> PAGE_SHIFT, NULL);
	mem_hotplug_done();

	/*
	 * Initialization of the pages has been deferred until now in order
	 * to allow us to do the work while not holding the hotplug lock.
	 */
	memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
				align_start >> PAGE_SHIFT,
				align_size >> PAGE_SHIFT, &devmem->pagemap);

	return 0;

error_add_memory:
	untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
error_radix:
	hmm_devmem_radix_release(devmem->resource);
error:
	return ret;
}

static int hmm_devmem_match(struct device *dev, void *data, void *match_data)
{
	struct hmm_devmem *devmem = data;

	return devmem->resource == match_data;
}

static void hmm_devmem_pages_remove(struct hmm_devmem *devmem)
{
	devres_release(devmem->device, &hmm_devmem_release,
		       &hmm_devmem_match, devmem->resource);
}

/*
 * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
 *
 * @ops: memory event device driver callback (see struct hmm_devmem_ops)
 * @device: device struct to bind the resource to
 * @size: size in bytes of the device memory to add
 * Returns: pointer to new hmm_devmem struct, ERR_PTR otherwise
 *
 * This function first finds an empty range of physical addresses big enough to
 * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
 * in turn allocates struct pages. It does not do anything beyond that; all
 * events affecting the memory will go through the various callbacks provided
 * by the hmm_devmem_ops struct.
 *
 * The device driver should call this function during device initialization
 * and is then responsible for the memory management. HMM only provides
 * helpers.
 */
struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
				  struct device *device,
				  unsigned long size)
{
	struct hmm_devmem *devmem;
	resource_size_t addr;
	int ret;

	dev_pagemap_get_ops();

	devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
				   GFP_KERNEL, dev_to_node(device));
	if (!devmem)
		return ERR_PTR(-ENOMEM);

	init_completion(&devmem->completion);
	devmem->pfn_first = -1UL;
	devmem->pfn_last = -1UL;
	devmem->resource = NULL;
	devmem->device = device;
	devmem->ops = ops;

	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
			      0, GFP_KERNEL);
	if (ret)
		goto error_percpu_ref;

	ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
	if (ret)
		goto error_devm_add_action;

	size = ALIGN(size, PA_SECTION_SIZE);
	addr = min((unsigned long)iomem_resource.end,
		   (1UL << MAX_PHYSMEM_BITS) - 1);
	addr = addr - size + 1UL;

	/*
	 * FIXME add a new helper to quickly walk resource tree and find free
	 * range
	 *
	 * FIXME what about ioport_resource resource ?
	 */
	for (; addr > size && addr >= iomem_resource.start; addr -= size) {
		ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
		if (ret != REGION_DISJOINT)
			continue;

		devmem->resource = devm_request_mem_region(device, addr, size,
							   dev_name(device));
		if (!devmem->resource) {
			ret = -ENOMEM;
			goto error_no_resource;
		}
		break;
	}
	if (!devmem->resource) {
		ret = -ERANGE;
		goto error_no_resource;
	}

	devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
	devmem->pfn_last = devmem->pfn_first +
			   (resource_size(devmem->resource) >> PAGE_SHIFT);

	ret = hmm_devmem_pages_create(devmem);
	if (ret)
		goto error_pages;

	devres_add(device, devmem);

	ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
	if (ret) {
		hmm_devmem_remove(devmem);
		return ERR_PTR(ret);
	}

	return devmem;

error_pages:
	devm_release_mem_region(device, devmem->resource->start,
				resource_size(devmem->resource));
error_no_resource:
error_devm_add_action:
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
error_percpu_ref:
	devres_free(devmem);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(hmm_devmem_add);

struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
					   struct device *device,
					   struct resource *res)
{
	struct hmm_devmem *devmem;
	int ret;

	if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
		return ERR_PTR(-EINVAL);

	dev_pagemap_get_ops();

	devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
				   GFP_KERNEL, dev_to_node(device));
	if (!devmem)
		return ERR_PTR(-ENOMEM);

	init_completion(&devmem->completion);
	devmem->pfn_first = -1UL;
	devmem->pfn_last = -1UL;
	devmem->resource = res;
	devmem->device = device;
	devmem->ops = ops;

	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
			      0, GFP_KERNEL);
	if (ret)
		goto error_percpu_ref;

	ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
	if (ret)
		goto error_devm_add_action;


	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
	devmem->pfn_last = devmem->pfn_first +
			   (resource_size(devmem->resource) >> PAGE_SHIFT);

	ret = hmm_devmem_pages_create(devmem);
	if (ret)
		goto error_devm_add_action;

	devres_add(device, devmem);

	ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
	if (ret) {
		hmm_devmem_remove(devmem);
		return ERR_PTR(ret);
	}

	return devmem;

error_devm_add_action:
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
error_percpu_ref:
	devres_free(devmem);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(hmm_devmem_add_resource);

/*
 * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
 *
 * @devmem: hmm_devmem struct used to track and manage the ZONE_DEVICE memory
 *
 * This will hot-unplug memory that was hotplugged by hmm_devmem_add() on
 * behalf of the device driver. It will free struct page and remove the
 * resource that reserved the physical address range for this device memory.
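 *
 * A minimal usage sketch, assuming hypothetical driver code (my_probe(),
 * my_remove(), my_devmem_ops and MY_DEVMEM_SIZE are not part of HMM):
 *
 *   // in my_probe():
 *   devmem = hmm_devmem_add(&my_devmem_ops, device, MY_DEVMEM_SIZE);
 *   if (IS_ERR(devmem))
 *     return PTR_ERR(devmem);
 *   ...
 *   // in my_remove(), once the device memory is no longer used:
 *   hmm_devmem_remove(devmem);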
 */
void hmm_devmem_remove(struct hmm_devmem *devmem)
{
	resource_size_t start, size;
	struct device *device;
	bool cdm = false;

	if (!devmem)
		return;

	device = devmem->device;
	start = devmem->resource->start;
	size = resource_size(devmem->resource);

	cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY;
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
	hmm_devmem_pages_remove(devmem);

	if (!cdm)
		devm_release_mem_region(device, start, size);
}
EXPORT_SYMBOL(hmm_devmem_remove);

/*
 * A device driver that wants to handle multiple devices' memory through a
 * single fake device can use hmm_device to do so. This is purely a helper
 * and it is not needed to make use of any HMM functionality.
 */
#define HMM_DEVICE_MAX 256

static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
static DEFINE_SPINLOCK(hmm_device_lock);
static struct class *hmm_device_class;
static dev_t hmm_device_devt;

static void hmm_device_release(struct device *device)
{
	struct hmm_device *hmm_device;

	hmm_device = container_of(device, struct hmm_device, device);
	spin_lock(&hmm_device_lock);
	clear_bit(hmm_device->minor, hmm_device_mask);
	spin_unlock(&hmm_device_lock);

	kfree(hmm_device);
}

struct hmm_device *hmm_device_new(void *drvdata)
{
	struct hmm_device *hmm_device;

	hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
	if (!hmm_device)
		return ERR_PTR(-ENOMEM);

	spin_lock(&hmm_device_lock);
	hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
	if (hmm_device->minor >= HMM_DEVICE_MAX) {
		spin_unlock(&hmm_device_lock);
		kfree(hmm_device);
		return ERR_PTR(-EBUSY);
	}
	set_bit(hmm_device->minor, hmm_device_mask);
	spin_unlock(&hmm_device_lock);

	dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
	hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
					hmm_device->minor);
	hmm_device->device.release = hmm_device_release;
	dev_set_drvdata(&hmm_device->device, drvdata);
	hmm_device->device.class = hmm_device_class;
	device_initialize(&hmm_device->device);

	return hmm_device;
}
EXPORT_SYMBOL(hmm_device_new);

void hmm_device_put(struct hmm_device *hmm_device)
{
	put_device(&hmm_device->device);
}
EXPORT_SYMBOL(hmm_device_put);

static int __init hmm_init(void)
{
	int ret;

	ret = alloc_chrdev_region(&hmm_device_devt, 0,
				  HMM_DEVICE_MAX,
				  "hmm_device");
	if (ret)
		return ret;

	hmm_device_class = class_create(THIS_MODULE, "hmm_device");
	if (IS_ERR(hmm_device_class)) {
		unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
		return PTR_ERR(hmm_device_class);
	}
	return 0;
}

device_initcall(hmm_init);
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */