1 /* 2 * Copyright 2013 Red Hat Inc. 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * Authors: Jérôme Glisse <jglisse@redhat.com> 15 */ 16 /* 17 * Refer to include/linux/hmm.h for information about heterogeneous memory 18 * management or HMM for short. 19 */ 20 #include <linux/mm.h> 21 #include <linux/hmm.h> 22 #include <linux/init.h> 23 #include <linux/rmap.h> 24 #include <linux/swap.h> 25 #include <linux/slab.h> 26 #include <linux/sched.h> 27 #include <linux/mmzone.h> 28 #include <linux/pagemap.h> 29 #include <linux/swapops.h> 30 #include <linux/hugetlb.h> 31 #include <linux/memremap.h> 32 #include <linux/jump_label.h> 33 #include <linux/mmu_notifier.h> 34 #include <linux/memory_hotplug.h> 35 36 #define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT) 37 38 #if IS_ENABLED(CONFIG_HMM_MIRROR) 39 static const struct mmu_notifier_ops hmm_mmu_notifier_ops; 40 41 /* 42 * struct hmm - HMM per mm struct 43 * 44 * @mm: mm struct this HMM struct is bound to 45 * @lock: lock protecting ranges list 46 * @sequence: we track updates to the CPU page table with a sequence number 47 * @ranges: list of range being snapshotted 48 * @mirrors: list of mirrors for this mm 49 * @mmu_notifier: mmu notifier to track updates to CPU page table 50 * @mirrors_sem: read/write semaphore protecting the mirrors list 51 */ 52 struct hmm { 53 struct mm_struct *mm; 54 spinlock_t lock; 55 atomic_t sequence; 56 struct list_head ranges; 57 struct list_head mirrors; 58 struct mmu_notifier mmu_notifier; 59 struct rw_semaphore mirrors_sem; 60 }; 61 62 /* 63 * hmm_register - register HMM against an mm (HMM internal) 64 * 65 * @mm: mm struct to attach to 66 * 67 * This is not intended to be used directly by device drivers. It allocates an 68 * HMM struct if mm does not have one, and initializes it. 69 */ 70 static struct hmm *hmm_register(struct mm_struct *mm) 71 { 72 struct hmm *hmm = READ_ONCE(mm->hmm); 73 bool cleanup = false; 74 75 /* 76 * The hmm struct can only be freed once the mm_struct goes away, 77 * hence we should always have pre-allocated an new hmm struct 78 * above. 79 */ 80 if (hmm) 81 return hmm; 82 83 hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); 84 if (!hmm) 85 return NULL; 86 INIT_LIST_HEAD(&hmm->mirrors); 87 init_rwsem(&hmm->mirrors_sem); 88 atomic_set(&hmm->sequence, 0); 89 hmm->mmu_notifier.ops = NULL; 90 INIT_LIST_HEAD(&hmm->ranges); 91 spin_lock_init(&hmm->lock); 92 hmm->mm = mm; 93 94 /* 95 * We should only get here if hold the mmap_sem in write mode ie on 96 * registration of first mirror through hmm_mirror_register() 97 */ 98 hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; 99 if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) { 100 kfree(hmm); 101 return NULL; 102 } 103 104 spin_lock(&mm->page_table_lock); 105 if (!mm->hmm) 106 mm->hmm = hmm; 107 else 108 cleanup = true; 109 spin_unlock(&mm->page_table_lock); 110 111 if (cleanup) { 112 mmu_notifier_unregister(&hmm->mmu_notifier, mm); 113 kfree(hmm); 114 } 115 116 return mm->hmm; 117 } 118 119 void hmm_mm_destroy(struct mm_struct *mm) 120 { 121 kfree(mm->hmm); 122 } 123 124 static void hmm_invalidate_range(struct hmm *hmm, 125 enum hmm_update_type action, 126 unsigned long start, 127 unsigned long end) 128 { 129 struct hmm_mirror *mirror; 130 struct hmm_range *range; 131 132 spin_lock(&hmm->lock); 133 list_for_each_entry(range, &hmm->ranges, list) { 134 unsigned long addr, idx, npages; 135 136 if (end < range->start || start >= range->end) 137 continue; 138 139 range->valid = false; 140 addr = max(start, range->start); 141 idx = (addr - range->start) >> PAGE_SHIFT; 142 npages = (min(range->end, end) - addr) >> PAGE_SHIFT; 143 memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages); 144 } 145 spin_unlock(&hmm->lock); 146 147 down_read(&hmm->mirrors_sem); 148 list_for_each_entry(mirror, &hmm->mirrors, list) 149 mirror->ops->sync_cpu_device_pagetables(mirror, action, 150 start, end); 151 up_read(&hmm->mirrors_sem); 152 } 153 154 static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) 155 { 156 struct hmm_mirror *mirror; 157 struct hmm *hmm = mm->hmm; 158 159 down_write(&hmm->mirrors_sem); 160 mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, 161 list); 162 while (mirror) { 163 list_del_init(&mirror->list); 164 if (mirror->ops->release) { 165 /* 166 * Drop mirrors_sem so callback can wait on any pending 167 * work that might itself trigger mmu_notifier callback 168 * and thus would deadlock with us. 169 */ 170 up_write(&hmm->mirrors_sem); 171 mirror->ops->release(mirror); 172 down_write(&hmm->mirrors_sem); 173 } 174 mirror = list_first_entry_or_null(&hmm->mirrors, 175 struct hmm_mirror, list); 176 } 177 up_write(&hmm->mirrors_sem); 178 } 179 180 static void hmm_invalidate_range_start(struct mmu_notifier *mn, 181 struct mm_struct *mm, 182 unsigned long start, 183 unsigned long end) 184 { 185 struct hmm *hmm = mm->hmm; 186 187 VM_BUG_ON(!hmm); 188 189 atomic_inc(&hmm->sequence); 190 } 191 192 static void hmm_invalidate_range_end(struct mmu_notifier *mn, 193 struct mm_struct *mm, 194 unsigned long start, 195 unsigned long end) 196 { 197 struct hmm *hmm = mm->hmm; 198 199 VM_BUG_ON(!hmm); 200 201 hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end); 202 } 203 204 static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { 205 .release = hmm_release, 206 .invalidate_range_start = hmm_invalidate_range_start, 207 .invalidate_range_end = hmm_invalidate_range_end, 208 }; 209 210 /* 211 * hmm_mirror_register() - register a mirror against an mm 212 * 213 * @mirror: new mirror struct to register 214 * @mm: mm to register against 215 * 216 * To start mirroring a process address space, the device driver must register 217 * an HMM mirror struct. 218 * 219 * THE mm->mmap_sem MUST BE HELD IN WRITE MODE ! 220 */ 221 int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) 222 { 223 /* Sanity check */ 224 if (!mm || !mirror || !mirror->ops) 225 return -EINVAL; 226 227 again: 228 mirror->hmm = hmm_register(mm); 229 if (!mirror->hmm) 230 return -ENOMEM; 231 232 down_write(&mirror->hmm->mirrors_sem); 233 if (mirror->hmm->mm == NULL) { 234 /* 235 * A racing hmm_mirror_unregister() is about to destroy the hmm 236 * struct. Try again to allocate a new one. 237 */ 238 up_write(&mirror->hmm->mirrors_sem); 239 mirror->hmm = NULL; 240 goto again; 241 } else { 242 list_add(&mirror->list, &mirror->hmm->mirrors); 243 up_write(&mirror->hmm->mirrors_sem); 244 } 245 246 return 0; 247 } 248 EXPORT_SYMBOL(hmm_mirror_register); 249 250 /* 251 * hmm_mirror_unregister() - unregister a mirror 252 * 253 * @mirror: new mirror struct to register 254 * 255 * Stop mirroring a process address space, and cleanup. 256 */ 257 void hmm_mirror_unregister(struct hmm_mirror *mirror) 258 { 259 bool should_unregister = false; 260 struct mm_struct *mm; 261 struct hmm *hmm; 262 263 if (mirror->hmm == NULL) 264 return; 265 266 hmm = mirror->hmm; 267 down_write(&hmm->mirrors_sem); 268 list_del_init(&mirror->list); 269 should_unregister = list_empty(&hmm->mirrors); 270 mirror->hmm = NULL; 271 mm = hmm->mm; 272 hmm->mm = NULL; 273 up_write(&hmm->mirrors_sem); 274 275 if (!should_unregister || mm == NULL) 276 return; 277 278 spin_lock(&mm->page_table_lock); 279 if (mm->hmm == hmm) 280 mm->hmm = NULL; 281 spin_unlock(&mm->page_table_lock); 282 283 mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); 284 kfree(hmm); 285 } 286 EXPORT_SYMBOL(hmm_mirror_unregister); 287 288 struct hmm_vma_walk { 289 struct hmm_range *range; 290 unsigned long last; 291 bool fault; 292 bool block; 293 }; 294 295 static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, 296 bool write_fault, uint64_t *pfn) 297 { 298 unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; 299 struct hmm_vma_walk *hmm_vma_walk = walk->private; 300 struct hmm_range *range = hmm_vma_walk->range; 301 struct vm_area_struct *vma = walk->vma; 302 int r; 303 304 flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; 305 flags |= write_fault ? FAULT_FLAG_WRITE : 0; 306 r = handle_mm_fault(vma, addr, flags); 307 if (r & VM_FAULT_RETRY) 308 return -EBUSY; 309 if (r & VM_FAULT_ERROR) { 310 *pfn = range->values[HMM_PFN_ERROR]; 311 return -EFAULT; 312 } 313 314 return -EAGAIN; 315 } 316 317 static int hmm_pfns_bad(unsigned long addr, 318 unsigned long end, 319 struct mm_walk *walk) 320 { 321 struct hmm_vma_walk *hmm_vma_walk = walk->private; 322 struct hmm_range *range = hmm_vma_walk->range; 323 uint64_t *pfns = range->pfns; 324 unsigned long i; 325 326 i = (addr - range->start) >> PAGE_SHIFT; 327 for (; addr < end; addr += PAGE_SIZE, i++) 328 pfns[i] = range->values[HMM_PFN_ERROR]; 329 330 return 0; 331 } 332 333 /* 334 * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s) 335 * @start: range virtual start address (inclusive) 336 * @end: range virtual end address (exclusive) 337 * @fault: should we fault or not ? 338 * @write_fault: write fault ? 339 * @walk: mm_walk structure 340 * Returns: 0 on success, -EAGAIN after page fault, or page fault error 341 * 342 * This function will be called whenever pmd_none() or pte_none() returns true, 343 * or whenever there is no page directory covering the virtual address range. 344 */ 345 static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, 346 bool fault, bool write_fault, 347 struct mm_walk *walk) 348 { 349 struct hmm_vma_walk *hmm_vma_walk = walk->private; 350 struct hmm_range *range = hmm_vma_walk->range; 351 uint64_t *pfns = range->pfns; 352 unsigned long i; 353 354 hmm_vma_walk->last = addr; 355 i = (addr - range->start) >> PAGE_SHIFT; 356 for (; addr < end; addr += PAGE_SIZE, i++) { 357 pfns[i] = range->values[HMM_PFN_NONE]; 358 if (fault || write_fault) { 359 int ret; 360 361 ret = hmm_vma_do_fault(walk, addr, write_fault, 362 &pfns[i]); 363 if (ret != -EAGAIN) 364 return ret; 365 } 366 } 367 368 return (fault || write_fault) ? -EAGAIN : 0; 369 } 370 371 static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, 372 uint64_t pfns, uint64_t cpu_flags, 373 bool *fault, bool *write_fault) 374 { 375 struct hmm_range *range = hmm_vma_walk->range; 376 377 *fault = *write_fault = false; 378 if (!hmm_vma_walk->fault) 379 return; 380 381 /* We aren't ask to do anything ... */ 382 if (!(pfns & range->flags[HMM_PFN_VALID])) 383 return; 384 /* If this is device memory than only fault if explicitly requested */ 385 if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { 386 /* Do we fault on device memory ? */ 387 if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { 388 *write_fault = pfns & range->flags[HMM_PFN_WRITE]; 389 *fault = true; 390 } 391 return; 392 } 393 394 /* If CPU page table is not valid then we need to fault */ 395 *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]); 396 /* Need to write fault ? */ 397 if ((pfns & range->flags[HMM_PFN_WRITE]) && 398 !(cpu_flags & range->flags[HMM_PFN_WRITE])) { 399 *write_fault = true; 400 *fault = true; 401 } 402 } 403 404 static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, 405 const uint64_t *pfns, unsigned long npages, 406 uint64_t cpu_flags, bool *fault, 407 bool *write_fault) 408 { 409 unsigned long i; 410 411 if (!hmm_vma_walk->fault) { 412 *fault = *write_fault = false; 413 return; 414 } 415 416 for (i = 0; i < npages; ++i) { 417 hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, 418 fault, write_fault); 419 if ((*fault) || (*write_fault)) 420 return; 421 } 422 } 423 424 static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, 425 struct mm_walk *walk) 426 { 427 struct hmm_vma_walk *hmm_vma_walk = walk->private; 428 struct hmm_range *range = hmm_vma_walk->range; 429 bool fault, write_fault; 430 unsigned long i, npages; 431 uint64_t *pfns; 432 433 i = (addr - range->start) >> PAGE_SHIFT; 434 npages = (end - addr) >> PAGE_SHIFT; 435 pfns = &range->pfns[i]; 436 hmm_range_need_fault(hmm_vma_walk, pfns, npages, 437 0, &fault, &write_fault); 438 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 439 } 440 441 static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) 442 { 443 if (pmd_protnone(pmd)) 444 return 0; 445 return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] | 446 range->flags[HMM_PFN_WRITE] : 447 range->flags[HMM_PFN_VALID]; 448 } 449 450 static int hmm_vma_handle_pmd(struct mm_walk *walk, 451 unsigned long addr, 452 unsigned long end, 453 uint64_t *pfns, 454 pmd_t pmd) 455 { 456 struct hmm_vma_walk *hmm_vma_walk = walk->private; 457 struct hmm_range *range = hmm_vma_walk->range; 458 unsigned long pfn, npages, i; 459 bool fault, write_fault; 460 uint64_t cpu_flags; 461 462 npages = (end - addr) >> PAGE_SHIFT; 463 cpu_flags = pmd_to_hmm_pfn_flags(range, pmd); 464 hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags, 465 &fault, &write_fault); 466 467 if (pmd_protnone(pmd) || fault || write_fault) 468 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 469 470 pfn = pmd_pfn(pmd) + pte_index(addr); 471 for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) 472 pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; 473 hmm_vma_walk->last = end; 474 return 0; 475 } 476 477 static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) 478 { 479 if (pte_none(pte) || !pte_present(pte)) 480 return 0; 481 return pte_write(pte) ? range->flags[HMM_PFN_VALID] | 482 range->flags[HMM_PFN_WRITE] : 483 range->flags[HMM_PFN_VALID]; 484 } 485 486 static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, 487 unsigned long end, pmd_t *pmdp, pte_t *ptep, 488 uint64_t *pfn) 489 { 490 struct hmm_vma_walk *hmm_vma_walk = walk->private; 491 struct hmm_range *range = hmm_vma_walk->range; 492 struct vm_area_struct *vma = walk->vma; 493 bool fault, write_fault; 494 uint64_t cpu_flags; 495 pte_t pte = *ptep; 496 uint64_t orig_pfn = *pfn; 497 498 *pfn = range->values[HMM_PFN_NONE]; 499 cpu_flags = pte_to_hmm_pfn_flags(range, pte); 500 hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, 501 &fault, &write_fault); 502 503 if (pte_none(pte)) { 504 if (fault || write_fault) 505 goto fault; 506 return 0; 507 } 508 509 if (!pte_present(pte)) { 510 swp_entry_t entry = pte_to_swp_entry(pte); 511 512 if (!non_swap_entry(entry)) { 513 if (fault || write_fault) 514 goto fault; 515 return 0; 516 } 517 518 /* 519 * This is a special swap entry, ignore migration, use 520 * device and report anything else as error. 521 */ 522 if (is_device_private_entry(entry)) { 523 cpu_flags = range->flags[HMM_PFN_VALID] | 524 range->flags[HMM_PFN_DEVICE_PRIVATE]; 525 cpu_flags |= is_write_device_private_entry(entry) ? 526 range->flags[HMM_PFN_WRITE] : 0; 527 hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, 528 &fault, &write_fault); 529 if (fault || write_fault) 530 goto fault; 531 *pfn = hmm_pfn_from_pfn(range, swp_offset(entry)); 532 *pfn |= cpu_flags; 533 return 0; 534 } 535 536 if (is_migration_entry(entry)) { 537 if (fault || write_fault) { 538 pte_unmap(ptep); 539 hmm_vma_walk->last = addr; 540 migration_entry_wait(vma->vm_mm, 541 pmdp, addr); 542 return -EAGAIN; 543 } 544 return 0; 545 } 546 547 /* Report error for everything else */ 548 *pfn = range->values[HMM_PFN_ERROR]; 549 return -EFAULT; 550 } 551 552 if (fault || write_fault) 553 goto fault; 554 555 *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags; 556 return 0; 557 558 fault: 559 pte_unmap(ptep); 560 /* Fault any virtual address we were asked to fault */ 561 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 562 } 563 564 static int hmm_vma_walk_pmd(pmd_t *pmdp, 565 unsigned long start, 566 unsigned long end, 567 struct mm_walk *walk) 568 { 569 struct hmm_vma_walk *hmm_vma_walk = walk->private; 570 struct hmm_range *range = hmm_vma_walk->range; 571 uint64_t *pfns = range->pfns; 572 unsigned long addr = start, i; 573 pte_t *ptep; 574 575 i = (addr - range->start) >> PAGE_SHIFT; 576 577 again: 578 if (pmd_none(*pmdp)) 579 return hmm_vma_walk_hole(start, end, walk); 580 581 if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB)) 582 return hmm_pfns_bad(start, end, walk); 583 584 if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) { 585 pmd_t pmd; 586 587 /* 588 * No need to take pmd_lock here, even if some other threads 589 * is splitting the huge pmd we will get that event through 590 * mmu_notifier callback. 591 * 592 * So just read pmd value and check again its a transparent 593 * huge or device mapping one and compute corresponding pfn 594 * values. 595 */ 596 pmd = pmd_read_atomic(pmdp); 597 barrier(); 598 if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) 599 goto again; 600 601 return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd); 602 } 603 604 if (pmd_bad(*pmdp)) 605 return hmm_pfns_bad(start, end, walk); 606 607 ptep = pte_offset_map(pmdp, addr); 608 for (; addr < end; addr += PAGE_SIZE, ptep++, i++) { 609 int r; 610 611 r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]); 612 if (r) { 613 /* hmm_vma_handle_pte() did unmap pte directory */ 614 hmm_vma_walk->last = addr; 615 return r; 616 } 617 } 618 pte_unmap(ptep - 1); 619 620 hmm_vma_walk->last = addr; 621 return 0; 622 } 623 624 static void hmm_pfns_clear(struct hmm_range *range, 625 uint64_t *pfns, 626 unsigned long addr, 627 unsigned long end) 628 { 629 for (; addr < end; addr += PAGE_SIZE, pfns++) 630 *pfns = range->values[HMM_PFN_NONE]; 631 } 632 633 static void hmm_pfns_special(struct hmm_range *range) 634 { 635 unsigned long addr = range->start, i = 0; 636 637 for (; addr < range->end; addr += PAGE_SIZE, i++) 638 range->pfns[i] = range->values[HMM_PFN_SPECIAL]; 639 } 640 641 /* 642 * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses 643 * @range: range being snapshotted 644 * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid 645 * vma permission, 0 success 646 * 647 * This snapshots the CPU page table for a range of virtual addresses. Snapshot 648 * validity is tracked by range struct. See hmm_vma_range_done() for further 649 * information. 650 * 651 * The range struct is initialized here. It tracks the CPU page table, but only 652 * if the function returns success (0), in which case the caller must then call 653 * hmm_vma_range_done() to stop CPU page table update tracking on this range. 654 * 655 * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS 656 * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED ! 657 */ 658 int hmm_vma_get_pfns(struct hmm_range *range) 659 { 660 struct vm_area_struct *vma = range->vma; 661 struct hmm_vma_walk hmm_vma_walk; 662 struct mm_walk mm_walk; 663 struct hmm *hmm; 664 665 /* Sanity check, this really should not happen ! */ 666 if (range->start < vma->vm_start || range->start >= vma->vm_end) 667 return -EINVAL; 668 if (range->end < vma->vm_start || range->end > vma->vm_end) 669 return -EINVAL; 670 671 hmm = hmm_register(vma->vm_mm); 672 if (!hmm) 673 return -ENOMEM; 674 /* Caller must have registered a mirror, via hmm_mirror_register() ! */ 675 if (!hmm->mmu_notifier.ops) 676 return -EINVAL; 677 678 /* FIXME support hugetlb fs */ 679 if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { 680 hmm_pfns_special(range); 681 return -EINVAL; 682 } 683 684 if (!(vma->vm_flags & VM_READ)) { 685 /* 686 * If vma do not allow read access, then assume that it does 687 * not allow write access, either. Architecture that allow 688 * write without read access are not supported by HMM, because 689 * operations such has atomic access would not work. 690 */ 691 hmm_pfns_clear(range, range->pfns, range->start, range->end); 692 return -EPERM; 693 } 694 695 /* Initialize range to track CPU page table update */ 696 spin_lock(&hmm->lock); 697 range->valid = true; 698 list_add_rcu(&range->list, &hmm->ranges); 699 spin_unlock(&hmm->lock); 700 701 hmm_vma_walk.fault = false; 702 hmm_vma_walk.range = range; 703 mm_walk.private = &hmm_vma_walk; 704 705 mm_walk.vma = vma; 706 mm_walk.mm = vma->vm_mm; 707 mm_walk.pte_entry = NULL; 708 mm_walk.test_walk = NULL; 709 mm_walk.hugetlb_entry = NULL; 710 mm_walk.pmd_entry = hmm_vma_walk_pmd; 711 mm_walk.pte_hole = hmm_vma_walk_hole; 712 713 walk_page_range(range->start, range->end, &mm_walk); 714 return 0; 715 } 716 EXPORT_SYMBOL(hmm_vma_get_pfns); 717 718 /* 719 * hmm_vma_range_done() - stop tracking change to CPU page table over a range 720 * @range: range being tracked 721 * Returns: false if range data has been invalidated, true otherwise 722 * 723 * Range struct is used to track updates to the CPU page table after a call to 724 * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done 725 * using the data, or wants to lock updates to the data it got from those 726 * functions, it must call the hmm_vma_range_done() function, which will then 727 * stop tracking CPU page table updates. 728 * 729 * Note that device driver must still implement general CPU page table update 730 * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using 731 * the mmu_notifier API directly. 732 * 733 * CPU page table update tracking done through hmm_range is only temporary and 734 * to be used while trying to duplicate CPU page table contents for a range of 735 * virtual addresses. 736 * 737 * There are two ways to use this : 738 * again: 739 * hmm_vma_get_pfns(range); or hmm_vma_fault(...); 740 * trans = device_build_page_table_update_transaction(pfns); 741 * device_page_table_lock(); 742 * if (!hmm_vma_range_done(range)) { 743 * device_page_table_unlock(); 744 * goto again; 745 * } 746 * device_commit_transaction(trans); 747 * device_page_table_unlock(); 748 * 749 * Or: 750 * hmm_vma_get_pfns(range); or hmm_vma_fault(...); 751 * device_page_table_lock(); 752 * hmm_vma_range_done(range); 753 * device_update_page_table(range->pfns); 754 * device_page_table_unlock(); 755 */ 756 bool hmm_vma_range_done(struct hmm_range *range) 757 { 758 unsigned long npages = (range->end - range->start) >> PAGE_SHIFT; 759 struct hmm *hmm; 760 761 if (range->end <= range->start) { 762 BUG(); 763 return false; 764 } 765 766 hmm = hmm_register(range->vma->vm_mm); 767 if (!hmm) { 768 memset(range->pfns, 0, sizeof(*range->pfns) * npages); 769 return false; 770 } 771 772 spin_lock(&hmm->lock); 773 list_del_rcu(&range->list); 774 spin_unlock(&hmm->lock); 775 776 return range->valid; 777 } 778 EXPORT_SYMBOL(hmm_vma_range_done); 779 780 /* 781 * hmm_vma_fault() - try to fault some address in a virtual address range 782 * @range: range being faulted 783 * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) 784 * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop) 785 * 786 * This is similar to a regular CPU page fault except that it will not trigger 787 * any memory migration if the memory being faulted is not accessible by CPUs. 788 * 789 * On error, for one virtual address in the range, the function will mark the 790 * corresponding HMM pfn entry with an error flag. 791 * 792 * Expected use pattern: 793 * retry: 794 * down_read(&mm->mmap_sem); 795 * // Find vma and address device wants to fault, initialize hmm_pfn_t 796 * // array accordingly 797 * ret = hmm_vma_fault(range, write, block); 798 * switch (ret) { 799 * case -EAGAIN: 800 * hmm_vma_range_done(range); 801 * // You might want to rate limit or yield to play nicely, you may 802 * // also commit any valid pfn in the array assuming that you are 803 * // getting true from hmm_vma_range_monitor_end() 804 * goto retry; 805 * case 0: 806 * break; 807 * case -ENOMEM: 808 * case -EINVAL: 809 * case -EPERM: 810 * default: 811 * // Handle error ! 812 * up_read(&mm->mmap_sem) 813 * return; 814 * } 815 * // Take device driver lock that serialize device page table update 816 * driver_lock_device_page_table_update(); 817 * hmm_vma_range_done(range); 818 * // Commit pfns we got from hmm_vma_fault() 819 * driver_unlock_device_page_table_update(); 820 * up_read(&mm->mmap_sem) 821 * 822 * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0) 823 * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION ! 824 * 825 * YOU HAVE BEEN WARNED ! 826 */ 827 int hmm_vma_fault(struct hmm_range *range, bool block) 828 { 829 struct vm_area_struct *vma = range->vma; 830 unsigned long start = range->start; 831 struct hmm_vma_walk hmm_vma_walk; 832 struct mm_walk mm_walk; 833 struct hmm *hmm; 834 int ret; 835 836 /* Sanity check, this really should not happen ! */ 837 if (range->start < vma->vm_start || range->start >= vma->vm_end) 838 return -EINVAL; 839 if (range->end < vma->vm_start || range->end > vma->vm_end) 840 return -EINVAL; 841 842 hmm = hmm_register(vma->vm_mm); 843 if (!hmm) { 844 hmm_pfns_clear(range, range->pfns, range->start, range->end); 845 return -ENOMEM; 846 } 847 /* Caller must have registered a mirror using hmm_mirror_register() */ 848 if (!hmm->mmu_notifier.ops) 849 return -EINVAL; 850 851 /* FIXME support hugetlb fs */ 852 if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { 853 hmm_pfns_special(range); 854 return -EINVAL; 855 } 856 857 if (!(vma->vm_flags & VM_READ)) { 858 /* 859 * If vma do not allow read access, then assume that it does 860 * not allow write access, either. Architecture that allow 861 * write without read access are not supported by HMM, because 862 * operations such has atomic access would not work. 863 */ 864 hmm_pfns_clear(range, range->pfns, range->start, range->end); 865 return -EPERM; 866 } 867 868 /* Initialize range to track CPU page table update */ 869 spin_lock(&hmm->lock); 870 range->valid = true; 871 list_add_rcu(&range->list, &hmm->ranges); 872 spin_unlock(&hmm->lock); 873 874 hmm_vma_walk.fault = true; 875 hmm_vma_walk.block = block; 876 hmm_vma_walk.range = range; 877 mm_walk.private = &hmm_vma_walk; 878 hmm_vma_walk.last = range->start; 879 880 mm_walk.vma = vma; 881 mm_walk.mm = vma->vm_mm; 882 mm_walk.pte_entry = NULL; 883 mm_walk.test_walk = NULL; 884 mm_walk.hugetlb_entry = NULL; 885 mm_walk.pmd_entry = hmm_vma_walk_pmd; 886 mm_walk.pte_hole = hmm_vma_walk_hole; 887 888 do { 889 ret = walk_page_range(start, range->end, &mm_walk); 890 start = hmm_vma_walk.last; 891 } while (ret == -EAGAIN); 892 893 if (ret) { 894 unsigned long i; 895 896 i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; 897 hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last, 898 range->end); 899 hmm_vma_range_done(range); 900 } 901 return ret; 902 } 903 EXPORT_SYMBOL(hmm_vma_fault); 904 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ 905 906 907 #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) 908 struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, 909 unsigned long addr) 910 { 911 struct page *page; 912 913 page = alloc_page_vma(GFP_HIGHUSER, vma, addr); 914 if (!page) 915 return NULL; 916 lock_page(page); 917 return page; 918 } 919 EXPORT_SYMBOL(hmm_vma_alloc_locked_page); 920 921 922 static void hmm_devmem_ref_release(struct percpu_ref *ref) 923 { 924 struct hmm_devmem *devmem; 925 926 devmem = container_of(ref, struct hmm_devmem, ref); 927 complete(&devmem->completion); 928 } 929 930 static void hmm_devmem_ref_exit(void *data) 931 { 932 struct percpu_ref *ref = data; 933 struct hmm_devmem *devmem; 934 935 devmem = container_of(ref, struct hmm_devmem, ref); 936 percpu_ref_exit(ref); 937 devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data); 938 } 939 940 static void hmm_devmem_ref_kill(void *data) 941 { 942 struct percpu_ref *ref = data; 943 struct hmm_devmem *devmem; 944 945 devmem = container_of(ref, struct hmm_devmem, ref); 946 percpu_ref_kill(ref); 947 wait_for_completion(&devmem->completion); 948 devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data); 949 } 950 951 static int hmm_devmem_fault(struct vm_area_struct *vma, 952 unsigned long addr, 953 const struct page *page, 954 unsigned int flags, 955 pmd_t *pmdp) 956 { 957 struct hmm_devmem *devmem = page->pgmap->data; 958 959 return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp); 960 } 961 962 static void hmm_devmem_free(struct page *page, void *data) 963 { 964 struct hmm_devmem *devmem = data; 965 966 devmem->ops->free(devmem, page); 967 } 968 969 static DEFINE_MUTEX(hmm_devmem_lock); 970 static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL); 971 972 static void hmm_devmem_radix_release(struct resource *resource) 973 { 974 resource_size_t key, align_start, align_size; 975 976 align_start = resource->start & ~(PA_SECTION_SIZE - 1); 977 align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE); 978 979 mutex_lock(&hmm_devmem_lock); 980 for (key = resource->start; 981 key <= resource->end; 982 key += PA_SECTION_SIZE) 983 radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT); 984 mutex_unlock(&hmm_devmem_lock); 985 } 986 987 static void hmm_devmem_release(struct device *dev, void *data) 988 { 989 struct hmm_devmem *devmem = data; 990 struct resource *resource = devmem->resource; 991 unsigned long start_pfn, npages; 992 struct zone *zone; 993 struct page *page; 994 995 if (percpu_ref_tryget_live(&devmem->ref)) { 996 dev_WARN(dev, "%s: page mapping is still live!\n", __func__); 997 percpu_ref_put(&devmem->ref); 998 } 999 1000 /* pages are dead and unused, undo the arch mapping */ 1001 start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT; 1002 npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT; 1003 1004 page = pfn_to_page(start_pfn); 1005 zone = page_zone(page); 1006 1007 mem_hotplug_begin(); 1008 if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) 1009 __remove_pages(zone, start_pfn, npages, NULL); 1010 else 1011 arch_remove_memory(start_pfn << PAGE_SHIFT, 1012 npages << PAGE_SHIFT, NULL); 1013 mem_hotplug_done(); 1014 1015 hmm_devmem_radix_release(resource); 1016 } 1017 1018 static int hmm_devmem_pages_create(struct hmm_devmem *devmem) 1019 { 1020 resource_size_t key, align_start, align_size, align_end; 1021 struct device *device = devmem->device; 1022 int ret, nid, is_ram; 1023 unsigned long pfn; 1024 1025 align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); 1026 align_size = ALIGN(devmem->resource->start + 1027 resource_size(devmem->resource), 1028 PA_SECTION_SIZE) - align_start; 1029 1030 is_ram = region_intersects(align_start, align_size, 1031 IORESOURCE_SYSTEM_RAM, 1032 IORES_DESC_NONE); 1033 if (is_ram == REGION_MIXED) { 1034 WARN_ONCE(1, "%s attempted on mixed region %pr\n", 1035 __func__, devmem->resource); 1036 return -ENXIO; 1037 } 1038 if (is_ram == REGION_INTERSECTS) 1039 return -ENXIO; 1040 1041 if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY) 1042 devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; 1043 else 1044 devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; 1045 1046 devmem->pagemap.res = *devmem->resource; 1047 devmem->pagemap.page_fault = hmm_devmem_fault; 1048 devmem->pagemap.page_free = hmm_devmem_free; 1049 devmem->pagemap.dev = devmem->device; 1050 devmem->pagemap.ref = &devmem->ref; 1051 devmem->pagemap.data = devmem; 1052 1053 mutex_lock(&hmm_devmem_lock); 1054 align_end = align_start + align_size - 1; 1055 for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) { 1056 struct hmm_devmem *dup; 1057 1058 dup = radix_tree_lookup(&hmm_devmem_radix, 1059 key >> PA_SECTION_SHIFT); 1060 if (dup) { 1061 dev_err(device, "%s: collides with mapping for %s\n", 1062 __func__, dev_name(dup->device)); 1063 mutex_unlock(&hmm_devmem_lock); 1064 ret = -EBUSY; 1065 goto error; 1066 } 1067 ret = radix_tree_insert(&hmm_devmem_radix, 1068 key >> PA_SECTION_SHIFT, 1069 devmem); 1070 if (ret) { 1071 dev_err(device, "%s: failed: %d\n", __func__, ret); 1072 mutex_unlock(&hmm_devmem_lock); 1073 goto error_radix; 1074 } 1075 } 1076 mutex_unlock(&hmm_devmem_lock); 1077 1078 nid = dev_to_node(device); 1079 if (nid < 0) 1080 nid = numa_mem_id(); 1081 1082 mem_hotplug_begin(); 1083 /* 1084 * For device private memory we call add_pages() as we only need to 1085 * allocate and initialize struct page for the device memory. More- 1086 * over the device memory is un-accessible thus we do not want to 1087 * create a linear mapping for the memory like arch_add_memory() 1088 * would do. 1089 * 1090 * For device public memory, which is accesible by the CPU, we do 1091 * want the linear mapping and thus use arch_add_memory(). 1092 */ 1093 if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC) 1094 ret = arch_add_memory(nid, align_start, align_size, NULL, 1095 false); 1096 else 1097 ret = add_pages(nid, align_start >> PAGE_SHIFT, 1098 align_size >> PAGE_SHIFT, NULL, false); 1099 if (ret) { 1100 mem_hotplug_done(); 1101 goto error_add_memory; 1102 } 1103 move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], 1104 align_start >> PAGE_SHIFT, 1105 align_size >> PAGE_SHIFT, NULL); 1106 mem_hotplug_done(); 1107 1108 for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) { 1109 struct page *page = pfn_to_page(pfn); 1110 1111 page->pgmap = &devmem->pagemap; 1112 } 1113 return 0; 1114 1115 error_add_memory: 1116 untrack_pfn(NULL, PHYS_PFN(align_start), align_size); 1117 error_radix: 1118 hmm_devmem_radix_release(devmem->resource); 1119 error: 1120 return ret; 1121 } 1122 1123 static int hmm_devmem_match(struct device *dev, void *data, void *match_data) 1124 { 1125 struct hmm_devmem *devmem = data; 1126 1127 return devmem->resource == match_data; 1128 } 1129 1130 static void hmm_devmem_pages_remove(struct hmm_devmem *devmem) 1131 { 1132 devres_release(devmem->device, &hmm_devmem_release, 1133 &hmm_devmem_match, devmem->resource); 1134 } 1135 1136 /* 1137 * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory 1138 * 1139 * @ops: memory event device driver callback (see struct hmm_devmem_ops) 1140 * @device: device struct to bind the resource too 1141 * @size: size in bytes of the device memory to add 1142 * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise 1143 * 1144 * This function first finds an empty range of physical address big enough to 1145 * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which 1146 * in turn allocates struct pages. It does not do anything beyond that; all 1147 * events affecting the memory will go through the various callbacks provided 1148 * by hmm_devmem_ops struct. 1149 * 1150 * Device driver should call this function during device initialization and 1151 * is then responsible of memory management. HMM only provides helpers. 1152 */ 1153 struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, 1154 struct device *device, 1155 unsigned long size) 1156 { 1157 struct hmm_devmem *devmem; 1158 resource_size_t addr; 1159 int ret; 1160 1161 dev_pagemap_get_ops(); 1162 1163 devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), 1164 GFP_KERNEL, dev_to_node(device)); 1165 if (!devmem) 1166 return ERR_PTR(-ENOMEM); 1167 1168 init_completion(&devmem->completion); 1169 devmem->pfn_first = -1UL; 1170 devmem->pfn_last = -1UL; 1171 devmem->resource = NULL; 1172 devmem->device = device; 1173 devmem->ops = ops; 1174 1175 ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, 1176 0, GFP_KERNEL); 1177 if (ret) 1178 goto error_percpu_ref; 1179 1180 ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); 1181 if (ret) 1182 goto error_devm_add_action; 1183 1184 size = ALIGN(size, PA_SECTION_SIZE); 1185 addr = min((unsigned long)iomem_resource.end, 1186 (1UL << MAX_PHYSMEM_BITS) - 1); 1187 addr = addr - size + 1UL; 1188 1189 /* 1190 * FIXME add a new helper to quickly walk resource tree and find free 1191 * range 1192 * 1193 * FIXME what about ioport_resource resource ? 1194 */ 1195 for (; addr > size && addr >= iomem_resource.start; addr -= size) { 1196 ret = region_intersects(addr, size, 0, IORES_DESC_NONE); 1197 if (ret != REGION_DISJOINT) 1198 continue; 1199 1200 devmem->resource = devm_request_mem_region(device, addr, size, 1201 dev_name(device)); 1202 if (!devmem->resource) { 1203 ret = -ENOMEM; 1204 goto error_no_resource; 1205 } 1206 break; 1207 } 1208 if (!devmem->resource) { 1209 ret = -ERANGE; 1210 goto error_no_resource; 1211 } 1212 1213 devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; 1214 devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; 1215 devmem->pfn_last = devmem->pfn_first + 1216 (resource_size(devmem->resource) >> PAGE_SHIFT); 1217 1218 ret = hmm_devmem_pages_create(devmem); 1219 if (ret) 1220 goto error_pages; 1221 1222 devres_add(device, devmem); 1223 1224 ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); 1225 if (ret) { 1226 hmm_devmem_remove(devmem); 1227 return ERR_PTR(ret); 1228 } 1229 1230 return devmem; 1231 1232 error_pages: 1233 devm_release_mem_region(device, devmem->resource->start, 1234 resource_size(devmem->resource)); 1235 error_no_resource: 1236 error_devm_add_action: 1237 hmm_devmem_ref_kill(&devmem->ref); 1238 hmm_devmem_ref_exit(&devmem->ref); 1239 error_percpu_ref: 1240 devres_free(devmem); 1241 return ERR_PTR(ret); 1242 } 1243 EXPORT_SYMBOL(hmm_devmem_add); 1244 1245 struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, 1246 struct device *device, 1247 struct resource *res) 1248 { 1249 struct hmm_devmem *devmem; 1250 int ret; 1251 1252 if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) 1253 return ERR_PTR(-EINVAL); 1254 1255 dev_pagemap_get_ops(); 1256 1257 devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), 1258 GFP_KERNEL, dev_to_node(device)); 1259 if (!devmem) 1260 return ERR_PTR(-ENOMEM); 1261 1262 init_completion(&devmem->completion); 1263 devmem->pfn_first = -1UL; 1264 devmem->pfn_last = -1UL; 1265 devmem->resource = res; 1266 devmem->device = device; 1267 devmem->ops = ops; 1268 1269 ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, 1270 0, GFP_KERNEL); 1271 if (ret) 1272 goto error_percpu_ref; 1273 1274 ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); 1275 if (ret) 1276 goto error_devm_add_action; 1277 1278 1279 devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; 1280 devmem->pfn_last = devmem->pfn_first + 1281 (resource_size(devmem->resource) >> PAGE_SHIFT); 1282 1283 ret = hmm_devmem_pages_create(devmem); 1284 if (ret) 1285 goto error_devm_add_action; 1286 1287 devres_add(device, devmem); 1288 1289 ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); 1290 if (ret) { 1291 hmm_devmem_remove(devmem); 1292 return ERR_PTR(ret); 1293 } 1294 1295 return devmem; 1296 1297 error_devm_add_action: 1298 hmm_devmem_ref_kill(&devmem->ref); 1299 hmm_devmem_ref_exit(&devmem->ref); 1300 error_percpu_ref: 1301 devres_free(devmem); 1302 return ERR_PTR(ret); 1303 } 1304 EXPORT_SYMBOL(hmm_devmem_add_resource); 1305 1306 /* 1307 * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE) 1308 * 1309 * @devmem: hmm_devmem struct use to track and manage the ZONE_DEVICE memory 1310 * 1311 * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf 1312 * of the device driver. It will free struct page and remove the resource that 1313 * reserved the physical address range for this device memory. 1314 */ 1315 void hmm_devmem_remove(struct hmm_devmem *devmem) 1316 { 1317 resource_size_t start, size; 1318 struct device *device; 1319 bool cdm = false; 1320 1321 if (!devmem) 1322 return; 1323 1324 device = devmem->device; 1325 start = devmem->resource->start; 1326 size = resource_size(devmem->resource); 1327 1328 cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY; 1329 hmm_devmem_ref_kill(&devmem->ref); 1330 hmm_devmem_ref_exit(&devmem->ref); 1331 hmm_devmem_pages_remove(devmem); 1332 1333 if (!cdm) 1334 devm_release_mem_region(device, start, size); 1335 } 1336 EXPORT_SYMBOL(hmm_devmem_remove); 1337 1338 /* 1339 * A device driver that wants to handle multiple devices memory through a 1340 * single fake device can use hmm_device to do so. This is purely a helper 1341 * and it is not needed to make use of any HMM functionality. 1342 */ 1343 #define HMM_DEVICE_MAX 256 1344 1345 static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX); 1346 static DEFINE_SPINLOCK(hmm_device_lock); 1347 static struct class *hmm_device_class; 1348 static dev_t hmm_device_devt; 1349 1350 static void hmm_device_release(struct device *device) 1351 { 1352 struct hmm_device *hmm_device; 1353 1354 hmm_device = container_of(device, struct hmm_device, device); 1355 spin_lock(&hmm_device_lock); 1356 clear_bit(hmm_device->minor, hmm_device_mask); 1357 spin_unlock(&hmm_device_lock); 1358 1359 kfree(hmm_device); 1360 } 1361 1362 struct hmm_device *hmm_device_new(void *drvdata) 1363 { 1364 struct hmm_device *hmm_device; 1365 1366 hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL); 1367 if (!hmm_device) 1368 return ERR_PTR(-ENOMEM); 1369 1370 spin_lock(&hmm_device_lock); 1371 hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX); 1372 if (hmm_device->minor >= HMM_DEVICE_MAX) { 1373 spin_unlock(&hmm_device_lock); 1374 kfree(hmm_device); 1375 return ERR_PTR(-EBUSY); 1376 } 1377 set_bit(hmm_device->minor, hmm_device_mask); 1378 spin_unlock(&hmm_device_lock); 1379 1380 dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor); 1381 hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt), 1382 hmm_device->minor); 1383 hmm_device->device.release = hmm_device_release; 1384 dev_set_drvdata(&hmm_device->device, drvdata); 1385 hmm_device->device.class = hmm_device_class; 1386 device_initialize(&hmm_device->device); 1387 1388 return hmm_device; 1389 } 1390 EXPORT_SYMBOL(hmm_device_new); 1391 1392 void hmm_device_put(struct hmm_device *hmm_device) 1393 { 1394 put_device(&hmm_device->device); 1395 } 1396 EXPORT_SYMBOL(hmm_device_put); 1397 1398 static int __init hmm_init(void) 1399 { 1400 int ret; 1401 1402 ret = alloc_chrdev_region(&hmm_device_devt, 0, 1403 HMM_DEVICE_MAX, 1404 "hmm_device"); 1405 if (ret) 1406 return ret; 1407 1408 hmm_device_class = class_create(THIS_MODULE, "hmm_device"); 1409 if (IS_ERR(hmm_device_class)) { 1410 unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX); 1411 return PTR_ERR(hmm_device_class); 1412 } 1413 return 0; 1414 } 1415 1416 device_initcall(hmm_init); 1417 #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ 1418