/*
 * Copyright 2013 Red Hat Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/mm.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/jump_label.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)

#if IS_ENABLED(CONFIG_HMM_MIRROR)
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;

/*
 * struct hmm - HMM per mm struct
 *
 * @mm: mm struct this HMM struct is bound to
 * @lock: lock protecting ranges list
 * @sequence: we track updates to the CPU page table with a sequence number
 * @ranges: list of ranges being snapshotted
 * @mirrors: list of mirrors for this mm
 * @mmu_notifier: mmu notifier to track updates to CPU page table
 * @mirrors_sem: read/write semaphore protecting the mirrors list
 */
struct hmm {
	struct mm_struct	*mm;
	spinlock_t		lock;
	atomic_t		sequence;
	struct list_head	ranges;
	struct list_head	mirrors;
	struct mmu_notifier	mmu_notifier;
	struct rw_semaphore	mirrors_sem;
};

/*
 * hmm_register - register HMM against an mm (HMM internal)
 *
 * @mm: mm struct to attach to
 *
 * This is not intended to be used directly by device drivers. It allocates an
 * HMM struct if mm does not have one, and initializes it.
 */
static struct hmm *hmm_register(struct mm_struct *mm)
{
	struct hmm *hmm = READ_ONCE(mm->hmm);
	bool cleanup = false;

	/*
	 * The hmm struct can only be freed once the mm_struct goes away,
	 * hence we can simply return the existing hmm struct if there is
	 * one.
	 */
	if (hmm)
		return hmm;

	hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
	if (!hmm)
		return NULL;
	INIT_LIST_HEAD(&hmm->mirrors);
	init_rwsem(&hmm->mirrors_sem);
	atomic_set(&hmm->sequence, 0);
	hmm->mmu_notifier.ops = NULL;
	INIT_LIST_HEAD(&hmm->ranges);
	spin_lock_init(&hmm->lock);
	hmm->mm = mm;

	/*
	 * We should only get here if we hold the mmap_sem in write mode, ie on
	 * registration of the first mirror through hmm_mirror_register().
	 */
	hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
	if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
		kfree(hmm);
		return NULL;
	}

	spin_lock(&mm->page_table_lock);
	if (!mm->hmm)
		mm->hmm = hmm;
	else
		cleanup = true;
	spin_unlock(&mm->page_table_lock);

	if (cleanup) {
		mmu_notifier_unregister(&hmm->mmu_notifier, mm);
		kfree(hmm);
	}

	return mm->hmm;
}

void hmm_mm_destroy(struct mm_struct *mm)
{
	kfree(mm->hmm);
}

static void hmm_invalidate_range(struct hmm *hmm,
				 enum hmm_update_type action,
				 unsigned long start,
				 unsigned long end)
{
	struct hmm_mirror *mirror;
	struct hmm_range *range;

	spin_lock(&hmm->lock);
	list_for_each_entry(range, &hmm->ranges, list) {
		unsigned long addr, idx, npages;

		if (end < range->start || start >= range->end)
			continue;

		range->valid = false;
		addr = max(start, range->start);
		idx = (addr - range->start) >> PAGE_SHIFT;
		npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
		memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
	}
	spin_unlock(&hmm->lock);

	down_read(&hmm->mirrors_sem);
	list_for_each_entry(mirror, &hmm->mirrors, list)
		mirror->ops->sync_cpu_device_pagetables(mirror, action,
							start, end);
	up_read(&hmm->mirrors_sem);
}

static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct hmm_mirror *mirror;
	struct hmm *hmm = mm->hmm;

	down_write(&hmm->mirrors_sem);
	mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
					  list);
	while (mirror) {
		list_del_init(&mirror->list);
		if (mirror->ops->release) {
			/*
			 * Drop mirrors_sem so the callback can wait on any
			 * pending work that might itself trigger an
			 * mmu_notifier callback, and thus would deadlock
			 * with us.
			 */
			up_write(&hmm->mirrors_sem);
			mirror->ops->release(mirror);
			down_write(&hmm->mirrors_sem);
		}
		mirror = list_first_entry_or_null(&hmm->mirrors,
						  struct hmm_mirror, list);
	}
	up_write(&hmm->mirrors_sem);
}

static void hmm_invalidate_range_start(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long start,
				       unsigned long end)
{
	struct hmm *hmm = mm->hmm;

	VM_BUG_ON(!hmm);

	atomic_inc(&hmm->sequence);
}

static void hmm_invalidate_range_end(struct mmu_notifier *mn,
				     struct mm_struct *mm,
				     unsigned long start,
				     unsigned long end)
{
	struct hmm *hmm = mm->hmm;

	VM_BUG_ON(!hmm);

	hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
}

static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
	.release		= hmm_release,
	.invalidate_range_start	= hmm_invalidate_range_start,
	.invalidate_range_end	= hmm_invalidate_range_end,
};

/*
 * hmm_mirror_register() - register a mirror against an mm
 *
 * @mirror: new mirror struct to register
 * @mm: mm to register against
 *
 * To start mirroring a process address space, the device driver must register
 * an HMM mirror struct.
 *
 * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
 */
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
	/* Sanity check */
	if (!mm || !mirror || !mirror->ops)
		return -EINVAL;

again:
	mirror->hmm = hmm_register(mm);
	if (!mirror->hmm)
		return -ENOMEM;

	down_write(&mirror->hmm->mirrors_sem);
	if (mirror->hmm->mm == NULL) {
		/*
		 * A racing hmm_mirror_unregister() is about to destroy the hmm
		 * struct. Try again to allocate a new one.
		 */
		up_write(&mirror->hmm->mirrors_sem);
		mirror->hmm = NULL;
		goto again;
	} else {
		list_add(&mirror->list, &mirror->hmm->mirrors);
		up_write(&mirror->hmm->mirrors_sem);
	}

	return 0;
}
EXPORT_SYMBOL(hmm_mirror_register);

/*
 * hmm_mirror_unregister() - unregister a mirror
 *
 * @mirror: mirror struct to unregister
 *
 * Stop mirroring a process address space, and cleanup.
 */
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
	bool should_unregister = false;
	struct mm_struct *mm;
	struct hmm *hmm;

	if (mirror->hmm == NULL)
		return;

	hmm = mirror->hmm;
	down_write(&hmm->mirrors_sem);
	list_del_init(&mirror->list);
	should_unregister = list_empty(&hmm->mirrors);
	mirror->hmm = NULL;
	mm = hmm->mm;
	hmm->mm = NULL;
	up_write(&hmm->mirrors_sem);

	if (!should_unregister || mm == NULL)
		return;

	spin_lock(&mm->page_table_lock);
	if (mm->hmm == hmm)
		mm->hmm = NULL;
	spin_unlock(&mm->page_table_lock);

	mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
	kfree(hmm);
}
EXPORT_SYMBOL(hmm_mirror_unregister);
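
/*
 * Example: minimal driver-side use of the mirror API. This is an illustrative
 * sketch only; the my_mirror_* names below are hypothetical driver code, not
 * part of HMM. The driver embeds a struct hmm_mirror, provides the callbacks
 * that HMM invokes from the mmu_notifier path above, and registers the mirror
 * with mm->mmap_sem held in write mode:
 *
 *	static void my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
 *						  enum hmm_update_type update,
 *						  unsigned long start,
 *						  unsigned long end)
 *	{
 *		// Invalidate the device page table for [start, end) here.
 *	}
 *
 *	static void my_mirror_release(struct hmm_mirror *mirror)
 *	{
 *		// The mm is going away; tear down all device mappings.
 *	}
 *
 *	static const struct hmm_mirror_ops my_mirror_ops = {
 *		.sync_cpu_device_pagetables = my_sync_cpu_device_pagetables,
 *		.release = my_mirror_release,
 *	};
 *
 *	down_write(&mm->mmap_sem);
 *	my_mirror.ops = &my_mirror_ops;
 *	ret = hmm_mirror_register(&my_mirror, mm);
 *	up_write(&mm->mmap_sem);
 */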
struct hmm_vma_walk {
	struct hmm_range	*range;
	unsigned long		last;
	bool			fault;
	bool			block;
};

static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
			    bool write_fault, uint64_t *pfn)
{
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	vm_fault_t ret;

	flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
	flags |= write_fault ? FAULT_FLAG_WRITE : 0;
	ret = handle_mm_fault(vma, addr, flags);
	if (ret & VM_FAULT_RETRY)
		return -EBUSY;
	if (ret & VM_FAULT_ERROR) {
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	}

	return -EAGAIN;
}

static int hmm_pfns_bad(unsigned long addr,
			unsigned long end,
			struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = range->values[HMM_PFN_ERROR];

	return 0;
}

/*
 * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s)
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @fault: should we fault or not ?
 * @write_fault: write fault ?
 * @walk: mm_walk structure
 * Returns: 0 on success, -EAGAIN after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
			      bool fault, bool write_fault,
			      struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	hmm_vma_walk->last = addr;
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++) {
		pfns[i] = range->values[HMM_PFN_NONE];
		if (fault || write_fault) {
			int ret;

			ret = hmm_vma_do_fault(walk, addr, write_fault,
					       &pfns[i]);
			if (ret != -EAGAIN)
				return ret;
		}
	}

	return (fault || write_fault) ? -EAGAIN : 0;
}

static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				      uint64_t pfns, uint64_t cpu_flags,
				      bool *fault, bool *write_fault)
{
	struct hmm_range *range = hmm_vma_walk->range;

	*fault = *write_fault = false;
	if (!hmm_vma_walk->fault)
		return;

	/* We aren't asked to do anything ... */
	if (!(pfns & range->flags[HMM_PFN_VALID]))
		return;
	/* If this is device memory then only fault if explicitly requested */
	if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
		/* Do we fault on device memory ? */
		if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
			*write_fault = pfns & range->flags[HMM_PFN_WRITE];
			*fault = true;
		}
		return;
	}

	/* If CPU page table is not valid then we need to fault */
	*fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
	/* Need to write fault ? */
	if ((pfns & range->flags[HMM_PFN_WRITE]) &&
	    !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
		*write_fault = true;
		*fault = true;
	}
}

static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				 const uint64_t *pfns, unsigned long npages,
				 uint64_t cpu_flags, bool *fault,
				 bool *write_fault)
{
	unsigned long i;

	if (!hmm_vma_walk->fault) {
		*fault = *write_fault = false;
		return;
	}

	for (i = 0; i < npages; ++i) {
		hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
				   fault, write_fault);
		if ((*fault) || (*write_fault))
			return;
	}
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	unsigned long i, npages;
	uint64_t *pfns;

	i = (addr - range->start) >> PAGE_SHIFT;
	npages = (end - addr) >> PAGE_SHIFT;
	pfns = &range->pfns[i];
	hmm_range_need_fault(hmm_vma_walk, pfns, npages,
			     0, &fault, &write_fault);
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
	if (pmd_protnone(pmd))
		return 0;
	return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pmd(struct mm_walk *walk,
			      unsigned long addr,
			      unsigned long end,
			      uint64_t *pfns,
			      pmd_t pmd)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long pfn, npages, i;
	bool fault, write_fault;
	uint64_t cpu_flags;

	npages = (end - addr) >> PAGE_SHIFT;
	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
			     &fault, &write_fault);

	if (pmd_protnone(pmd) || fault || write_fault)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	pfn = pmd_pfn(pmd) + pte_index(addr);
	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
		pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
	hmm_vma_walk->last = end;
	return 0;
}

static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
	if (pte_none(pte) || !pte_present(pte))
		return 0;
	return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      uint64_t *pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	bool fault, write_fault;
	uint64_t cpu_flags;
	pte_t pte = *ptep;
	uint64_t orig_pfn = *pfn;

	*pfn = range->values[HMM_PFN_NONE];
	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
			   &fault, &write_fault);

	if (pte_none(pte)) {
		if (fault || write_fault)
			goto fault;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (!non_swap_entry(entry)) {
			if (fault || write_fault)
				goto fault;
			return 0;
		}

		/*
		 * This is a special swap entry: wait on migration entries,
		 * use device private entries, and report anything else as an
		 * error.
		 */
		if (is_device_private_entry(entry)) {
			cpu_flags = range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_DEVICE_PRIVATE];
			cpu_flags |= is_write_device_private_entry(entry) ?
				range->flags[HMM_PFN_WRITE] : 0;
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			*pfn = hmm_pfn_from_pfn(range, swp_offset(entry));
			*pfn |= cpu_flags;
			return 0;
		}

		if (is_migration_entry(entry)) {
			if (fault || write_fault) {
				pte_unmap(ptep);
				hmm_vma_walk->last = addr;
				migration_entry_wait(vma->vm_mm,
						     pmdp, addr);
				return -EAGAIN;
			}
			return 0;
		}

		/* Report error for everything else */
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	}

	if (fault || write_fault)
		goto fault;

	*pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
	return 0;

fault:
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long addr = start, i;
	pte_t *ptep;

	i = (addr - range->start) >> PAGE_SHIFT;

again:
	if (pmd_none(*pmdp))
		return hmm_vma_walk_hole(start, end, walk);

	if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB))
		return hmm_pfns_bad(start, end, walk);

	if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
		pmd_t pmd;

		/*
		 * No need to take the pmd lock here: even if some other
		 * thread is splitting the huge pmd, we will get that event
		 * through the mmu_notifier callback.
		 *
		 * So just read the pmd value, check again that it is a
		 * transparent huge or device mapping, and compute the
		 * corresponding pfn values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
	}

	if (pmd_bad(*pmdp))
		return hmm_pfns_bad(start, end, walk);

	ptep = pte_offset_map(pmdp, addr);
	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
		if (r) {
			/* hmm_vma_handle_pte() did unmap pte directory */
			hmm_vma_walk->last = addr;
			return r;
		}
	}
	pte_unmap(ptep - 1);

	hmm_vma_walk->last = addr;
	return 0;
}

static void hmm_pfns_clear(struct hmm_range *range,
			   uint64_t *pfns,
			   unsigned long addr,
			   unsigned long end)
{
	for (; addr < end; addr += PAGE_SIZE, pfns++)
		*pfns = range->values[HMM_PFN_NONE];
}

static void hmm_pfns_special(struct hmm_range *range)
{
	unsigned long addr = range->start, i = 0;

	for (; addr < range->end; addr += PAGE_SIZE, i++)
		range->pfns[i] = range->values[HMM_PFN_SPECIAL];
}

/*
 * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
 * @range: range being snapshotted
 * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
 *          vma permission, 0 success
 *
 * This snapshots the CPU page table for a range of virtual addresses. Snapshot
 * validity is tracked by range struct. See hmm_vma_range_done() for further
 * information.
 *
 * The range struct is initialized here. It tracks the CPU page table, but only
 * if the function returns success (0), in which case the caller must then call
 * hmm_vma_range_done() to stop CPU page table update tracking on this range.
 *
 * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
 * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
 */
int hmm_vma_get_pfns(struct hmm_range *range)
{
	struct vm_area_struct *vma = range->vma;
	struct hmm_vma_walk hmm_vma_walk;
	struct mm_walk mm_walk;
	struct hmm *hmm;

	/* Sanity check, this really should not happen ! */
	if (range->start < vma->vm_start || range->start >= vma->vm_end)
		return -EINVAL;
	if (range->end < vma->vm_start || range->end > vma->vm_end)
		return -EINVAL;

	hmm = hmm_register(vma->vm_mm);
	if (!hmm)
		return -ENOMEM;
	/* Caller must have registered a mirror, via hmm_mirror_register() ! */
	if (!hmm->mmu_notifier.ops)
		return -EINVAL;

	/* FIXME support hugetlb fs */
	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
			vma_is_dax(vma)) {
		hmm_pfns_special(range);
		return -EINVAL;
	}

	if (!(vma->vm_flags & VM_READ)) {
		/*
		 * If the vma does not allow read access, then assume that it
		 * does not allow write access either. Architectures that
		 * allow write without read access are not supported by HMM,
		 * because operations such as atomic access would not work.
		 */
		hmm_pfns_clear(range, range->pfns, range->start, range->end);
		return -EPERM;
	}

	/* Initialize range to track CPU page table update */
	spin_lock(&hmm->lock);
	range->valid = true;
	list_add_rcu(&range->list, &hmm->ranges);
	spin_unlock(&hmm->lock);

	hmm_vma_walk.fault = false;
	hmm_vma_walk.range = range;
	mm_walk.private = &hmm_vma_walk;

	mm_walk.vma = vma;
	mm_walk.mm = vma->vm_mm;
	mm_walk.pte_entry = NULL;
	mm_walk.test_walk = NULL;
	mm_walk.hugetlb_entry = NULL;
	mm_walk.pmd_entry = hmm_vma_walk_pmd;
	mm_walk.pte_hole = hmm_vma_walk_hole;

	walk_page_range(range->start, range->end, &mm_walk);
	return 0;
}
EXPORT_SYMBOL(hmm_vma_get_pfns);

/*
 * hmm_vma_range_done() - stop tracking change to CPU page table over a range
 * @range: range being tracked
 * Returns: false if range data has been invalidated, true otherwise
 *
 * Range struct is used to track updates to the CPU page table after a call to
 * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
 * using the data, or wants to lock updates to the data it got from those
 * functions, it must call the hmm_vma_range_done() function, which will then
 * stop tracking CPU page table updates.
 *
 * Note that the device driver must still implement general CPU page table
 * update tracking either by using hmm_mirror (see hmm_mirror_register()) or
 * by using the mmu_notifier API directly.
 *
 * CPU page table update tracking done through hmm_range is only temporary and
 * to be used while trying to duplicate CPU page table contents for a range of
 * virtual addresses.
 *
 * There are two ways to use this:
 * again:
 *   hmm_vma_get_pfns(range); or hmm_vma_fault(...);
 *   trans = device_build_page_table_update_transaction(pfns);
 *   device_page_table_lock();
 *   if (!hmm_vma_range_done(range)) {
 *     device_page_table_unlock();
 *     goto again;
 *   }
 *   device_commit_transaction(trans);
 *   device_page_table_unlock();
 *
 * Or:
 *   hmm_vma_get_pfns(range); or hmm_vma_fault(...);
 *   device_page_table_lock();
 *   hmm_vma_range_done(range);
 *   device_update_page_table(range->pfns);
 *   device_page_table_unlock();
 */
bool hmm_vma_range_done(struct hmm_range *range)
{
	unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
	struct hmm *hmm;

	if (range->end <= range->start) {
		BUG();
		return false;
	}

	hmm = hmm_register(range->vma->vm_mm);
	if (!hmm) {
		memset(range->pfns, 0, sizeof(*range->pfns) * npages);
		return false;
	}

	spin_lock(&hmm->lock);
	list_del_rcu(&range->list);
	spin_unlock(&hmm->lock);

	return range->valid;
}
EXPORT_SYMBOL(hmm_vma_range_done);
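
/*
 * Example: snapshotting a range. This is an illustrative sketch only; the
 * dmirror_* names are hypothetical driver code, and a mirror must already be
 * registered via hmm_mirror_register(). The driver fills a struct hmm_range
 * with the vma, the virtual address range, a pfns array, and the flags/values
 * arrays that describe its own pfn encoding (indexed by HMM_PFN_VALID,
 * HMM_PFN_WRITE, HMM_PFN_ERROR, ...), then follows the second pattern
 * documented above hmm_vma_range_done():
 *
 *	struct hmm_range range = {
 *		.vma	= vma,
 *		.start	= start,		// page aligned, within vma
 *		.end	= end,			// page aligned, within vma
 *		.pfns	= pfns,			// one entry per page
 *		.flags	= dmirror_hmm_flags,	// driver encoding of HMM_PFN_*
 *		.values	= dmirror_hmm_values,
 *	};
 *
 *	down_read(&mm->mmap_sem);
 *	ret = hmm_vma_get_pfns(&range);
 *	if (ret == 0) {
 *		dmirror_device_pagetable_lock();
 *		if (hmm_vma_range_done(&range))
 *			dmirror_populate_device_pagetable(range.pfns);
 *		dmirror_device_pagetable_unlock();
 *	}
 *	up_read(&mm->mmap_sem);
 */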
/*
 * hmm_vma_fault() - try to fault some address in a virtual address range
 * @range: range being faulted
 * @block: allow blocking on fault (if true it sleeps and does not drop mmap_sem)
 * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem has been dropped)
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs.
 *
 * On error, for one virtual address in the range, the function will mark the
 * corresponding HMM pfn entry with an error flag.
 *
 * Expected use pattern:
 * retry:
 *   down_read(&mm->mmap_sem);
 *   // Find vma and address device wants to fault, initialize hmm_pfn_t
 *   // array accordingly
 *   ret = hmm_vma_fault(range, block);
 *   switch (ret) {
 *   case -EAGAIN:
 *     hmm_vma_range_done(range);
 *     // You might want to rate limit or yield to play nicely, you may
 *     // also commit any valid pfn in the array assuming that you are
 *     // getting true from hmm_vma_range_done()
 *     goto retry;
 *   case 0:
 *     break;
 *   case -ENOMEM:
 *   case -EINVAL:
 *   case -EPERM:
 *   default:
 *     // Handle error !
 *     up_read(&mm->mmap_sem);
 *     return;
 *   }
 *   // Take device driver lock that serializes device page table update
 *   driver_lock_device_page_table_update();
 *   hmm_vma_range_done(range);
 *   // Commit pfns we got from hmm_vma_fault()
 *   driver_unlock_device_page_table_update();
 *   up_read(&mm->mmap_sem);
 *
 * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURNS SUCCESS (0)
 * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
 *
 * YOU HAVE BEEN WARNED !
 */
int hmm_vma_fault(struct hmm_range *range, bool block)
{
	struct vm_area_struct *vma = range->vma;
	unsigned long start = range->start;
	struct hmm_vma_walk hmm_vma_walk;
	struct mm_walk mm_walk;
	struct hmm *hmm;
	int ret;

	/* Sanity check, this really should not happen ! */
	if (range->start < vma->vm_start || range->start >= vma->vm_end)
		return -EINVAL;
	if (range->end < vma->vm_start || range->end > vma->vm_end)
		return -EINVAL;

	hmm = hmm_register(vma->vm_mm);
	if (!hmm) {
		hmm_pfns_clear(range, range->pfns, range->start, range->end);
		return -ENOMEM;
	}
	/* Caller must have registered a mirror using hmm_mirror_register() */
	if (!hmm->mmu_notifier.ops)
		return -EINVAL;

	/* FIXME support hugetlb fs */
	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
			vma_is_dax(vma)) {
		hmm_pfns_special(range);
		return -EINVAL;
	}

	if (!(vma->vm_flags & VM_READ)) {
		/*
		 * If the vma does not allow read access, then assume that it
		 * does not allow write access either. Architectures that
		 * allow write without read access are not supported by HMM,
		 * because operations such as atomic access would not work.
		 */
		hmm_pfns_clear(range, range->pfns, range->start, range->end);
		return -EPERM;
	}

	/* Initialize range to track CPU page table update */
	spin_lock(&hmm->lock);
	range->valid = true;
	list_add_rcu(&range->list, &hmm->ranges);
	spin_unlock(&hmm->lock);

	hmm_vma_walk.fault = true;
	hmm_vma_walk.block = block;
	hmm_vma_walk.range = range;
	mm_walk.private = &hmm_vma_walk;
	hmm_vma_walk.last = range->start;

	mm_walk.vma = vma;
	mm_walk.mm = vma->vm_mm;
	mm_walk.pte_entry = NULL;
	mm_walk.test_walk = NULL;
	mm_walk.hugetlb_entry = NULL;
	mm_walk.pmd_entry = hmm_vma_walk_pmd;
	mm_walk.pte_hole = hmm_vma_walk_hole;

	do {
		ret = walk_page_range(start, range->end, &mm_walk);
		start = hmm_vma_walk.last;
	} while (ret == -EAGAIN);

	if (ret) {
		unsigned long i;

		i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
		hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last,
			       range->end);
		hmm_vma_range_done(range);
	}
	return ret;
}
EXPORT_SYMBOL(hmm_vma_fault);
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */


#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
				       unsigned long addr)
{
	struct page *page;

	page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
	if (!page)
		return NULL;
	lock_page(page);
	return page;
}
EXPORT_SYMBOL(hmm_vma_alloc_locked_page);


static void hmm_devmem_ref_release(struct percpu_ref *ref)
{
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	complete(&devmem->completion);
}

static void hmm_devmem_ref_exit(void *data)
{
	struct percpu_ref *ref = data;
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	percpu_ref_exit(ref);
	devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data);
}

static void hmm_devmem_ref_kill(void *data)
{
	struct percpu_ref *ref = data;
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	percpu_ref_kill(ref);
	wait_for_completion(&devmem->completion);
	devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data);
}

static int hmm_devmem_fault(struct vm_area_struct *vma,
			    unsigned long addr,
			    const struct page *page,
			    unsigned int flags,
			    pmd_t *pmdp)
{
	struct hmm_devmem *devmem = page->pgmap->data;

	return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
}

static void hmm_devmem_free(struct page *page, void *data)
{
	struct hmm_devmem *devmem = data;

	devmem->ops->free(devmem, page);
}

static DEFINE_MUTEX(hmm_devmem_lock);
static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);

static void hmm_devmem_radix_release(struct resource *resource)
{
	resource_size_t key;

	mutex_lock(&hmm_devmem_lock);
	for (key = resource->start;
	     key <= resource->end;
	     key += PA_SECTION_SIZE)
		radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT);
	mutex_unlock(&hmm_devmem_lock);
}

static void hmm_devmem_release(struct device *dev, void *data)
{
	struct hmm_devmem *devmem = data;
	struct resource *resource = devmem->resource;
	unsigned long start_pfn, npages;
	struct zone *zone;
	struct page *page;

	if (percpu_ref_tryget_live(&devmem->ref)) {
		dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
		percpu_ref_put(&devmem->ref);
	}

	/* pages are dead and unused, undo the arch mapping */
	start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
	npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;

	page = pfn_to_page(start_pfn);
	zone = page_zone(page);

	mem_hotplug_begin();
	if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
		__remove_pages(zone, start_pfn, npages, NULL);
	else
		arch_remove_memory(start_pfn << PAGE_SHIFT,
				   npages << PAGE_SHIFT, NULL);
	mem_hotplug_done();

	hmm_devmem_radix_release(resource);
}

static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
{
	resource_size_t key, align_start, align_size, align_end;
	struct device *device = devmem->device;
	int ret, nid, is_ram;
	unsigned long pfn;

	align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
	align_size = ALIGN(devmem->resource->start +
			   resource_size(devmem->resource),
			   PA_SECTION_SIZE) - align_start;

	is_ram = region_intersects(align_start, align_size,
				   IORESOURCE_SYSTEM_RAM,
				   IORES_DESC_NONE);
	if (is_ram == REGION_MIXED) {
		WARN_ONCE(1, "%s attempted on mixed region %pr\n",
			  __func__, devmem->resource);
		return -ENXIO;
	}
	if (is_ram == REGION_INTERSECTS)
		return -ENXIO;

	if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
		devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
	else
		devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;

	devmem->pagemap.res = *devmem->resource;
	devmem->pagemap.page_fault = hmm_devmem_fault;
	devmem->pagemap.page_free = hmm_devmem_free;
	devmem->pagemap.dev = devmem->device;
	devmem->pagemap.ref = &devmem->ref;
	devmem->pagemap.data = devmem;

	mutex_lock(&hmm_devmem_lock);
	align_end = align_start + align_size - 1;
	for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
		struct hmm_devmem *dup;

		dup = radix_tree_lookup(&hmm_devmem_radix,
					key >> PA_SECTION_SHIFT);
		if (dup) {
			dev_err(device, "%s: collides with mapping for %s\n",
				__func__, dev_name(dup->device));
			mutex_unlock(&hmm_devmem_lock);
			ret = -EBUSY;
			goto error;
		}
		ret = radix_tree_insert(&hmm_devmem_radix,
					key >> PA_SECTION_SHIFT,
					devmem);
		if (ret) {
			dev_err(device, "%s: failed: %d\n", __func__, ret);
			mutex_unlock(&hmm_devmem_lock);
			goto error_radix;
		}
	}
	mutex_unlock(&hmm_devmem_lock);

	nid = dev_to_node(device);
	if (nid < 0)
		nid = numa_mem_id();

	mem_hotplug_begin();
	/*
	 * For device private memory we call add_pages() as we only need to
	 * allocate and initialize struct page for the device memory.
	 * Moreover, the device memory is inaccessible, thus we do not want
	 * to create a linear mapping for the memory like arch_add_memory()
	 * would do.
	 *
	 * For device public memory, which is accessible by the CPU, we do
	 * want the linear mapping and thus use arch_add_memory().
	 */
	if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
		ret = arch_add_memory(nid, align_start, align_size, NULL,
				      false);
	else
		ret = add_pages(nid, align_start >> PAGE_SHIFT,
				align_size >> PAGE_SHIFT, NULL, false);
	if (ret) {
		mem_hotplug_done();
		goto error_add_memory;
	}
	move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
			       align_start >> PAGE_SHIFT,
			       align_size >> PAGE_SHIFT, NULL);
	mem_hotplug_done();

	for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
		struct page *page = pfn_to_page(pfn);

		page->pgmap = &devmem->pagemap;
	}
	return 0;

error_add_memory:
	untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
error_radix:
	hmm_devmem_radix_release(devmem->resource);
error:
	return ret;
}

static int hmm_devmem_match(struct device *dev, void *data, void *match_data)
{
	struct hmm_devmem *devmem = data;

	return devmem->resource == match_data;
}

static void hmm_devmem_pages_remove(struct hmm_devmem *devmem)
{
	devres_release(devmem->device, &hmm_devmem_release,
		       &hmm_devmem_match, devmem->resource);
}

/*
 * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
 *
 * @ops: memory event device driver callback (see struct hmm_devmem_ops)
 * @device: device struct to bind the resource to
 * @size: size in bytes of the device memory to add
 * Returns: pointer to new hmm_devmem struct, ERR_PTR otherwise
 *
 * This function first finds an empty range of physical address big enough to
 * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
 * in turn allocates struct pages. It does not do anything beyond that; all
 * events affecting the memory will go through the various callbacks provided
 * by the hmm_devmem_ops struct.
 *
 * The device driver should call this function during device initialization
 * and is then responsible for memory management. HMM only provides helpers.
 */
struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
				  struct device *device,
				  unsigned long size)
{
	struct hmm_devmem *devmem;
	resource_size_t addr;
	int ret;

	dev_pagemap_get_ops();

	devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
				   GFP_KERNEL, dev_to_node(device));
	if (!devmem)
		return ERR_PTR(-ENOMEM);

	init_completion(&devmem->completion);
	devmem->pfn_first = -1UL;
	devmem->pfn_last = -1UL;
	devmem->resource = NULL;
	devmem->device = device;
	devmem->ops = ops;

	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
			      0, GFP_KERNEL);
	if (ret)
		goto error_percpu_ref;

	ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
	if (ret)
		goto error_devm_add_action;

	size = ALIGN(size, PA_SECTION_SIZE);
	addr = min((unsigned long)iomem_resource.end,
		   (1UL << MAX_PHYSMEM_BITS) - 1);
	addr = addr - size + 1UL;

	/*
	 * FIXME add a new helper to quickly walk resource tree and find free
	 * range
	 *
	 * FIXME what about ioport_resource resource ?
	 */
	for (; addr > size && addr >= iomem_resource.start; addr -= size) {
		ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
		if (ret != REGION_DISJOINT)
			continue;

		devmem->resource = devm_request_mem_region(device, addr, size,
							   dev_name(device));
		if (!devmem->resource) {
			ret = -ENOMEM;
			goto error_no_resource;
		}
		break;
	}
	if (!devmem->resource) {
		ret = -ERANGE;
		goto error_no_resource;
	}

	devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
	devmem->pfn_last = devmem->pfn_first +
			   (resource_size(devmem->resource) >> PAGE_SHIFT);

	ret = hmm_devmem_pages_create(devmem);
	if (ret)
		goto error_pages;

	devres_add(device, devmem);

	ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
	if (ret) {
		hmm_devmem_remove(devmem);
		return ERR_PTR(ret);
	}

	return devmem;

error_pages:
	devm_release_mem_region(device, devmem->resource->start,
				resource_size(devmem->resource));
error_no_resource:
error_devm_add_action:
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
error_percpu_ref:
	devres_free(devmem);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(hmm_devmem_add);

struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
					   struct device *device,
					   struct resource *res)
{
	struct hmm_devmem *devmem;
	int ret;

	if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
		return ERR_PTR(-EINVAL);

	dev_pagemap_get_ops();

	devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
				   GFP_KERNEL, dev_to_node(device));
	if (!devmem)
		return ERR_PTR(-ENOMEM);

	init_completion(&devmem->completion);
	devmem->pfn_first = -1UL;
	devmem->pfn_last = -1UL;
	devmem->resource = res;
	devmem->device = device;
	devmem->ops = ops;

	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
			      0, GFP_KERNEL);
	if (ret)
		goto error_percpu_ref;

	ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
	if (ret)
		goto error_devm_add_action;


	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
	devmem->pfn_last = devmem->pfn_first +
			   (resource_size(devmem->resource) >> PAGE_SHIFT);

	ret = hmm_devmem_pages_create(devmem);
	if (ret)
		goto error_devm_add_action;

	devres_add(device, devmem);

	ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
	if (ret) {
		hmm_devmem_remove(devmem);
		return ERR_PTR(ret);
	}

	return devmem;

error_devm_add_action:
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
error_percpu_ref:
	devres_free(devmem);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(hmm_devmem_add_resource);
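
/*
 * Example: hotplugging device private memory. This is an illustrative sketch
 * only; the my_devmem_* names are hypothetical driver code, and the callback
 * prototypes are shown as they are invoked from hmm_devmem_fault() and
 * hmm_devmem_free() above (see include/linux/hmm.h for the authoritative
 * struct hmm_devmem_ops definition). A driver typically does this once at
 * device initialization time:
 *
 *	static int my_devmem_fault(struct hmm_devmem *devmem,
 *				   struct vm_area_struct *vma,
 *				   unsigned long addr,
 *				   const struct page *page,
 *				   unsigned int flags,
 *				   pmd_t *pmdp)
 *	{
 *		// Migrate the device page back to system memory and map it.
 *	}
 *
 *	static void my_devmem_free(struct hmm_devmem *devmem, struct page *page)
 *	{
 *		// Reclaim the device memory backing this struct page.
 *	}
 *
 *	static const struct hmm_devmem_ops my_devmem_ops = {
 *		.fault = my_devmem_fault,
 *		.free = my_devmem_free,
 *	};
 *
 *	devmem = hmm_devmem_add(&my_devmem_ops, &pdev->dev, MY_DEVMEM_SIZE);
 *	if (IS_ERR(devmem))
 *		return PTR_ERR(devmem);
 *	// Device pages then span pfn_to_page(devmem->pfn_first) up to
 *	// pfn_to_page(devmem->pfn_last - 1).
 */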
/*
 * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
 *
 * @devmem: hmm_devmem struct used to track and manage the ZONE_DEVICE memory
 *
 * This will hot-unplug memory that was hotplugged by hmm_devmem_add() on
 * behalf of the device driver. It will free struct page and remove the
 * resource that reserved the physical address range for this device memory.
 */
void hmm_devmem_remove(struct hmm_devmem *devmem)
{
	resource_size_t start, size;
	struct device *device;
	bool cdm = false;

	if (!devmem)
		return;

	device = devmem->device;
	start = devmem->resource->start;
	size = resource_size(devmem->resource);

	cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY;
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
	hmm_devmem_pages_remove(devmem);

	if (!cdm)
		devm_release_mem_region(device, start, size);
}
EXPORT_SYMBOL(hmm_devmem_remove);

/*
 * A device driver that wants to handle multiple devices' memory through a
 * single fake device can use hmm_device to do so. This is purely a helper
 * and it is not needed to make use of any HMM functionality.
 */
#define HMM_DEVICE_MAX 256

static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
static DEFINE_SPINLOCK(hmm_device_lock);
static struct class *hmm_device_class;
static dev_t hmm_device_devt;

static void hmm_device_release(struct device *device)
{
	struct hmm_device *hmm_device;

	hmm_device = container_of(device, struct hmm_device, device);
	spin_lock(&hmm_device_lock);
	clear_bit(hmm_device->minor, hmm_device_mask);
	spin_unlock(&hmm_device_lock);

	kfree(hmm_device);
}

struct hmm_device *hmm_device_new(void *drvdata)
{
	struct hmm_device *hmm_device;

	hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
	if (!hmm_device)
		return ERR_PTR(-ENOMEM);

	spin_lock(&hmm_device_lock);
	hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
	if (hmm_device->minor >= HMM_DEVICE_MAX) {
		spin_unlock(&hmm_device_lock);
		kfree(hmm_device);
		return ERR_PTR(-EBUSY);
	}
	set_bit(hmm_device->minor, hmm_device_mask);
	spin_unlock(&hmm_device_lock);

	dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
	hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
					hmm_device->minor);
	hmm_device->device.release = hmm_device_release;
	dev_set_drvdata(&hmm_device->device, drvdata);
	hmm_device->device.class = hmm_device_class;
	device_initialize(&hmm_device->device);

	return hmm_device;
}
EXPORT_SYMBOL(hmm_device_new);

void hmm_device_put(struct hmm_device *hmm_device)
{
	put_device(&hmm_device->device);
}
EXPORT_SYMBOL(hmm_device_put);

static int __init hmm_init(void)
{
	int ret;

	ret = alloc_chrdev_region(&hmm_device_devt, 0,
				  HMM_DEVICE_MAX,
				  "hmm_device");
	if (ret)
		return ret;

	hmm_device_class = class_create(THIS_MODULE, "hmm_device");
	if (IS_ERR(hmm_device_class)) {
		unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
		return PTR_ERR(hmm_device_class);
	}
	return 0;
}

device_initcall(hmm_init);
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */