// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/mm.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)

#if IS_ENABLED(CONFIG_HMM_MIRROR)
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;

static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
{
	struct hmm *hmm = READ_ONCE(mm->hmm);

	if (hmm && kref_get_unless_zero(&hmm->kref))
		return hmm;

	return NULL;
}

/**
 * hmm_get_or_create - register HMM against an mm (HMM internal)
 *
 * @mm: mm struct to attach to
 * Returns: an HMM object, either by referencing the existing (per-process)
 *          object, or by creating a new one.
 *
 * This is not intended to be used directly by device drivers. If mm already
 * has an HMM struct then we take a reference on it and return it. Otherwise
 * we allocate an HMM struct, initialize it, associate it with the mm and
 * return it.
 */
static struct hmm *hmm_get_or_create(struct mm_struct *mm)
{
	struct hmm *hmm = mm_get_hmm(mm);
	bool cleanup = false;

	if (hmm)
		return hmm;

	hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
	if (!hmm)
		return NULL;
	init_waitqueue_head(&hmm->wq);
	INIT_LIST_HEAD(&hmm->mirrors);
	init_rwsem(&hmm->mirrors_sem);
	hmm->mmu_notifier.ops = NULL;
	INIT_LIST_HEAD(&hmm->ranges);
	mutex_init(&hmm->lock);
	kref_init(&hmm->kref);
	hmm->notifiers = 0;
	hmm->dead = false;
	hmm->mm = mm;

	spin_lock(&mm->page_table_lock);
	if (!mm->hmm)
		mm->hmm = hmm;
	else
		cleanup = true;
	spin_unlock(&mm->page_table_lock);

	if (cleanup)
		goto error;

	/*
	 * We should only get here if we hold the mmap_sem in write mode, ie on
	 * registration of the first mirror through hmm_mirror_register().
	 */
	hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
	if (__mmu_notifier_register(&hmm->mmu_notifier, mm))
		goto error_mm;

	return hmm;

error_mm:
	spin_lock(&mm->page_table_lock);
	if (mm->hmm == hmm)
		mm->hmm = NULL;
	spin_unlock(&mm->page_table_lock);
error:
	kfree(hmm);
	return NULL;
}

static void hmm_free(struct kref *kref)
{
	struct hmm *hmm = container_of(kref, struct hmm, kref);
	struct mm_struct *mm = hmm->mm;

	mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);

	spin_lock(&mm->page_table_lock);
	if (mm->hmm == hmm)
		mm->hmm = NULL;
	spin_unlock(&mm->page_table_lock);

	kfree(hmm);
}

static inline void hmm_put(struct hmm *hmm)
{
	kref_put(&hmm->kref, hmm_free);
}

void hmm_mm_destroy(struct mm_struct *mm)
{
	struct hmm *hmm;

	spin_lock(&mm->page_table_lock);
	hmm = mm_get_hmm(mm);
	mm->hmm = NULL;
	if (hmm) {
		hmm->mm = NULL;
		hmm->dead = true;
		spin_unlock(&mm->page_table_lock);
		hmm_put(hmm);
		return;
	}

	spin_unlock(&mm->page_table_lock);
}

static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct hmm *hmm = mm_get_hmm(mm);
	struct hmm_mirror *mirror;
	struct hmm_range *range;

	/* Report this HMM as dying. */
	hmm->dead = true;

	/* Wake-up everyone waiting on any range. */
	mutex_lock(&hmm->lock);
	list_for_each_entry(range, &hmm->ranges, list) {
		range->valid = false;
	}
	wake_up_all(&hmm->wq);
	mutex_unlock(&hmm->lock);

	down_write(&hmm->mirrors_sem);
	mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
					  list);
	while (mirror) {
		list_del_init(&mirror->list);
		if (mirror->ops->release) {
			/*
			 * Drop mirrors_sem so callback can wait on any pending
			 * work that might itself trigger mmu_notifier callback
			 * and thus would deadlock with us.
			 */
			up_write(&hmm->mirrors_sem);
			mirror->ops->release(mirror);
			down_write(&hmm->mirrors_sem);
		}
		mirror = list_first_entry_or_null(&hmm->mirrors,
						  struct hmm_mirror, list);
	}
	up_write(&hmm->mirrors_sem);

	hmm_put(hmm);
}

static int hmm_invalidate_range_start(struct mmu_notifier *mn,
				      const struct mmu_notifier_range *nrange)
{
	struct hmm *hmm = mm_get_hmm(nrange->mm);
	struct hmm_mirror *mirror;
	struct hmm_update update;
	struct hmm_range *range;
	int ret = 0;

	VM_BUG_ON(!hmm);

	update.start = nrange->start;
	update.end = nrange->end;
	update.event = HMM_UPDATE_INVALIDATE;
	update.blockable = mmu_notifier_range_blockable(nrange);

	if (mmu_notifier_range_blockable(nrange))
		mutex_lock(&hmm->lock);
	else if (!mutex_trylock(&hmm->lock)) {
		ret = -EAGAIN;
		goto out;
	}
	hmm->notifiers++;
	list_for_each_entry(range, &hmm->ranges, list) {
		if (update.end < range->start || update.start >= range->end)
			continue;

		range->valid = false;
	}
	mutex_unlock(&hmm->lock);

	if (mmu_notifier_range_blockable(nrange))
		down_read(&hmm->mirrors_sem);
	else if (!down_read_trylock(&hmm->mirrors_sem)) {
		ret = -EAGAIN;
		goto out;
	}
	list_for_each_entry(mirror, &hmm->mirrors, list) {
		int ret;

		ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update);
		if (!update.blockable && ret == -EAGAIN) {
			up_read(&hmm->mirrors_sem);
			ret = -EAGAIN;
			goto out;
		}
	}
	up_read(&hmm->mirrors_sem);

out:
	hmm_put(hmm);
	return ret;
}

static void hmm_invalidate_range_end(struct mmu_notifier *mn,
				     const struct mmu_notifier_range *nrange)
{
	struct hmm *hmm = mm_get_hmm(nrange->mm);

	VM_BUG_ON(!hmm);

	mutex_lock(&hmm->lock);
	hmm->notifiers--;
	if (!hmm->notifiers) {
		struct hmm_range *range;

		list_for_each_entry(range, &hmm->ranges, list) {
			if (range->valid)
				continue;
			range->valid = true;
		}
		wake_up_all(&hmm->wq);
	}
	mutex_unlock(&hmm->lock);

	hmm_put(hmm);
}

static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
	.release = hmm_release,
	.invalidate_range_start = hmm_invalidate_range_start,
	.invalidate_range_end = hmm_invalidate_range_end,
};

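/*
 * Example (illustrative only, not built as part of this file): a minimal
 * sketch of the mirror callback that the invalidation path above invokes.
 * The driver type, its page-table helpers and its lock are hypothetical;
 * only struct hmm_mirror, struct hmm_update and the -EAGAIN convention for
 * non-blockable updates come from the HMM API.
 *
 *	static int my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
 *					const struct hmm_update *update)
 *	{
 *		struct my_device *mydev = container_of(mirror,
 *					struct my_device, mirror);
 *
 *		if (update->blockable)
 *			mutex_lock(&mydev->pt_lock);
 *		else if (!mutex_trylock(&mydev->pt_lock))
 *			return -EAGAIN;
 *
 *		// Tear down the device page table for [start, end) and flush
 *		// any device TLB before returning (my_device_unmap() is a
 *		// stand-in for the driver's own helper).
 *		my_device_unmap(mydev, update->start, update->end);
 *		mutex_unlock(&mydev->pt_lock);
 *		return 0;
 *	}
 *
 *	static const struct hmm_mirror_ops my_mirror_ops = {
 *		.sync_cpu_device_pagetables = my_sync_cpu_device_pagetables,
 *		.release = my_mirror_release,	// hypothetical teardown hook
 *	};
 */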
/*
 * hmm_mirror_register() - register a mirror against an mm
 *
 * @mirror: new mirror struct to register
 * @mm: mm to register against
 *
 * To start mirroring a process address space, the device driver must register
 * an HMM mirror struct.
 *
 * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
 */
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
	/* Sanity check */
	if (!mm || !mirror || !mirror->ops)
		return -EINVAL;

	mirror->hmm = hmm_get_or_create(mm);
	if (!mirror->hmm)
		return -ENOMEM;

	down_write(&mirror->hmm->mirrors_sem);
	list_add(&mirror->list, &mirror->hmm->mirrors);
	up_write(&mirror->hmm->mirrors_sem);

	return 0;
}
EXPORT_SYMBOL(hmm_mirror_register);

/*
 * hmm_mirror_unregister() - unregister a mirror
 *
 * @mirror: mirror struct to unregister
 *
 * Stop mirroring a process address space, and clean up.
 */
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
	struct hmm *hmm = READ_ONCE(mirror->hmm);

	if (hmm == NULL)
		return;

	down_write(&hmm->mirrors_sem);
	list_del_init(&mirror->list);
	/* To protect us against double unregister ... */
	mirror->hmm = NULL;
	up_write(&hmm->mirrors_sem);

	hmm_put(hmm);
}
EXPORT_SYMBOL(hmm_mirror_unregister);

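/*
 * Example (illustrative only): registering the hypothetical mirror from the
 * sketch above for the current process. Everything except
 * hmm_mirror_register()/hmm_mirror_unregister() and the mmap_sem rule stated
 * in the kernel-doc above is an assumption about how a driver might structure
 * this.
 *
 *	static int my_device_mirror_current(struct my_device *mydev)
 *	{
 *		int ret;
 *
 *		mydev->mirror.ops = &my_mirror_ops;
 *
 *		down_write(&current->mm->mmap_sem);
 *		ret = hmm_mirror_register(&mydev->mirror, current->mm);
 *		up_write(&current->mm->mmap_sem);
 *		return ret;
 *	}
 *
 * The mirror is dropped with hmm_mirror_unregister(&mydev->mirror) once the
 * device is done with this address space.
 */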
struct hmm_vma_walk {
	struct hmm_range	*range;
	struct dev_pagemap	*pgmap;
	unsigned long		last;
	bool			fault;
	bool			block;
};

static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
			    bool write_fault, uint64_t *pfn)
{
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	vm_fault_t ret;

	flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
	flags |= write_fault ? FAULT_FLAG_WRITE : 0;
	ret = handle_mm_fault(vma, addr, flags);
	if (ret & VM_FAULT_RETRY)
		return -EAGAIN;
	if (ret & VM_FAULT_ERROR) {
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	}

	return -EBUSY;
}

static int hmm_pfns_bad(unsigned long addr,
			unsigned long end,
			struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = range->values[HMM_PFN_ERROR];

	return 0;
}

/*
 * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s)
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @fault: should we fault or not ?
 * @write_fault: write fault ?
 * @walk: mm_walk structure
 * Returns: 0 on success, -EBUSY after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
			      bool fault, bool write_fault,
			      struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i, page_size;

	hmm_vma_walk->last = addr;
	page_size = hmm_range_page_size(range);
	i = (addr - range->start) >> range->page_shift;

	for (; addr < end; addr += page_size, i++) {
		pfns[i] = range->values[HMM_PFN_NONE];
		if (fault || write_fault) {
			int ret;

			ret = hmm_vma_do_fault(walk, addr, write_fault,
					       &pfns[i]);
			if (ret != -EBUSY)
				return ret;
		}
	}

	return (fault || write_fault) ? -EBUSY : 0;
}

static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				      uint64_t pfns, uint64_t cpu_flags,
				      bool *fault, bool *write_fault)
{
	struct hmm_range *range = hmm_vma_walk->range;

	if (!hmm_vma_walk->fault)
		return;

	/*
	 * We consider not only the individual per-page request but also the
	 * default flags requested for the range. The API can be used in two
	 * fashions: one where the HMM user coalesces multiple page faults
	 * into one request and sets flags per pfn for each of those faults,
	 * and one where the HMM user wants to pre-fault a range with specific
	 * flags. For the latter it would be a waste to have the user pre-fill
	 * the pfn array with a default flags value.
	 */
	pfns = (pfns & range->pfn_flags_mask) | range->default_flags;

	/* We aren't asked to do anything ... */
	if (!(pfns & range->flags[HMM_PFN_VALID]))
		return;
	/* If this is device memory then only fault if explicitly requested */
	if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
		/* Do we fault on device memory ? */
		if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
			*write_fault = pfns & range->flags[HMM_PFN_WRITE];
			*fault = true;
		}
		return;
	}

	/* If CPU page table is not valid then we need to fault */
	*fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
	/* Do we need to write fault ? */
	if ((pfns & range->flags[HMM_PFN_WRITE]) &&
	    !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
		*write_fault = true;
		*fault = true;
	}
}

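/*
 * Example (illustrative only): the two ways of driving the fault decision
 * above through struct hmm_range. The flag values are whatever the driver
 * registered in range->flags[]; the snippet only shows how default_flags and
 * pfn_flags_mask combine with the per-pfn entries.
 *
 *	// Pre-fault every page of the range with at least read permission:
 *	range->default_flags = range->flags[HMM_PFN_VALID];
 *	range->pfn_flags_mask = 0;		// ignore per-pfn requests
 *
 *	// Same, but additionally request write access for one page i:
 *	range->default_flags = range->flags[HMM_PFN_VALID];
 *	range->pfn_flags_mask = range->flags[HMM_PFN_WRITE];
 *	range->pfns[i] = range->flags[HMM_PFN_WRITE];
 */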
static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				 const uint64_t *pfns, unsigned long npages,
				 uint64_t cpu_flags, bool *fault,
				 bool *write_fault)
{
	unsigned long i;

	if (!hmm_vma_walk->fault) {
		*fault = *write_fault = false;
		return;
	}

	*fault = *write_fault = false;
	for (i = 0; i < npages; ++i) {
		hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
				   fault, write_fault);
		if ((*write_fault))
			return;
	}
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	unsigned long i, npages;
	uint64_t *pfns;

	i = (addr - range->start) >> PAGE_SHIFT;
	npages = (end - addr) >> PAGE_SHIFT;
	pfns = &range->pfns[i];
	hmm_range_need_fault(hmm_vma_walk, pfns, npages,
			     0, &fault, &write_fault);
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
	if (pmd_protnone(pmd))
		return 0;
	return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
{
	if (!pud_present(pud))
		return 0;
	return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pmd(struct mm_walk *walk,
			      unsigned long addr,
			      unsigned long end,
			      uint64_t *pfns,
			      pmd_t pmd)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long pfn, npages, i;
	bool fault, write_fault;
	uint64_t cpu_flags;

	npages = (end - addr) >> PAGE_SHIFT;
	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
			     &fault, &write_fault);

	if (pmd_protnone(pmd) || fault || write_fault)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	pfn = pmd_pfn(pmd) + pte_index(addr);
	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
		if (pmd_devmap(pmd)) {
			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
						hmm_vma_walk->pgmap);
			if (unlikely(!hmm_vma_walk->pgmap))
				return -EBUSY;
		}
		pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
	}
	if (hmm_vma_walk->pgmap) {
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	hmm_vma_walk->last = end;
	return 0;
#else
	/* If THP is not enabled then we should never reach this code ! */
	return -EINVAL;
#endif
}

static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
	if (pte_none(pte) || !pte_present(pte))
		return 0;
	return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      uint64_t *pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	bool fault, write_fault;
	uint64_t cpu_flags;
	pte_t pte = *ptep;
	uint64_t orig_pfn = *pfn;

	*pfn = range->values[HMM_PFN_NONE];
	fault = write_fault = false;

	if (pte_none(pte)) {
		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
				   &fault, &write_fault);
		if (fault || write_fault)
			goto fault;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (!non_swap_entry(entry)) {
			if (fault || write_fault)
				goto fault;
			return 0;
		}

		/*
		 * This is a special swap entry: handle migration, use device
		 * memory, and report anything else as an error.
		 */
		if (is_device_private_entry(entry)) {
			cpu_flags = range->flags[HMM_PFN_VALID] |
				    range->flags[HMM_PFN_DEVICE_PRIVATE];
			cpu_flags |= is_write_device_private_entry(entry) ?
				     range->flags[HMM_PFN_WRITE] : 0;
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			*pfn = hmm_device_entry_from_pfn(range,
							 swp_offset(entry));
			*pfn |= cpu_flags;
			return 0;
		}

		if (is_migration_entry(entry)) {
			if (fault || write_fault) {
				pte_unmap(ptep);
				hmm_vma_walk->last = addr;
				migration_entry_wait(vma->vm_mm,
						     pmdp, addr);
				return -EBUSY;
			}
			return 0;
		}

		/* Report error for everything else */
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	} else {
		cpu_flags = pte_to_hmm_pfn_flags(range, pte);
		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
				   &fault, &write_fault);
	}

	if (fault || write_fault)
		goto fault;

	if (pte_devmap(pte)) {
		hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
					hmm_vma_walk->pgmap);
		if (unlikely(!hmm_vma_walk->pgmap))
			return -EBUSY;
	} else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
		*pfn = range->values[HMM_PFN_SPECIAL];
		return -EFAULT;
	}

	*pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
	return 0;

fault:
	if (hmm_vma_walk->pgmap) {
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	uint64_t *pfns = range->pfns;
	unsigned long addr = start, i;
	pte_t *ptep;
	pmd_t pmd;


again:
	pmd = READ_ONCE(*pmdp);
	if (pmd_none(pmd))
		return hmm_vma_walk_hole(start, end, walk);

	if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB))
		return hmm_pfns_bad(start, end, walk);

	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
		bool fault, write_fault;
		unsigned long npages;
		uint64_t *pfns;

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     0, &fault, &write_fault);
		if (fault || write_fault) {
			hmm_vma_walk->last = addr;
			pmd_migration_entry_wait(vma->vm_mm, pmdp);
			return -EBUSY;
		}
		return 0;
	} else if (!pmd_present(pmd))
		return hmm_pfns_bad(start, end, walk);

	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
		/*
		 * No need to take the pmd_lock here: even if some other thread
		 * is splitting the huge pmd we will get that event through the
		 * mmu_notifier callback.
		 *
		 * So just read the pmd value and check again that it is a
		 * transparent huge or device mapping one, then compute the
		 * corresponding pfn values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		i = (addr - range->start) >> PAGE_SHIFT;
		return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
	}

	/*
	 * We have handled all the valid cases above, ie either none,
	 * migration, huge or transparent huge. At this point either it is a
	 * valid pmd entry pointing to a pte directory or it is a bad pmd that
	 * will not recover.
	 */
	if (pmd_bad(pmd))
		return hmm_pfns_bad(start, end, walk);

	ptep = pte_offset_map(pmdp, addr);
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
		if (r) {
			/* hmm_vma_handle_pte() did unmap pte directory */
			hmm_vma_walk->last = addr;
			return r;
		}
	}
	if (hmm_vma_walk->pgmap) {
		/*
		 * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
		 * so that we can leverage the get_dev_pagemap() optimization
		 * which will not re-take a reference on a pgmap if we already
		 * have one.
		 */
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	pte_unmap(ptep - 1);

	hmm_vma_walk->last = addr;
	return 0;
}

static int hmm_vma_walk_pud(pud_t *pudp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long addr = start, next;
	pmd_t *pmdp;
	pud_t pud;
	int ret;

again:
	pud = READ_ONCE(*pudp);
	if (pud_none(pud))
		return hmm_vma_walk_hole(start, end, walk);

	if (pud_huge(pud) && pud_devmap(pud)) {
		unsigned long i, npages, pfn;
		uint64_t *pfns, cpu_flags;
		bool fault, write_fault;

		if (!pud_present(pud))
			return hmm_vma_walk_hole(start, end, walk);

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     cpu_flags, &fault, &write_fault);
		if (fault || write_fault)
			return hmm_vma_walk_hole_(addr, end, fault,
						  write_fault, walk);

#ifdef CONFIG_HUGETLB_PAGE
		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
		for (i = 0; i < npages; ++i, ++pfn) {
			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
						hmm_vma_walk->pgmap);
			if (unlikely(!hmm_vma_walk->pgmap))
				return -EBUSY;
			pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				  cpu_flags;
		}
		if (hmm_vma_walk->pgmap) {
			put_dev_pagemap(hmm_vma_walk->pgmap);
			hmm_vma_walk->pgmap = NULL;
		}
		hmm_vma_walk->last = end;
		return 0;
#else
		return -EINVAL;
#endif
	}

	split_huge_pud(walk->vma, pudp, addr);
	if (pud_none(*pudp))
		goto again;

	pmdp = pmd_offset(pudp, addr);
	do {
		next = pmd_addr_end(addr, end);
		ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
		if (ret)
			return ret;
	} while (pmdp++, addr = next, addr != end);

	return 0;
}

static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
				      unsigned long start, unsigned long end,
				      struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
	unsigned long addr = start, i, pfn, mask, size, pfn_inc;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	uint64_t orig_pfn, cpu_flags;
	bool fault, write_fault;
	spinlock_t *ptl;
	pte_t entry;
	int ret = 0;

	size = 1UL << huge_page_shift(h);
	mask = size - 1;
	if (range->page_shift != PAGE_SHIFT) {
		/* Make sure we are looking at a full page. */
		if (start & mask)
			return -EINVAL;
		if (end < (start + size))
			return -EINVAL;
		pfn_inc = size >> PAGE_SHIFT;
	} else {
		pfn_inc = 1;
		size = PAGE_SIZE;
	}

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(pte);

	i = (start - range->start) >> range->page_shift;
	orig_pfn = range->pfns[i];
	range->pfns[i] = range->values[HMM_PFN_NONE];
	cpu_flags = pte_to_hmm_pfn_flags(range, entry);
	fault = write_fault = false;
	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
			   &fault, &write_fault);
	if (fault || write_fault) {
		ret = -ENOENT;
		goto unlock;
	}

	pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift);
	for (; addr < end; addr += size, i++, pfn += pfn_inc)
		range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				 cpu_flags;
	hmm_vma_walk->last = end;

unlock:
	spin_unlock(ptl);

	if (ret == -ENOENT)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	return ret;
#else /* CONFIG_HUGETLB_PAGE */
	return -EINVAL;
#endif
}

static void hmm_pfns_clear(struct hmm_range *range,
			   uint64_t *pfns,
			   unsigned long addr,
			   unsigned long end)
{
	for (; addr < end; addr += PAGE_SIZE, pfns++)
		*pfns = range->values[HMM_PFN_NONE];
}

/*
 * hmm_range_register() - start tracking change to CPU page table over a range
 * @range: range
 * @mm: the mm struct for the range of virtual address
 * @start: start virtual address (inclusive)
 * @end: end virtual address (exclusive)
 * @page_shift: expected page shift for the range
 * Returns 0 on success, -EFAULT if the address space is no longer valid
 *
 * Track updates to the CPU page table; see include/linux/hmm.h
 */
int hmm_range_register(struct hmm_range *range,
		       struct mm_struct *mm,
		       unsigned long start,
		       unsigned long end,
		       unsigned page_shift)
{
	unsigned long mask = ((1UL << page_shift) - 1UL);

	range->valid = false;
	range->hmm = NULL;

	if ((start & mask) || (end & mask))
		return -EINVAL;
	if (start >= end)
		return -EINVAL;

	range->page_shift = page_shift;
	range->start = start;
	range->end = end;

	range->hmm = hmm_get_or_create(mm);
	if (!range->hmm)
		return -EFAULT;

	/* Check if hmm_mm_destroy() was called. */
	if (range->hmm->mm == NULL || range->hmm->dead) {
		hmm_put(range->hmm);
		return -EFAULT;
	}

	/* Initialize range to track CPU page table update */
	mutex_lock(&range->hmm->lock);

	list_add_rcu(&range->list, &range->hmm->ranges);

	/*
	 * If there are any concurrent notifiers we have to wait for them for
	 * the range to be valid (see hmm_range_wait_until_valid()).
	 */
	if (!range->hmm->notifiers)
		range->valid = true;
	mutex_unlock(&range->hmm->lock);

	return 0;
}
EXPORT_SYMBOL(hmm_range_register);

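/*
 * Example (illustrative only): how a driver is expected to pair
 * hmm_range_register() with hmm_range_wait_until_valid() before snapshotting
 * or faulting a range. The flags/values tables, pfn_shift and the timeout are
 * driver policy, not something this file mandates.
 *
 *	range.pfns = pfns;
 *	range.flags = my_hmm_range_flags;	// driver-supplied table
 *	range.values = my_hmm_range_values;	// driver-supplied table
 *	range.pfn_shift = MY_PFN_SHIFT;		// hypothetical
 *
 *	ret = hmm_range_register(&range, mm, start, end, PAGE_SHIFT);
 *	if (ret)
 *		return ret;
 *
 *	// timeout_ms is a driver-chosen value in milliseconds
 *	if (!hmm_range_wait_until_valid(&range, timeout_ms)) {
 *		hmm_range_unregister(&range);
 *		return -EBUSY;	// or retry, at the driver's discretion
 *	}
 */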
/*
 * hmm_range_unregister() - stop tracking change to CPU page table over a range
 * @range: range
 *
 * Range struct is used to track updates to the CPU page table after a call to
 * hmm_range_register(). See include/linux/hmm.h for how to use it.
 */
void hmm_range_unregister(struct hmm_range *range)
{
	/* Sanity check: this really should not happen. */
	if (range->hmm == NULL || range->end <= range->start)
		return;

	mutex_lock(&range->hmm->lock);
	list_del_rcu(&range->list);
	mutex_unlock(&range->hmm->lock);

	/* Drop reference taken by hmm_range_register() */
	range->valid = false;
	hmm_put(range->hmm);
	range->hmm = NULL;
}
EXPORT_SYMBOL(hmm_range_unregister);

/*
 * hmm_range_snapshot() - snapshot CPU page table for a range
 * @range: range
 * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
 *          permission (for instance asking for write and range is read only),
 *          -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid
 *          vma or it is illegal to access that range), otherwise the number
 *          of valid pages in range->pfns[] (from range start address).
 *
 * This snapshots the CPU page table for a range of virtual addresses. Snapshot
 * validity is tracked by the range struct. See include/linux/hmm.h for an
 * example of how to use it.
 */
long hmm_range_snapshot(struct hmm_range *range)
{
	const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
	unsigned long start = range->start, end;
	struct hmm_vma_walk hmm_vma_walk;
	struct hmm *hmm = range->hmm;
	struct vm_area_struct *vma;
	struct mm_walk mm_walk;

	/* Check if hmm_mm_destroy() was called. */
	if (hmm->mm == NULL || hmm->dead)
		return -EFAULT;

	do {
		/* If range is no longer valid force retry. */
		if (!range->valid)
			return -EAGAIN;

		vma = find_vma(hmm->mm, start);
		if (vma == NULL || (vma->vm_flags & device_vma))
			return -EFAULT;

		if (is_vm_hugetlb_page(vma)) {
			struct hstate *h = hstate_vma(vma);

			if (huge_page_shift(h) != range->page_shift &&
			    range->page_shift != PAGE_SHIFT)
				return -EINVAL;
		} else {
			if (range->page_shift != PAGE_SHIFT)
				return -EINVAL;
		}

		if (!(vma->vm_flags & VM_READ)) {
			/*
			 * If the vma does not allow read access, then assume
			 * that it does not allow write access, either. HMM
			 * does not support architectures that allow write
			 * without read.
			 */
			hmm_pfns_clear(range, range->pfns,
				       range->start, range->end);
			return -EPERM;
		}

		range->vma = vma;
		hmm_vma_walk.pgmap = NULL;
		hmm_vma_walk.last = start;
		hmm_vma_walk.fault = false;
		hmm_vma_walk.range = range;
		mm_walk.private = &hmm_vma_walk;
		end = min(range->end, vma->vm_end);

		mm_walk.vma = vma;
		mm_walk.mm = vma->vm_mm;
		mm_walk.pte_entry = NULL;
		mm_walk.test_walk = NULL;
		mm_walk.hugetlb_entry = NULL;
		mm_walk.pud_entry = hmm_vma_walk_pud;
		mm_walk.pmd_entry = hmm_vma_walk_pmd;
		mm_walk.pte_hole = hmm_vma_walk_hole;
		mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;

		walk_page_range(start, end, &mm_walk);
		start = end;
	} while (start < range->end);

	return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_snapshot);

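/*
 * Example (illustrative only): the retry loop a driver is expected to wrap
 * around hmm_range_snapshot(), loosely following Documentation/vm/hmm.rst.
 * The device page-table update and its lock are hypothetical; the structure
 * (mmap_sem read lock, -EAGAIN retry, re-checking validity under the same
 * driver lock that sync_cpu_device_pagetables() takes) is what this function
 * relies on.
 *
 *	again:
 *		down_read(&mm->mmap_sem);
 *		ret = hmm_range_snapshot(&range);
 *		if (ret < 0) {
 *			up_read(&mm->mmap_sem);
 *			if (ret == -EAGAIN &&
 *			    hmm_range_wait_until_valid(&range, timeout_ms))
 *				goto again;
 *			goto out;
 *		}
 *
 *		mutex_lock(&mydev->pt_lock);
 *		if (!hmm_range_valid(&range)) {
 *			mutex_unlock(&mydev->pt_lock);
 *			up_read(&mm->mmap_sem);
 *			goto again;
 *		}
 *		my_device_populate_page_table(mydev, &range);
 *		mutex_unlock(&mydev->pt_lock);
 *		up_read(&mm->mmap_sem);
 */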
/*
 * hmm_range_fault() - try to fault some address in a virtual address range
 * @range: range being faulted
 * @block: allow blocking on fault (if true it sleeps and does not drop
 *         the mmap_sem)
 * Returns: number of valid pages in range->pfns[] (from range start
 *          address). This may be zero. If the return value is negative,
 *          then one of the following values may be returned:
 *
 *           -EINVAL: invalid arguments, or mm or virtual address is in an
 *                    invalid vma (for instance device file vma).
 *           -ENOMEM: Out of memory.
 *           -EPERM:  Invalid permission (for instance asking for write and
 *                    range is read only).
 *           -EAGAIN: If you need to retry and mmap_sem was dropped. This can
 *                    only happen if the block argument is false.
 *           -EBUSY:  If the range is being invalidated and you should wait
 *                    for invalidation to finish.
 *           -EFAULT: Invalid (ie either no valid vma or it is illegal to
 *                    access that range), number of valid pages in
 *                    range->pfns[] (from range start address).
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs
 * and the caller does not ask for migration.
 *
 * On error, for one virtual address in the range, the function will mark the
 * corresponding HMM pfn entry with an error flag.
 */
long hmm_range_fault(struct hmm_range *range, bool block)
{
	const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
	unsigned long start = range->start, end;
	struct hmm_vma_walk hmm_vma_walk;
	struct hmm *hmm = range->hmm;
	struct vm_area_struct *vma;
	struct mm_walk mm_walk;
	int ret;

	/* Check if hmm_mm_destroy() was called. */
	if (hmm->mm == NULL || hmm->dead)
		return -EFAULT;

	do {
		/* If range is no longer valid force retry. */
		if (!range->valid) {
			up_read(&hmm->mm->mmap_sem);
			return -EAGAIN;
		}

		vma = find_vma(hmm->mm, start);
		if (vma == NULL || (vma->vm_flags & device_vma))
			return -EFAULT;

		if (is_vm_hugetlb_page(vma)) {
			if (huge_page_shift(hstate_vma(vma)) !=
			    range->page_shift &&
			    range->page_shift != PAGE_SHIFT)
				return -EINVAL;
		} else {
			if (range->page_shift != PAGE_SHIFT)
				return -EINVAL;
		}

		if (!(vma->vm_flags & VM_READ)) {
			/*
			 * If the vma does not allow read access, then assume
			 * that it does not allow write access, either. HMM
			 * does not support architectures that allow write
			 * without read.
			 */
			hmm_pfns_clear(range, range->pfns,
				       range->start, range->end);
			return -EPERM;
		}

		range->vma = vma;
		hmm_vma_walk.pgmap = NULL;
		hmm_vma_walk.last = start;
		hmm_vma_walk.fault = true;
		hmm_vma_walk.block = block;
		hmm_vma_walk.range = range;
		mm_walk.private = &hmm_vma_walk;
		end = min(range->end, vma->vm_end);

		mm_walk.vma = vma;
		mm_walk.mm = vma->vm_mm;
		mm_walk.pte_entry = NULL;
		mm_walk.test_walk = NULL;
		mm_walk.hugetlb_entry = NULL;
		mm_walk.pud_entry = hmm_vma_walk_pud;
		mm_walk.pmd_entry = hmm_vma_walk_pmd;
		mm_walk.pte_hole = hmm_vma_walk_hole;
		mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;

		do {
			ret = walk_page_range(start, end, &mm_walk);
			start = hmm_vma_walk.last;

			/* Keep trying while the range is valid. */
		} while (ret == -EBUSY && range->valid);

		if (ret) {
			unsigned long i;

			i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
			hmm_pfns_clear(range, &range->pfns[i],
				       hmm_vma_walk.last, range->end);
			return ret;
		}
		start = end;

	} while (start < range->end);

	return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_fault);

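/*
 * Example (illustrative only): turning the entries that hmm_range_fault()
 * (or hmm_range_snapshot()) filled in back into struct page pointers while
 * the range is known to be valid. hmm_device_entry_to_page() and the flag
 * tables are part of the HMM API; what the driver does with the pages is up
 * to it.
 *
 *	unsigned long i, npages = (range.end - range.start) >> PAGE_SHIFT;
 *
 *	for (i = 0; i < npages; ++i) {
 *		struct page *page;
 *
 *		page = hmm_device_entry_to_page(&range, range.pfns[i]);
 *		if (!page)
 *			continue;	// hole, or entry marked none/error
 *		// range.pfns[i] & range.flags[HMM_PFN_WRITE] tells whether
 *		// the CPU mapping is writable; map the page into the device
 *		// page table accordingly.
 *	}
 */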
/**
 * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one.
 * @range: range being faulted
 * @device: device against which to dma map pages
 * @daddrs: dma address of mapped pages
 * @block: allow blocking on fault (if true it sleeps and does not drop
 *         the mmap_sem)
 * Returns: number of pages mapped on success, -EAGAIN if mmap_sem has been
 *          dropped and you need to try again, some other error value
 *          otherwise
 *
 * Note: same usage pattern as hmm_range_fault().
 */
long hmm_range_dma_map(struct hmm_range *range,
		       struct device *device,
		       dma_addr_t *daddrs,
		       bool block)
{
	unsigned long i, npages, mapped;
	long ret;

	ret = hmm_range_fault(range, block);
	if (ret <= 0)
		return ret ? ret : -EBUSY;

	npages = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0, mapped = 0; i < npages; ++i) {
		enum dma_data_direction dir = DMA_TO_DEVICE;
		struct page *page;

		/*
		 * FIXME need to update DMA API to provide invalid DMA address
		 * value instead of a function to test dma address value. This
		 * would remove a lot of dumb code duplicated across many
		 * architectures.
		 *
		 * For now setting it to 0 here is good enough as the pfns[]
		 * value is what is used to check what is valid and what isn't.
		 */
		daddrs[i] = 0;

		page = hmm_device_entry_to_page(range, range->pfns[i]);
		if (page == NULL)
			continue;

		/* Check if range is being invalidated */
		if (!range->valid) {
			ret = -EBUSY;
			goto unmap;
		}

		/* If it is read and write then map bi-directional. */
		if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
			dir = DMA_BIDIRECTIONAL;

		daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
		if (dma_mapping_error(device, daddrs[i])) {
			ret = -EFAULT;
			goto unmap;
		}

		mapped++;
	}

	return mapped;

unmap:
	for (npages = i, i = 0; (i < npages) && mapped; ++i) {
		enum dma_data_direction dir = DMA_TO_DEVICE;
		struct page *page;

		page = hmm_device_entry_to_page(range, range->pfns[i]);
		if (page == NULL)
			continue;

		if (dma_mapping_error(device, daddrs[i]))
			continue;

		/* If it is read and write then it was mapped bi-directional. */
		if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
			dir = DMA_BIDIRECTIONAL;

		dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
		mapped--;
	}

	return ret;
}
EXPORT_SYMBOL(hmm_range_dma_map);

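/*
 * Example (illustrative only): pairing hmm_range_dma_map() with
 * hmm_range_dma_unmap() below. Sizing and allocating the daddrs array is the
 * caller's responsibility, one dma_addr_t per page of the range; the vma
 * argument of the unmap helper is optional and left NULL here.
 *
 *	npages = (range.end - range.start) >> PAGE_SHIFT;
 *	daddrs = kcalloc(npages, sizeof(*daddrs), GFP_KERNEL);
 *	if (!daddrs)
 *		return -ENOMEM;
 *
 *	mapped = hmm_range_dma_map(&range, dev, daddrs, true);
 *	if (mapped <= 0)
 *		goto out_free;
 *
 *	// ... program the device with daddrs[] ...
 *
 *	// dirty pages on unmap only if the device may have written to them
 *	hmm_range_dma_unmap(&range, NULL, dev, daddrs, device_wrote);
 * out_free:
 *	kfree(daddrs);
 */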
/**
 * hmm_range_dma_unmap() - unmap a range that was mapped with
 *			   hmm_range_dma_map()
 * @range: range being unmapped
 * @vma: the vma against which the range was mapped (optional)
 * @device: device against which dma map was done
 * @daddrs: dma address of mapped pages
 * @dirty: dirty page if it had the write flag set
 * Returns: number of pages unmapped on success, -EINVAL otherwise
 *
 * Note that the caller MUST abide by mmu notifiers, or use an HMM mirror and
 * abide by the sync_cpu_device_pagetables() callback, so that it is safe here
 * to call set_page_dirty(). The caller must also take appropriate locks to
 * prevent concurrent mmu notifiers or sync_cpu_device_pagetables() from
 * making progress.
 */
long hmm_range_dma_unmap(struct hmm_range *range,
			 struct vm_area_struct *vma,
			 struct device *device,
			 dma_addr_t *daddrs,
			 bool dirty)
{
	unsigned long i, npages;
	long cpages = 0;

	/* Sanity check. */
	if (range->end <= range->start)
		return -EINVAL;
	if (!daddrs)
		return -EINVAL;
	if (!range->pfns)
		return -EINVAL;

	npages = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0; i < npages; ++i) {
		enum dma_data_direction dir = DMA_TO_DEVICE;
		struct page *page;

		page = hmm_device_entry_to_page(range, range->pfns[i]);
		if (page == NULL)
			continue;

		/* If it is read and write then it was mapped bi-directional. */
		if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) {
			dir = DMA_BIDIRECTIONAL;

			/*
			 * See comments in function description on why it is
			 * safe here to call set_page_dirty()
			 */
			if (dirty)
				set_page_dirty(page);
		}

		/* Unmap and clear pfns/dma address */
		dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
		range->pfns[i] = range->values[HMM_PFN_NONE];
		/* FIXME see comments in hmm_range_dma_map() */
		daddrs[i] = 0;
		cpages++;
	}

	return cpages;
}
EXPORT_SYMBOL(hmm_range_dma_unmap);
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */


#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
				       unsigned long addr)
{
	struct page *page;

	page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
	if (!page)
		return NULL;
	lock_page(page);
	return page;
}
EXPORT_SYMBOL(hmm_vma_alloc_locked_page);


static void hmm_devmem_ref_release(struct percpu_ref *ref)
{
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	complete(&devmem->completion);
}

static void hmm_devmem_ref_exit(struct percpu_ref *ref)
{
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	wait_for_completion(&devmem->completion);
	percpu_ref_exit(ref);
}

static void hmm_devmem_ref_kill(struct percpu_ref *ref)
{
	percpu_ref_kill(ref);
}

static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma,
				   unsigned long addr,
				   const struct page *page,
				   unsigned int flags,
				   pmd_t *pmdp)
{
	struct hmm_devmem *devmem = page->pgmap->data;

	return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
}

static void hmm_devmem_free(struct page *page, void *data)
{
	struct hmm_devmem *devmem = data;

	page->mapping = NULL;

	devmem->ops->free(devmem, page);
}

/*
 * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
 *
 * @ops: memory event device driver callback (see struct hmm_devmem_ops)
 * @device: device struct to bind the resource to
 * @size: size in bytes of the device memory to add
 * Returns: pointer to new hmm_devmem struct, ERR_PTR otherwise
 *
 * This function first finds an empty range of physical address big enough to
 * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
 * in turn allocates struct pages. It does not do anything beyond that; all
 * events affecting the memory will go through the various callbacks provided
 * by the hmm_devmem_ops struct.
 *
 * The device driver should call this function during device initialization
 * and is then responsible for the memory management. HMM only provides
 * helpers.
 */
struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
				  struct device *device,
				  unsigned long size)
{
	struct hmm_devmem *devmem;
	resource_size_t addr;
	void *result;
	int ret;

	dev_pagemap_get_ops();

	devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
	if (!devmem)
		return ERR_PTR(-ENOMEM);

	init_completion(&devmem->completion);
	devmem->pfn_first = -1UL;
	devmem->pfn_last = -1UL;
	devmem->resource = NULL;
	devmem->device = device;
	devmem->ops = ops;

	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
			      0, GFP_KERNEL);
	if (ret)
		return ERR_PTR(ret);

	size = ALIGN(size, PA_SECTION_SIZE);
	addr = min((unsigned long)iomem_resource.end,
		   (1UL << MAX_PHYSMEM_BITS) - 1);
	addr = addr - size + 1UL;

	/*
	 * FIXME add a new helper to quickly walk resource tree and find free
	 * range
	 *
	 * FIXME what about ioport_resource resource ?
	 */
	for (; addr > size && addr >= iomem_resource.start; addr -= size) {
		ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
		if (ret != REGION_DISJOINT)
			continue;

		devmem->resource = devm_request_mem_region(device, addr, size,
							   dev_name(device));
		if (!devmem->resource)
			return ERR_PTR(-ENOMEM);
		break;
	}
	if (!devmem->resource)
		return ERR_PTR(-ERANGE);

	devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
	devmem->pfn_last = devmem->pfn_first +
			   (resource_size(devmem->resource) >> PAGE_SHIFT);
	devmem->page_fault = hmm_devmem_fault;

	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
	devmem->pagemap.res = *devmem->resource;
	devmem->pagemap.page_free = hmm_devmem_free;
	devmem->pagemap.altmap_valid = false;
	devmem->pagemap.ref = &devmem->ref;
	devmem->pagemap.data = devmem;
	devmem->pagemap.kill = hmm_devmem_ref_kill;
	devmem->pagemap.cleanup = hmm_devmem_ref_exit;

	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
	if (IS_ERR(result))
		return result;
	return devmem;
}
EXPORT_SYMBOL_GPL(hmm_devmem_add);

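/*
 * Example (illustrative only): hot-plugging 128MB of device private memory at
 * probe time. The ops table and the driver structure are hypothetical; only
 * hmm_devmem_add() and its ERR_PTR() return convention come from this file.
 *
 *	static const struct hmm_devmem_ops my_devmem_ops = {
 *		.free = my_devmem_free,		// device page is being freed
 *		.fault = my_devmem_fault,	// CPU fault on a device page
 *	};
 *
 *	static int my_device_add_memory(struct my_device *mydev)
 *	{
 *		mydev->devmem = hmm_devmem_add(&my_devmem_ops, mydev->dev,
 *					       128UL << 20);
 *		if (IS_ERR(mydev->devmem))
 *			return PTR_ERR(mydev->devmem);
 *		return 0;
 *	}
 */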
struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
					   struct device *device,
					   struct resource *res)
{
	struct hmm_devmem *devmem;
	void *result;
	int ret;

	if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
		return ERR_PTR(-EINVAL);

	dev_pagemap_get_ops();

	devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
	if (!devmem)
		return ERR_PTR(-ENOMEM);

	init_completion(&devmem->completion);
	devmem->pfn_first = -1UL;
	devmem->pfn_last = -1UL;
	devmem->resource = res;
	devmem->device = device;
	devmem->ops = ops;

	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
			      0, GFP_KERNEL);
	if (ret)
		return ERR_PTR(ret);

	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
	devmem->pfn_last = devmem->pfn_first +
			   (resource_size(devmem->resource) >> PAGE_SHIFT);
	devmem->page_fault = hmm_devmem_fault;

	devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
	devmem->pagemap.res = *devmem->resource;
	devmem->pagemap.page_free = hmm_devmem_free;
	devmem->pagemap.altmap_valid = false;
	devmem->pagemap.ref = &devmem->ref;
	devmem->pagemap.data = devmem;
	devmem->pagemap.kill = hmm_devmem_ref_kill;
	devmem->pagemap.cleanup = hmm_devmem_ref_exit;

	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
	if (IS_ERR(result))
		return result;
	return devmem;
}
EXPORT_SYMBOL_GPL(hmm_devmem_add_resource);

/*
 * A device driver that wants to handle multiple devices' memory through a
 * single fake device can use hmm_device to do so. This is purely a helper
 * and it is not needed to make use of any HMM functionality.
 */
#define HMM_DEVICE_MAX 256

static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
static DEFINE_SPINLOCK(hmm_device_lock);
static struct class *hmm_device_class;
static dev_t hmm_device_devt;

static void hmm_device_release(struct device *device)
{
	struct hmm_device *hmm_device;

	hmm_device = container_of(device, struct hmm_device, device);
	spin_lock(&hmm_device_lock);
	clear_bit(hmm_device->minor, hmm_device_mask);
	spin_unlock(&hmm_device_lock);

	kfree(hmm_device);
}

struct hmm_device *hmm_device_new(void *drvdata)
{
	struct hmm_device *hmm_device;

	hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
	if (!hmm_device)
		return ERR_PTR(-ENOMEM);

	spin_lock(&hmm_device_lock);
	hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
	if (hmm_device->minor >= HMM_DEVICE_MAX) {
		spin_unlock(&hmm_device_lock);
		kfree(hmm_device);
		return ERR_PTR(-EBUSY);
	}
	set_bit(hmm_device->minor, hmm_device_mask);
	spin_unlock(&hmm_device_lock);

	dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
	hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
					hmm_device->minor);
	hmm_device->device.release = hmm_device_release;
	dev_set_drvdata(&hmm_device->device, drvdata);
	hmm_device->device.class = hmm_device_class;
	device_initialize(&hmm_device->device);

	return hmm_device;
}
EXPORT_SYMBOL(hmm_device_new);

void hmm_device_put(struct hmm_device *hmm_device)
{
	put_device(&hmm_device->device);
}
EXPORT_SYMBOL(hmm_device_put);

static int __init hmm_init(void)
{
	int ret;

	ret = alloc_chrdev_region(&hmm_device_devt, 0,
				  HMM_DEVICE_MAX,
				  "hmm_device");
	if (ret)
		return ret;

	hmm_device_class = class_create(THIS_MODULE, "hmm_device");
	if (IS_ERR(hmm_device_class)) {
		unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
		return PTR_ERR(hmm_device_class);
	}
	return 0;
}

device_initcall(hmm_init);
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */