1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright 2013 Red Hat Inc. 4 * 5 * Authors: Jérôme Glisse <jglisse@redhat.com> 6 */ 7 /* 8 * Refer to include/linux/hmm.h for information about heterogeneous memory 9 * management or HMM for short. 10 */ 11 #include <linux/pagewalk.h> 12 #include <linux/hmm.h> 13 #include <linux/init.h> 14 #include <linux/rmap.h> 15 #include <linux/swap.h> 16 #include <linux/slab.h> 17 #include <linux/sched.h> 18 #include <linux/mmzone.h> 19 #include <linux/pagemap.h> 20 #include <linux/swapops.h> 21 #include <linux/hugetlb.h> 22 #include <linux/memremap.h> 23 #include <linux/sched/mm.h> 24 #include <linux/jump_label.h> 25 #include <linux/dma-mapping.h> 26 #include <linux/mmu_notifier.h> 27 #include <linux/memory_hotplug.h> 28 29 static struct mmu_notifier *hmm_alloc_notifier(struct mm_struct *mm) 30 { 31 struct hmm *hmm; 32 33 hmm = kzalloc(sizeof(*hmm), GFP_KERNEL); 34 if (!hmm) 35 return ERR_PTR(-ENOMEM); 36 37 init_waitqueue_head(&hmm->wq); 38 INIT_LIST_HEAD(&hmm->mirrors); 39 init_rwsem(&hmm->mirrors_sem); 40 INIT_LIST_HEAD(&hmm->ranges); 41 spin_lock_init(&hmm->ranges_lock); 42 hmm->notifiers = 0; 43 return &hmm->mmu_notifier; 44 } 45 46 static void hmm_free_notifier(struct mmu_notifier *mn) 47 { 48 struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); 49 50 WARN_ON(!list_empty(&hmm->ranges)); 51 WARN_ON(!list_empty(&hmm->mirrors)); 52 kfree(hmm); 53 } 54 55 static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) 56 { 57 struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); 58 struct hmm_mirror *mirror; 59 60 /* 61 * Since hmm_range_register() holds the mmget() lock hmm_release() is 62 * prevented as long as a range exists. 63 */ 64 WARN_ON(!list_empty_careful(&hmm->ranges)); 65 66 down_read(&hmm->mirrors_sem); 67 list_for_each_entry(mirror, &hmm->mirrors, list) { 68 /* 69 * Note: The driver is not allowed to trigger 70 * hmm_mirror_unregister() from this thread. 71 */ 72 if (mirror->ops->release) 73 mirror->ops->release(mirror); 74 } 75 up_read(&hmm->mirrors_sem); 76 } 77 78 static void notifiers_decrement(struct hmm *hmm) 79 { 80 unsigned long flags; 81 82 spin_lock_irqsave(&hmm->ranges_lock, flags); 83 hmm->notifiers--; 84 if (!hmm->notifiers) { 85 struct hmm_range *range; 86 87 list_for_each_entry(range, &hmm->ranges, list) { 88 if (range->valid) 89 continue; 90 range->valid = true; 91 } 92 wake_up_all(&hmm->wq); 93 } 94 spin_unlock_irqrestore(&hmm->ranges_lock, flags); 95 } 96 97 static int hmm_invalidate_range_start(struct mmu_notifier *mn, 98 const struct mmu_notifier_range *nrange) 99 { 100 struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); 101 struct hmm_mirror *mirror; 102 struct hmm_range *range; 103 unsigned long flags; 104 int ret = 0; 105 106 spin_lock_irqsave(&hmm->ranges_lock, flags); 107 hmm->notifiers++; 108 list_for_each_entry(range, &hmm->ranges, list) { 109 if (nrange->end < range->start || nrange->start >= range->end) 110 continue; 111 112 range->valid = false; 113 } 114 spin_unlock_irqrestore(&hmm->ranges_lock, flags); 115 116 if (mmu_notifier_range_blockable(nrange)) 117 down_read(&hmm->mirrors_sem); 118 else if (!down_read_trylock(&hmm->mirrors_sem)) { 119 ret = -EAGAIN; 120 goto out; 121 } 122 123 list_for_each_entry(mirror, &hmm->mirrors, list) { 124 int rc; 125 126 rc = mirror->ops->sync_cpu_device_pagetables(mirror, nrange); 127 if (rc) { 128 if (WARN_ON(mmu_notifier_range_blockable(nrange) || 129 rc != -EAGAIN)) 130 continue; 131 ret = -EAGAIN; 132 break; 133 } 134 } 135 up_read(&hmm->mirrors_sem); 136 137 out: 138 if (ret) 139 notifiers_decrement(hmm); 140 return ret; 141 } 142 143 static void hmm_invalidate_range_end(struct mmu_notifier *mn, 144 const struct mmu_notifier_range *nrange) 145 { 146 struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); 147 148 notifiers_decrement(hmm); 149 } 150 151 static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { 152 .release = hmm_release, 153 .invalidate_range_start = hmm_invalidate_range_start, 154 .invalidate_range_end = hmm_invalidate_range_end, 155 .alloc_notifier = hmm_alloc_notifier, 156 .free_notifier = hmm_free_notifier, 157 }; 158 159 /* 160 * hmm_mirror_register() - register a mirror against an mm 161 * 162 * @mirror: new mirror struct to register 163 * @mm: mm to register against 164 * Return: 0 on success, -ENOMEM if no memory, -EINVAL if invalid arguments 165 * 166 * To start mirroring a process address space, the device driver must register 167 * an HMM mirror struct. 168 * 169 * The caller cannot unregister the hmm_mirror while any ranges are 170 * registered. 171 * 172 * Callers using this function must put a call to mmu_notifier_synchronize() 173 * in their module exit functions. 174 */ 175 int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) 176 { 177 struct mmu_notifier *mn; 178 179 lockdep_assert_held_write(&mm->mmap_sem); 180 181 /* Sanity check */ 182 if (!mm || !mirror || !mirror->ops) 183 return -EINVAL; 184 185 mn = mmu_notifier_get_locked(&hmm_mmu_notifier_ops, mm); 186 if (IS_ERR(mn)) 187 return PTR_ERR(mn); 188 mirror->hmm = container_of(mn, struct hmm, mmu_notifier); 189 190 down_write(&mirror->hmm->mirrors_sem); 191 list_add(&mirror->list, &mirror->hmm->mirrors); 192 up_write(&mirror->hmm->mirrors_sem); 193 194 return 0; 195 } 196 EXPORT_SYMBOL(hmm_mirror_register); 197 198 /* 199 * hmm_mirror_unregister() - unregister a mirror 200 * 201 * @mirror: mirror struct to unregister 202 * 203 * Stop mirroring a process address space, and cleanup. 204 */ 205 void hmm_mirror_unregister(struct hmm_mirror *mirror) 206 { 207 struct hmm *hmm = mirror->hmm; 208 209 down_write(&hmm->mirrors_sem); 210 list_del(&mirror->list); 211 up_write(&hmm->mirrors_sem); 212 mmu_notifier_put(&hmm->mmu_notifier); 213 } 214 EXPORT_SYMBOL(hmm_mirror_unregister); 215 216 struct hmm_vma_walk { 217 struct hmm_range *range; 218 struct dev_pagemap *pgmap; 219 unsigned long last; 220 unsigned int flags; 221 }; 222 223 static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, 224 bool write_fault, uint64_t *pfn) 225 { 226 unsigned int flags = FAULT_FLAG_REMOTE; 227 struct hmm_vma_walk *hmm_vma_walk = walk->private; 228 struct hmm_range *range = hmm_vma_walk->range; 229 struct vm_area_struct *vma = walk->vma; 230 vm_fault_t ret; 231 232 if (!vma) 233 goto err; 234 235 if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY) 236 flags |= FAULT_FLAG_ALLOW_RETRY; 237 if (write_fault) 238 flags |= FAULT_FLAG_WRITE; 239 240 ret = handle_mm_fault(vma, addr, flags); 241 if (ret & VM_FAULT_RETRY) { 242 /* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */ 243 return -EAGAIN; 244 } 245 if (ret & VM_FAULT_ERROR) 246 goto err; 247 248 return -EBUSY; 249 250 err: 251 *pfn = range->values[HMM_PFN_ERROR]; 252 return -EFAULT; 253 } 254 255 static int hmm_pfns_bad(unsigned long addr, 256 unsigned long end, 257 struct mm_walk *walk) 258 { 259 struct hmm_vma_walk *hmm_vma_walk = walk->private; 260 struct hmm_range *range = hmm_vma_walk->range; 261 uint64_t *pfns = range->pfns; 262 unsigned long i; 263 264 i = (addr - range->start) >> PAGE_SHIFT; 265 for (; addr < end; addr += PAGE_SIZE, i++) 266 pfns[i] = range->values[HMM_PFN_ERROR]; 267 268 return 0; 269 } 270 271 /* 272 * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s) 273 * @addr: range virtual start address (inclusive) 274 * @end: range virtual end address (exclusive) 275 * @fault: should we fault or not ? 276 * @write_fault: write fault ? 277 * @walk: mm_walk structure 278 * Return: 0 on success, -EBUSY after page fault, or page fault error 279 * 280 * This function will be called whenever pmd_none() or pte_none() returns true, 281 * or whenever there is no page directory covering the virtual address range. 282 */ 283 static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, 284 bool fault, bool write_fault, 285 struct mm_walk *walk) 286 { 287 struct hmm_vma_walk *hmm_vma_walk = walk->private; 288 struct hmm_range *range = hmm_vma_walk->range; 289 uint64_t *pfns = range->pfns; 290 unsigned long i; 291 292 hmm_vma_walk->last = addr; 293 i = (addr - range->start) >> PAGE_SHIFT; 294 295 if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE)) 296 return -EPERM; 297 298 for (; addr < end; addr += PAGE_SIZE, i++) { 299 pfns[i] = range->values[HMM_PFN_NONE]; 300 if (fault || write_fault) { 301 int ret; 302 303 ret = hmm_vma_do_fault(walk, addr, write_fault, 304 &pfns[i]); 305 if (ret != -EBUSY) 306 return ret; 307 } 308 } 309 310 return (fault || write_fault) ? -EBUSY : 0; 311 } 312 313 static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, 314 uint64_t pfns, uint64_t cpu_flags, 315 bool *fault, bool *write_fault) 316 { 317 struct hmm_range *range = hmm_vma_walk->range; 318 319 if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) 320 return; 321 322 /* 323 * So we not only consider the individual per page request we also 324 * consider the default flags requested for the range. The API can 325 * be used 2 ways. The first one where the HMM user coalesces 326 * multiple page faults into one request and sets flags per pfn for 327 * those faults. The second one where the HMM user wants to pre- 328 * fault a range with specific flags. For the latter one it is a 329 * waste to have the user pre-fill the pfn arrays with a default 330 * flags value. 331 */ 332 pfns = (pfns & range->pfn_flags_mask) | range->default_flags; 333 334 /* We aren't ask to do anything ... */ 335 if (!(pfns & range->flags[HMM_PFN_VALID])) 336 return; 337 /* If this is device memory then only fault if explicitly requested */ 338 if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { 339 /* Do we fault on device memory ? */ 340 if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { 341 *write_fault = pfns & range->flags[HMM_PFN_WRITE]; 342 *fault = true; 343 } 344 return; 345 } 346 347 /* If CPU page table is not valid then we need to fault */ 348 *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]); 349 /* Need to write fault ? */ 350 if ((pfns & range->flags[HMM_PFN_WRITE]) && 351 !(cpu_flags & range->flags[HMM_PFN_WRITE])) { 352 *write_fault = true; 353 *fault = true; 354 } 355 } 356 357 static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, 358 const uint64_t *pfns, unsigned long npages, 359 uint64_t cpu_flags, bool *fault, 360 bool *write_fault) 361 { 362 unsigned long i; 363 364 if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) { 365 *fault = *write_fault = false; 366 return; 367 } 368 369 *fault = *write_fault = false; 370 for (i = 0; i < npages; ++i) { 371 hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, 372 fault, write_fault); 373 if ((*write_fault)) 374 return; 375 } 376 } 377 378 static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, 379 struct mm_walk *walk) 380 { 381 struct hmm_vma_walk *hmm_vma_walk = walk->private; 382 struct hmm_range *range = hmm_vma_walk->range; 383 bool fault, write_fault; 384 unsigned long i, npages; 385 uint64_t *pfns; 386 387 i = (addr - range->start) >> PAGE_SHIFT; 388 npages = (end - addr) >> PAGE_SHIFT; 389 pfns = &range->pfns[i]; 390 hmm_range_need_fault(hmm_vma_walk, pfns, npages, 391 0, &fault, &write_fault); 392 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 393 } 394 395 static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) 396 { 397 if (pmd_protnone(pmd)) 398 return 0; 399 return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] | 400 range->flags[HMM_PFN_WRITE] : 401 range->flags[HMM_PFN_VALID]; 402 } 403 404 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 405 static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, 406 unsigned long end, uint64_t *pfns, pmd_t pmd) 407 { 408 struct hmm_vma_walk *hmm_vma_walk = walk->private; 409 struct hmm_range *range = hmm_vma_walk->range; 410 unsigned long pfn, npages, i; 411 bool fault, write_fault; 412 uint64_t cpu_flags; 413 414 npages = (end - addr) >> PAGE_SHIFT; 415 cpu_flags = pmd_to_hmm_pfn_flags(range, pmd); 416 hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags, 417 &fault, &write_fault); 418 419 if (pmd_protnone(pmd) || fault || write_fault) 420 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 421 422 pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); 423 for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { 424 if (pmd_devmap(pmd)) { 425 hmm_vma_walk->pgmap = get_dev_pagemap(pfn, 426 hmm_vma_walk->pgmap); 427 if (unlikely(!hmm_vma_walk->pgmap)) 428 return -EBUSY; 429 } 430 pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; 431 } 432 if (hmm_vma_walk->pgmap) { 433 put_dev_pagemap(hmm_vma_walk->pgmap); 434 hmm_vma_walk->pgmap = NULL; 435 } 436 hmm_vma_walk->last = end; 437 return 0; 438 } 439 #else /* CONFIG_TRANSPARENT_HUGEPAGE */ 440 /* stub to allow the code below to compile */ 441 int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, 442 unsigned long end, uint64_t *pfns, pmd_t pmd); 443 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 444 445 static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) 446 { 447 if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte)) 448 return 0; 449 return pte_write(pte) ? range->flags[HMM_PFN_VALID] | 450 range->flags[HMM_PFN_WRITE] : 451 range->flags[HMM_PFN_VALID]; 452 } 453 454 static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, 455 unsigned long end, pmd_t *pmdp, pte_t *ptep, 456 uint64_t *pfn) 457 { 458 struct hmm_vma_walk *hmm_vma_walk = walk->private; 459 struct hmm_range *range = hmm_vma_walk->range; 460 bool fault, write_fault; 461 uint64_t cpu_flags; 462 pte_t pte = *ptep; 463 uint64_t orig_pfn = *pfn; 464 465 *pfn = range->values[HMM_PFN_NONE]; 466 fault = write_fault = false; 467 468 if (pte_none(pte)) { 469 hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, 470 &fault, &write_fault); 471 if (fault || write_fault) 472 goto fault; 473 return 0; 474 } 475 476 if (!pte_present(pte)) { 477 swp_entry_t entry = pte_to_swp_entry(pte); 478 479 if (!non_swap_entry(entry)) { 480 cpu_flags = pte_to_hmm_pfn_flags(range, pte); 481 hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, 482 &fault, &write_fault); 483 if (fault || write_fault) 484 goto fault; 485 return 0; 486 } 487 488 /* 489 * This is a special swap entry, ignore migration, use 490 * device and report anything else as error. 491 */ 492 if (is_device_private_entry(entry)) { 493 cpu_flags = range->flags[HMM_PFN_VALID] | 494 range->flags[HMM_PFN_DEVICE_PRIVATE]; 495 cpu_flags |= is_write_device_private_entry(entry) ? 496 range->flags[HMM_PFN_WRITE] : 0; 497 hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, 498 &fault, &write_fault); 499 if (fault || write_fault) 500 goto fault; 501 *pfn = hmm_device_entry_from_pfn(range, 502 swp_offset(entry)); 503 *pfn |= cpu_flags; 504 return 0; 505 } 506 507 if (is_migration_entry(entry)) { 508 if (fault || write_fault) { 509 pte_unmap(ptep); 510 hmm_vma_walk->last = addr; 511 migration_entry_wait(walk->mm, pmdp, addr); 512 return -EBUSY; 513 } 514 return 0; 515 } 516 517 /* Report error for everything else */ 518 *pfn = range->values[HMM_PFN_ERROR]; 519 return -EFAULT; 520 } else { 521 cpu_flags = pte_to_hmm_pfn_flags(range, pte); 522 hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, 523 &fault, &write_fault); 524 } 525 526 if (fault || write_fault) 527 goto fault; 528 529 if (pte_devmap(pte)) { 530 hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte), 531 hmm_vma_walk->pgmap); 532 if (unlikely(!hmm_vma_walk->pgmap)) 533 return -EBUSY; 534 } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) { 535 *pfn = range->values[HMM_PFN_SPECIAL]; 536 return -EFAULT; 537 } 538 539 *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags; 540 return 0; 541 542 fault: 543 if (hmm_vma_walk->pgmap) { 544 put_dev_pagemap(hmm_vma_walk->pgmap); 545 hmm_vma_walk->pgmap = NULL; 546 } 547 pte_unmap(ptep); 548 /* Fault any virtual address we were asked to fault */ 549 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 550 } 551 552 static int hmm_vma_walk_pmd(pmd_t *pmdp, 553 unsigned long start, 554 unsigned long end, 555 struct mm_walk *walk) 556 { 557 struct hmm_vma_walk *hmm_vma_walk = walk->private; 558 struct hmm_range *range = hmm_vma_walk->range; 559 uint64_t *pfns = range->pfns; 560 unsigned long addr = start, i; 561 pte_t *ptep; 562 pmd_t pmd; 563 564 again: 565 pmd = READ_ONCE(*pmdp); 566 if (pmd_none(pmd)) 567 return hmm_vma_walk_hole(start, end, walk); 568 569 if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { 570 bool fault, write_fault; 571 unsigned long npages; 572 uint64_t *pfns; 573 574 i = (addr - range->start) >> PAGE_SHIFT; 575 npages = (end - addr) >> PAGE_SHIFT; 576 pfns = &range->pfns[i]; 577 578 hmm_range_need_fault(hmm_vma_walk, pfns, npages, 579 0, &fault, &write_fault); 580 if (fault || write_fault) { 581 hmm_vma_walk->last = addr; 582 pmd_migration_entry_wait(walk->mm, pmdp); 583 return -EBUSY; 584 } 585 return 0; 586 } else if (!pmd_present(pmd)) 587 return hmm_pfns_bad(start, end, walk); 588 589 if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { 590 /* 591 * No need to take pmd_lock here, even if some other thread 592 * is splitting the huge pmd we will get that event through 593 * mmu_notifier callback. 594 * 595 * So just read pmd value and check again it's a transparent 596 * huge or device mapping one and compute corresponding pfn 597 * values. 598 */ 599 pmd = pmd_read_atomic(pmdp); 600 barrier(); 601 if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) 602 goto again; 603 604 i = (addr - range->start) >> PAGE_SHIFT; 605 return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd); 606 } 607 608 /* 609 * We have handled all the valid cases above ie either none, migration, 610 * huge or transparent huge. At this point either it is a valid pmd 611 * entry pointing to pte directory or it is a bad pmd that will not 612 * recover. 613 */ 614 if (pmd_bad(pmd)) 615 return hmm_pfns_bad(start, end, walk); 616 617 ptep = pte_offset_map(pmdp, addr); 618 i = (addr - range->start) >> PAGE_SHIFT; 619 for (; addr < end; addr += PAGE_SIZE, ptep++, i++) { 620 int r; 621 622 r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]); 623 if (r) { 624 /* hmm_vma_handle_pte() did unmap pte directory */ 625 hmm_vma_walk->last = addr; 626 return r; 627 } 628 } 629 if (hmm_vma_walk->pgmap) { 630 /* 631 * We do put_dev_pagemap() here and not in hmm_vma_handle_pte() 632 * so that we can leverage get_dev_pagemap() optimization which 633 * will not re-take a reference on a pgmap if we already have 634 * one. 635 */ 636 put_dev_pagemap(hmm_vma_walk->pgmap); 637 hmm_vma_walk->pgmap = NULL; 638 } 639 pte_unmap(ptep - 1); 640 641 hmm_vma_walk->last = addr; 642 return 0; 643 } 644 645 #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \ 646 defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) 647 static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) 648 { 649 if (!pud_present(pud)) 650 return 0; 651 return pud_write(pud) ? range->flags[HMM_PFN_VALID] | 652 range->flags[HMM_PFN_WRITE] : 653 range->flags[HMM_PFN_VALID]; 654 } 655 656 static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, 657 struct mm_walk *walk) 658 { 659 struct hmm_vma_walk *hmm_vma_walk = walk->private; 660 struct hmm_range *range = hmm_vma_walk->range; 661 unsigned long addr = start, next; 662 pmd_t *pmdp; 663 pud_t pud; 664 int ret; 665 666 again: 667 pud = READ_ONCE(*pudp); 668 if (pud_none(pud)) 669 return hmm_vma_walk_hole(start, end, walk); 670 671 if (pud_huge(pud) && pud_devmap(pud)) { 672 unsigned long i, npages, pfn; 673 uint64_t *pfns, cpu_flags; 674 bool fault, write_fault; 675 676 if (!pud_present(pud)) 677 return hmm_vma_walk_hole(start, end, walk); 678 679 i = (addr - range->start) >> PAGE_SHIFT; 680 npages = (end - addr) >> PAGE_SHIFT; 681 pfns = &range->pfns[i]; 682 683 cpu_flags = pud_to_hmm_pfn_flags(range, pud); 684 hmm_range_need_fault(hmm_vma_walk, pfns, npages, 685 cpu_flags, &fault, &write_fault); 686 if (fault || write_fault) 687 return hmm_vma_walk_hole_(addr, end, fault, 688 write_fault, walk); 689 690 pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); 691 for (i = 0; i < npages; ++i, ++pfn) { 692 hmm_vma_walk->pgmap = get_dev_pagemap(pfn, 693 hmm_vma_walk->pgmap); 694 if (unlikely(!hmm_vma_walk->pgmap)) 695 return -EBUSY; 696 pfns[i] = hmm_device_entry_from_pfn(range, pfn) | 697 cpu_flags; 698 } 699 if (hmm_vma_walk->pgmap) { 700 put_dev_pagemap(hmm_vma_walk->pgmap); 701 hmm_vma_walk->pgmap = NULL; 702 } 703 hmm_vma_walk->last = end; 704 return 0; 705 } 706 707 split_huge_pud(walk->vma, pudp, addr); 708 if (pud_none(*pudp)) 709 goto again; 710 711 pmdp = pmd_offset(pudp, addr); 712 do { 713 next = pmd_addr_end(addr, end); 714 ret = hmm_vma_walk_pmd(pmdp, addr, next, walk); 715 if (ret) 716 return ret; 717 } while (pmdp++, addr = next, addr != end); 718 719 return 0; 720 } 721 #else 722 #define hmm_vma_walk_pud NULL 723 #endif 724 725 #ifdef CONFIG_HUGETLB_PAGE 726 static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, 727 unsigned long start, unsigned long end, 728 struct mm_walk *walk) 729 { 730 unsigned long addr = start, i, pfn; 731 struct hmm_vma_walk *hmm_vma_walk = walk->private; 732 struct hmm_range *range = hmm_vma_walk->range; 733 struct vm_area_struct *vma = walk->vma; 734 uint64_t orig_pfn, cpu_flags; 735 bool fault, write_fault; 736 spinlock_t *ptl; 737 pte_t entry; 738 int ret = 0; 739 740 ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); 741 entry = huge_ptep_get(pte); 742 743 i = (start - range->start) >> PAGE_SHIFT; 744 orig_pfn = range->pfns[i]; 745 range->pfns[i] = range->values[HMM_PFN_NONE]; 746 cpu_flags = pte_to_hmm_pfn_flags(range, entry); 747 fault = write_fault = false; 748 hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, 749 &fault, &write_fault); 750 if (fault || write_fault) { 751 ret = -ENOENT; 752 goto unlock; 753 } 754 755 pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); 756 for (; addr < end; addr += PAGE_SIZE, i++, pfn++) 757 range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | 758 cpu_flags; 759 hmm_vma_walk->last = end; 760 761 unlock: 762 spin_unlock(ptl); 763 764 if (ret == -ENOENT) 765 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 766 767 return ret; 768 } 769 #else 770 #define hmm_vma_walk_hugetlb_entry NULL 771 #endif /* CONFIG_HUGETLB_PAGE */ 772 773 static void hmm_pfns_clear(struct hmm_range *range, 774 uint64_t *pfns, 775 unsigned long addr, 776 unsigned long end) 777 { 778 for (; addr < end; addr += PAGE_SIZE, pfns++) 779 *pfns = range->values[HMM_PFN_NONE]; 780 } 781 782 /* 783 * hmm_range_register() - start tracking change to CPU page table over a range 784 * @range: range 785 * @mm: the mm struct for the range of virtual address 786 * 787 * Return: 0 on success, -EFAULT if the address space is no longer valid 788 * 789 * Track updates to the CPU page table see include/linux/hmm.h 790 */ 791 int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror) 792 { 793 struct hmm *hmm = mirror->hmm; 794 unsigned long flags; 795 796 range->valid = false; 797 range->hmm = NULL; 798 799 if ((range->start & (PAGE_SIZE - 1)) || (range->end & (PAGE_SIZE - 1))) 800 return -EINVAL; 801 if (range->start >= range->end) 802 return -EINVAL; 803 804 /* Prevent hmm_release() from running while the range is valid */ 805 if (!mmget_not_zero(hmm->mmu_notifier.mm)) 806 return -EFAULT; 807 808 /* Initialize range to track CPU page table updates. */ 809 spin_lock_irqsave(&hmm->ranges_lock, flags); 810 811 range->hmm = hmm; 812 list_add(&range->list, &hmm->ranges); 813 814 /* 815 * If there are any concurrent notifiers we have to wait for them for 816 * the range to be valid (see hmm_range_wait_until_valid()). 817 */ 818 if (!hmm->notifiers) 819 range->valid = true; 820 spin_unlock_irqrestore(&hmm->ranges_lock, flags); 821 822 return 0; 823 } 824 EXPORT_SYMBOL(hmm_range_register); 825 826 /* 827 * hmm_range_unregister() - stop tracking change to CPU page table over a range 828 * @range: range 829 * 830 * Range struct is used to track updates to the CPU page table after a call to 831 * hmm_range_register(). See include/linux/hmm.h for how to use it. 832 */ 833 void hmm_range_unregister(struct hmm_range *range) 834 { 835 struct hmm *hmm = range->hmm; 836 unsigned long flags; 837 838 spin_lock_irqsave(&hmm->ranges_lock, flags); 839 list_del_init(&range->list); 840 spin_unlock_irqrestore(&hmm->ranges_lock, flags); 841 842 /* Drop reference taken by hmm_range_register() */ 843 mmput(hmm->mmu_notifier.mm); 844 845 /* 846 * The range is now invalid and the ref on the hmm is dropped, so 847 * poison the pointer. Leave other fields in place, for the caller's 848 * use. 849 */ 850 range->valid = false; 851 memset(&range->hmm, POISON_INUSE, sizeof(range->hmm)); 852 } 853 EXPORT_SYMBOL(hmm_range_unregister); 854 855 static const struct mm_walk_ops hmm_walk_ops = { 856 .pud_entry = hmm_vma_walk_pud, 857 .pmd_entry = hmm_vma_walk_pmd, 858 .pte_hole = hmm_vma_walk_hole, 859 .hugetlb_entry = hmm_vma_walk_hugetlb_entry, 860 }; 861 862 /** 863 * hmm_range_fault - try to fault some address in a virtual address range 864 * @range: range being faulted 865 * @flags: HMM_FAULT_* flags 866 * 867 * Return: the number of valid pages in range->pfns[] (from range start 868 * address), which may be zero. On error one of the following status codes 869 * can be returned: 870 * 871 * -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma 872 * (e.g., device file vma). 873 * -ENOMEM: Out of memory. 874 * -EPERM: Invalid permission (e.g., asking for write and range is read 875 * only). 876 * -EAGAIN: A page fault needs to be retried and mmap_sem was dropped. 877 * -EBUSY: The range has been invalidated and the caller needs to wait for 878 * the invalidation to finish. 879 * -EFAULT: Invalid (i.e., either no valid vma or it is illegal to access 880 * that range) number of valid pages in range->pfns[] (from 881 * range start address). 882 * 883 * This is similar to a regular CPU page fault except that it will not trigger 884 * any memory migration if the memory being faulted is not accessible by CPUs 885 * and caller does not ask for migration. 886 * 887 * On error, for one virtual address in the range, the function will mark the 888 * corresponding HMM pfn entry with an error flag. 889 */ 890 long hmm_range_fault(struct hmm_range *range, unsigned int flags) 891 { 892 const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; 893 unsigned long start = range->start, end; 894 struct hmm_vma_walk hmm_vma_walk; 895 struct hmm *hmm = range->hmm; 896 struct vm_area_struct *vma; 897 int ret; 898 899 lockdep_assert_held(&hmm->mmu_notifier.mm->mmap_sem); 900 901 do { 902 /* If range is no longer valid force retry. */ 903 if (!range->valid) 904 return -EBUSY; 905 906 vma = find_vma(hmm->mmu_notifier.mm, start); 907 if (vma == NULL || (vma->vm_flags & device_vma)) 908 return -EFAULT; 909 910 if (!(vma->vm_flags & VM_READ)) { 911 /* 912 * If vma do not allow read access, then assume that it 913 * does not allow write access, either. HMM does not 914 * support architecture that allow write without read. 915 */ 916 hmm_pfns_clear(range, range->pfns, 917 range->start, range->end); 918 return -EPERM; 919 } 920 921 hmm_vma_walk.pgmap = NULL; 922 hmm_vma_walk.last = start; 923 hmm_vma_walk.flags = flags; 924 hmm_vma_walk.range = range; 925 end = min(range->end, vma->vm_end); 926 927 walk_page_range(vma->vm_mm, start, end, &hmm_walk_ops, 928 &hmm_vma_walk); 929 930 do { 931 ret = walk_page_range(vma->vm_mm, start, end, 932 &hmm_walk_ops, &hmm_vma_walk); 933 start = hmm_vma_walk.last; 934 935 /* Keep trying while the range is valid. */ 936 } while (ret == -EBUSY && range->valid); 937 938 if (ret) { 939 unsigned long i; 940 941 i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; 942 hmm_pfns_clear(range, &range->pfns[i], 943 hmm_vma_walk.last, range->end); 944 return ret; 945 } 946 start = end; 947 948 } while (start < range->end); 949 950 return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; 951 } 952 EXPORT_SYMBOL(hmm_range_fault); 953 954 /** 955 * hmm_range_dma_map - hmm_range_fault() and dma map page all in one. 956 * @range: range being faulted 957 * @device: device to map page to 958 * @daddrs: array of dma addresses for the mapped pages 959 * @flags: HMM_FAULT_* 960 * 961 * Return: the number of pages mapped on success (including zero), or any 962 * status return from hmm_range_fault() otherwise. 963 */ 964 long hmm_range_dma_map(struct hmm_range *range, struct device *device, 965 dma_addr_t *daddrs, unsigned int flags) 966 { 967 unsigned long i, npages, mapped; 968 long ret; 969 970 ret = hmm_range_fault(range, flags); 971 if (ret <= 0) 972 return ret ? ret : -EBUSY; 973 974 npages = (range->end - range->start) >> PAGE_SHIFT; 975 for (i = 0, mapped = 0; i < npages; ++i) { 976 enum dma_data_direction dir = DMA_TO_DEVICE; 977 struct page *page; 978 979 /* 980 * FIXME need to update DMA API to provide invalid DMA address 981 * value instead of a function to test dma address value. This 982 * would remove lot of dumb code duplicated accross many arch. 983 * 984 * For now setting it to 0 here is good enough as the pfns[] 985 * value is what is use to check what is valid and what isn't. 986 */ 987 daddrs[i] = 0; 988 989 page = hmm_device_entry_to_page(range, range->pfns[i]); 990 if (page == NULL) 991 continue; 992 993 /* Check if range is being invalidated */ 994 if (!range->valid) { 995 ret = -EBUSY; 996 goto unmap; 997 } 998 999 /* If it is read and write than map bi-directional. */ 1000 if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) 1001 dir = DMA_BIDIRECTIONAL; 1002 1003 daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir); 1004 if (dma_mapping_error(device, daddrs[i])) { 1005 ret = -EFAULT; 1006 goto unmap; 1007 } 1008 1009 mapped++; 1010 } 1011 1012 return mapped; 1013 1014 unmap: 1015 for (npages = i, i = 0; (i < npages) && mapped; ++i) { 1016 enum dma_data_direction dir = DMA_TO_DEVICE; 1017 struct page *page; 1018 1019 page = hmm_device_entry_to_page(range, range->pfns[i]); 1020 if (page == NULL) 1021 continue; 1022 1023 if (dma_mapping_error(device, daddrs[i])) 1024 continue; 1025 1026 /* If it is read and write than map bi-directional. */ 1027 if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) 1028 dir = DMA_BIDIRECTIONAL; 1029 1030 dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); 1031 mapped--; 1032 } 1033 1034 return ret; 1035 } 1036 EXPORT_SYMBOL(hmm_range_dma_map); 1037 1038 /** 1039 * hmm_range_dma_unmap() - unmap range of that was map with hmm_range_dma_map() 1040 * @range: range being unmapped 1041 * @device: device against which dma map was done 1042 * @daddrs: dma address of mapped pages 1043 * @dirty: dirty page if it had the write flag set 1044 * Return: number of page unmapped on success, -EINVAL otherwise 1045 * 1046 * Note that caller MUST abide by mmu notifier or use HMM mirror and abide 1047 * to the sync_cpu_device_pagetables() callback so that it is safe here to 1048 * call set_page_dirty(). Caller must also take appropriate locks to avoid 1049 * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress. 1050 */ 1051 long hmm_range_dma_unmap(struct hmm_range *range, 1052 struct device *device, 1053 dma_addr_t *daddrs, 1054 bool dirty) 1055 { 1056 unsigned long i, npages; 1057 long cpages = 0; 1058 1059 /* Sanity check. */ 1060 if (range->end <= range->start) 1061 return -EINVAL; 1062 if (!daddrs) 1063 return -EINVAL; 1064 if (!range->pfns) 1065 return -EINVAL; 1066 1067 npages = (range->end - range->start) >> PAGE_SHIFT; 1068 for (i = 0; i < npages; ++i) { 1069 enum dma_data_direction dir = DMA_TO_DEVICE; 1070 struct page *page; 1071 1072 page = hmm_device_entry_to_page(range, range->pfns[i]); 1073 if (page == NULL) 1074 continue; 1075 1076 /* If it is read and write than map bi-directional. */ 1077 if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) { 1078 dir = DMA_BIDIRECTIONAL; 1079 1080 /* 1081 * See comments in function description on why it is 1082 * safe here to call set_page_dirty() 1083 */ 1084 if (dirty) 1085 set_page_dirty(page); 1086 } 1087 1088 /* Unmap and clear pfns/dma address */ 1089 dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); 1090 range->pfns[i] = range->values[HMM_PFN_NONE]; 1091 /* FIXME see comments in hmm_vma_dma_map() */ 1092 daddrs[i] = 0; 1093 cpages++; 1094 } 1095 1096 return cpages; 1097 } 1098 EXPORT_SYMBOL(hmm_range_dma_unmap); 1099