// SPDX-License-Identifier: GPL-2.0
/*
 * This is a module to test the HMM (Heterogeneous Memory Management)
 * mirror and zone device private memory migration APIs of the kernel.
 * Userspace programs can register with the driver to mirror their own address
 * space and can use the device to read/write any valid virtual address.
 */
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/memremap.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/delay.h>
#include <linux/pagemap.h>
#include <linux/hmm.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/sched/mm.h>
#include <linux/platform_device.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>

#include "test_hmm_uapi.h"

#define DMIRROR_NDEVICES		2
#define DMIRROR_RANGE_FAULT_TIMEOUT	1000
#define DEVMEM_CHUNK_SIZE		(256 * 1024 * 1024U)
#define DEVMEM_CHUNKS_RESERVE		16
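/*
 * Userspace drives this module through the ioctls declared in
 * test_hmm_uapi.h (HMM_DMIRROR_READ, HMM_DMIRROR_WRITE, HMM_DMIRROR_MIGRATE,
 * HMM_DMIRROR_EXCLUSIVE, HMM_DMIRROR_CHECK_EXCLUSIVE and
 * HMM_DMIRROR_SNAPSHOT), all of which take a struct hmm_dmirror_cmd.
 * A minimal sketch of the read path, assuming the test harness has created
 * a /dev/hmm_dmirror0 node for the first device (this module only registers
 * the char device region; node creation and error handling are omitted):
 *
 *	long psz = sysconf(_SC_PAGESIZE);
 *	void *buf = mmap(NULL, 16 * psz, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	void *out = malloc(16 * psz);
 *	struct hmm_dmirror_cmd cmd = {
 *		.addr = (uintptr_t)buf,	// must be page aligned
 *		.ptr = (uintptr_t)out,	// where the mirrored data is returned
 *		.npages = 16,
 *	};
 *	int fd = open("/dev/hmm_dmirror0", O_RDWR);
 *
 *	memset(buf, 0xab, 16 * psz);
 *	ioctl(fd, HMM_DMIRROR_READ, &cmd);
 *	// on success "out" holds the data read through the mirror and
 *	// cmd.cpages / cmd.faults report how the range was resolved
 *
 * This is only an illustration of the interface exercised by the HMM
 * selftests, not part of the module itself.
 */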
static const struct dev_pagemap_ops dmirror_devmem_ops;
static const struct mmu_interval_notifier_ops dmirror_min_ops;
static dev_t dmirror_dev;

struct dmirror_device;

struct dmirror_bounce {
	void			*ptr;
	unsigned long		size;
	unsigned long		addr;
	unsigned long		cpages;
};

#define DPT_XA_TAG_ATOMIC 1UL
#define DPT_XA_TAG_WRITE 3UL

/*
 * Data structure to track address ranges and register for mmu interval
 * notifier updates.
 */
struct dmirror_interval {
	struct mmu_interval_notifier	notifier;
	struct dmirror			*dmirror;
};

/*
 * Data attached to the open device file.
 * Note that it might be shared after a fork().
 */
struct dmirror {
	struct dmirror_device		*mdevice;
	struct xarray			pt;
	struct mmu_interval_notifier	notifier;
	struct mutex			mutex;
};

/*
 * ZONE_DEVICE pages for migration and simulating device memory.
 */
struct dmirror_chunk {
	struct dev_pagemap	pagemap;
	struct dmirror_device	*mdevice;
};

/*
 * Per device data.
 */
struct dmirror_device {
	struct cdev		cdevice;
	struct hmm_devmem	*devmem;

	unsigned int		devmem_capacity;
	unsigned int		devmem_count;
	struct dmirror_chunk	**devmem_chunks;
	struct mutex		devmem_lock;	/* protects the above */

	unsigned long		calloc;
	unsigned long		cfree;
	struct page		*free_pages;
	spinlock_t		lock;		/* protects the above */
};

static struct dmirror_device dmirror_devices[DMIRROR_NDEVICES];

static int dmirror_bounce_init(struct dmirror_bounce *bounce,
			       unsigned long addr,
			       unsigned long size)
{
	bounce->addr = addr;
	bounce->size = size;
	bounce->cpages = 0;
	bounce->ptr = vmalloc(size);
	if (!bounce->ptr)
		return -ENOMEM;
	return 0;
}

static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
{
	vfree(bounce->ptr);
}

static int dmirror_fops_open(struct inode *inode, struct file *filp)
{
	struct cdev *cdev = inode->i_cdev;
	struct dmirror *dmirror;
	int ret;

	/* Mirror this process address space */
	dmirror = kzalloc(sizeof(*dmirror), GFP_KERNEL);
	if (dmirror == NULL)
		return -ENOMEM;

	dmirror->mdevice = container_of(cdev, struct dmirror_device, cdevice);
	mutex_init(&dmirror->mutex);
	xa_init(&dmirror->pt);

	ret = mmu_interval_notifier_insert(&dmirror->notifier, current->mm,
				0, ULONG_MAX & PAGE_MASK, &dmirror_min_ops);
	if (ret) {
		kfree(dmirror);
		return ret;
	}

	filp->private_data = dmirror;
	return 0;
}

static int dmirror_fops_release(struct inode *inode, struct file *filp)
{
	struct dmirror *dmirror = filp->private_data;

	mmu_interval_notifier_remove(&dmirror->notifier);
	xa_destroy(&dmirror->pt);
	kfree(dmirror);
	return 0;
}

static struct dmirror_device *dmirror_page_to_device(struct page *page)
{
	return container_of(page->pgmap, struct dmirror_chunk,
			    pagemap)->mdevice;
}

static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range)
{
	unsigned long *pfns = range->hmm_pfns;
	unsigned long pfn;

	for (pfn = (range->start >> PAGE_SHIFT);
	     pfn < (range->end >> PAGE_SHIFT);
	     pfn++, pfns++) {
		struct page *page;
		void *entry;

		/*
		 * Since we asked for hmm_range_fault() to populate pages,
		 * it shouldn't return an error entry on success.
		 */
		WARN_ON(*pfns & HMM_PFN_ERROR);
		WARN_ON(!(*pfns & HMM_PFN_VALID));

		page = hmm_pfn_to_page(*pfns);
		WARN_ON(!page);

		entry = page;
		if (*pfns & HMM_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		else if (WARN_ON(range->default_flags & HMM_PFN_WRITE))
			return -EFAULT;
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry))
			return xa_err(entry);
	}

	return 0;
}
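/*
 * The dmirror->pt XArray is the simulated device page table: it maps a
 * virtual PFN (address >> PAGE_SHIFT) to the struct page backing that
 * address, with write permission encoded as an XArray pointer tag.
 * Lookups elsewhere in this file decode both pieces, e.g.:
 *
 *	entry = xa_load(&dmirror->pt, pfn);
 *	page = xa_untag_pointer(entry);
 *	writable = xa_pointer_tag(entry) == DPT_XA_TAG_WRITE;
 *
 * (Illustrative only; dmirror_do_read()/dmirror_do_write() below open-code
 * exactly this pattern.)
 */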
static void dmirror_do_update(struct dmirror *dmirror, unsigned long start,
			      unsigned long end)
{
	unsigned long pfn;
	void *entry;

	/*
	 * The XArray doesn't hold references to pages since it relies on
	 * the mmu notifier to clear page pointers when they become stale.
	 * Therefore, it is OK to just clear the entry.
	 */
	xa_for_each_range(&dmirror->pt, pfn, entry, start >> PAGE_SHIFT,
			  end >> PAGE_SHIFT)
		xa_erase(&dmirror->pt, pfn);
}

static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct dmirror *dmirror = container_of(mni, struct dmirror, notifier);

	/*
	 * Ignore invalidation callbacks for device private pages since
	 * the invalidation is handled as part of the migration process.
	 */
	if (range->event == MMU_NOTIFY_MIGRATE &&
	    range->owner == dmirror->mdevice)
		return true;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	mmu_interval_set_seq(mni, cur_seq);
	dmirror_do_update(dmirror, range->start, range->end);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_min_ops = {
	.invalidate = dmirror_interval_invalidate,
};

static int dmirror_range_fault(struct dmirror *dmirror,
			       struct hmm_range *range)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	int ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);
		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	ret = dmirror_do_fault(dmirror, range);

	mutex_unlock(&dmirror->mutex);
out:
	return ret;
}
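/*
 * dmirror_range_fault() above follows the pattern every hmm_range_fault()
 * user is expected to implement: sample the notifier sequence with
 * mmu_interval_read_begin(), fault the range under mmap_read_lock(), then
 * take the driver lock and use mmu_interval_read_retry() to detect a
 * concurrent invalidation before committing the result to the device page
 * table. Any retry simply loops until the timeout expires.
 */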
static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
			 unsigned long end, bool write)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long addr;
	unsigned long pfns[64];
	struct hmm_range range = {
		.notifier = &dmirror->notifier,
		.hmm_pfns = pfns,
		.pfn_flags_mask = 0,
		.default_flags =
			HMM_PFN_REQ_FAULT | (write ? HMM_PFN_REQ_WRITE : 0),
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return 0;

	for (addr = start; addr < end; addr = range.end) {
		range.start = addr;
		range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);

		ret = dmirror_range_fault(dmirror, &range);
		if (ret)
			break;
	}

	mmput(mm);
	return ret;
}

static int dmirror_do_read(struct dmirror *dmirror, unsigned long start,
			   unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;
		void *tmp;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page)
			return -ENOENT;

		tmp = kmap(page);
		memcpy(ptr, tmp, PAGE_SIZE);
		kunmap(page);

		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}

static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_read(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, false);
		if (ret)
			break;
		cmd->faults++;
	}

	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

static int dmirror_do_write(struct dmirror *dmirror, unsigned long start,
			    unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;
		void *tmp;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page || xa_pointer_tag(entry) != DPT_XA_TAG_WRITE)
			return -ENOENT;

		tmp = kmap(page);
		memcpy(tmp, ptr, PAGE_SIZE);
		kunmap(page);

		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}

static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	if (copy_from_user(bounce.ptr, u64_to_user_ptr(cmd->ptr),
			   bounce.size)) {
		ret = -EFAULT;
		goto fini;
	}

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_write(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, true);
		if (ret)
			break;
		cmd->faults++;
	}

fini:
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}
static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
				   struct page **ppage)
{
	struct dmirror_chunk *devmem;
	struct resource *res;
	unsigned long pfn;
	unsigned long pfn_first;
	unsigned long pfn_last;
	void *ptr;

	devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
	if (!devmem)
		return false;

	res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
				      "hmm_dmirror");
	if (IS_ERR(res))
		goto err_devmem;

	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
	devmem->pagemap.range.start = res->start;
	devmem->pagemap.range.end = res->end;
	devmem->pagemap.nr_range = 1;
	devmem->pagemap.ops = &dmirror_devmem_ops;
	devmem->pagemap.owner = mdevice;

	mutex_lock(&mdevice->devmem_lock);

	if (mdevice->devmem_count == mdevice->devmem_capacity) {
		struct dmirror_chunk **new_chunks;
		unsigned int new_capacity;

		new_capacity = mdevice->devmem_capacity +
				DEVMEM_CHUNKS_RESERVE;
		new_chunks = krealloc(mdevice->devmem_chunks,
				sizeof(new_chunks[0]) * new_capacity,
				GFP_KERNEL);
		if (!new_chunks)
			goto err_release;
		mdevice->devmem_capacity = new_capacity;
		mdevice->devmem_chunks = new_chunks;
	}

	ptr = memremap_pages(&devmem->pagemap, numa_node_id());
	if (IS_ERR(ptr))
		goto err_release;

	devmem->mdevice = mdevice;
	pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT;
	pfn_last = pfn_first + (range_len(&devmem->pagemap.range) >> PAGE_SHIFT);
	mdevice->devmem_chunks[mdevice->devmem_count++] = devmem;

	mutex_unlock(&mdevice->devmem_lock);

	pr_info("added new %u MB chunk (total %u chunks, %u MB) PFNs [0x%lx 0x%lx)\n",
		DEVMEM_CHUNK_SIZE / (1024 * 1024),
		mdevice->devmem_count,
		mdevice->devmem_count * (DEVMEM_CHUNK_SIZE / (1024 * 1024)),
		pfn_first, pfn_last);

	spin_lock(&mdevice->lock);
	for (pfn = pfn_first; pfn < pfn_last; pfn++) {
		struct page *page = pfn_to_page(pfn);

		page->zone_device_data = mdevice->free_pages;
		mdevice->free_pages = page;
	}
	if (ppage) {
		*ppage = mdevice->free_pages;
		mdevice->free_pages = (*ppage)->zone_device_data;
		mdevice->calloc++;
	}
	spin_unlock(&mdevice->lock);

	return true;

err_release:
	mutex_unlock(&mdevice->devmem_lock);
	release_mem_region(devmem->pagemap.range.start,
			   range_len(&devmem->pagemap.range));
err_devmem:
	kfree(devmem);

	return false;
}
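/*
 * Because this is a fake device, each ZONE_DEVICE private page handed out
 * below is paired with an ordinary system page via page->zone_device_data:
 * the system page is what actually stores the "device memory" contents.
 * Once a page has been migrated, that system page's own zone_device_data in
 * turn points back to the struct dmirror mirroring it (see
 * dmirror_migrate_alloc_and_copy() and dmirror_devmem_fault()).
 */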
static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
{
	struct page *dpage = NULL;
	struct page *rpage;

	/*
	 * This is a fake device so we alloc real system memory to store
	 * our device memory.
	 */
	rpage = alloc_page(GFP_HIGHUSER);
	if (!rpage)
		return NULL;

	spin_lock(&mdevice->lock);

	if (mdevice->free_pages) {
		dpage = mdevice->free_pages;
		mdevice->free_pages = dpage->zone_device_data;
		mdevice->calloc++;
		spin_unlock(&mdevice->lock);
	} else {
		spin_unlock(&mdevice->lock);
		if (!dmirror_allocate_chunk(mdevice, &dpage))
			goto error;
	}

	dpage->zone_device_data = rpage;
	lock_page(dpage);
	return dpage;

error:
	__free_page(rpage);
	return NULL;
}

static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
					   struct dmirror *dmirror)
{
	struct dmirror_device *mdevice = dmirror->mdevice;
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long addr;

	for (addr = args->start; addr < args->end; addr += PAGE_SIZE,
						   src++, dst++) {
		struct page *spage;
		struct page *dpage;
		struct page *rpage;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		/*
		 * Note that spage might be NULL which is OK since it is an
		 * unallocated pte_none() or read-only zero page.
		 */
		spage = migrate_pfn_to_page(*src);

		dpage = dmirror_devmem_alloc_page(mdevice);
		if (!dpage)
			continue;

		rpage = dpage->zone_device_data;
		if (spage)
			copy_highpage(rpage, spage);
		else
			clear_highpage(rpage);

		/*
		 * Normally, a device would use the page->zone_device_data to
		 * point to the mirror but here we use it to hold the page for
		 * the simulated device memory and that page holds the pointer
		 * to the mirror.
		 */
		rpage->zone_device_data = dmirror;

		*dst = migrate_pfn(page_to_pfn(dpage));
		if ((*src & MIGRATE_PFN_WRITE) ||
		    (!spage && args->vma->vm_flags & VM_WRITE))
			*dst |= MIGRATE_PFN_WRITE;
	}
}

static int dmirror_check_atomic(struct dmirror *dmirror, unsigned long start,
				unsigned long end)
{
	unsigned long pfn;

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;

		entry = xa_load(&dmirror->pt, pfn);
		if (xa_pointer_tag(entry) == DPT_XA_TAG_ATOMIC)
			return -EPERM;
	}

	return 0;
}

static int dmirror_atomic_map(unsigned long start, unsigned long end,
			      struct page **pages, struct dmirror *dmirror)
{
	unsigned long pfn, mapped = 0;
	int i;

	/* Map the migrated pages into the device's page tables. */
	mutex_lock(&dmirror->mutex);

	for (i = 0, pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++, i++) {
		void *entry;

		if (!pages[i])
			continue;

		entry = pages[i];
		entry = xa_tag_pointer(entry, DPT_XA_TAG_ATOMIC);
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry)) {
			mutex_unlock(&dmirror->mutex);
			return xa_err(entry);
		}

		mapped++;
	}

	mutex_unlock(&dmirror->mutex);
	return mapped;
}
static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
					    struct dmirror *dmirror)
{
	unsigned long start = args->start;
	unsigned long end = args->end;
	const unsigned long *src = args->src;
	const unsigned long *dst = args->dst;
	unsigned long pfn;

	/* Map the migrated pages into the device's page tables. */
	mutex_lock(&dmirror->mutex);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++,
								src++, dst++) {
		struct page *dpage;
		void *entry;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = migrate_pfn_to_page(*dst);
		if (!dpage)
			continue;

		/*
		 * Store the page that holds the data so the page table
		 * doesn't have to deal with ZONE_DEVICE private pages.
		 */
		entry = dpage->zone_device_data;
		if (*dst & MIGRATE_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry)) {
			mutex_unlock(&dmirror->mutex);
			return xa_err(entry);
		}
	}

	mutex_unlock(&dmirror->mutex);
	return 0;
}

static int dmirror_exclusive(struct dmirror *dmirror,
			     struct hmm_dmirror_cmd *cmd)
{
	unsigned long start, end, addr;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	struct mm_struct *mm = dmirror->notifier.mm;
	struct page *pages[64];
	struct dmirror_bounce bounce;
	unsigned long next;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	mmap_read_lock(mm);
	for (addr = start; addr < end; addr = next) {
		unsigned long mapped = 0;
		int i;

		if (end < addr + (ARRAY_SIZE(pages) << PAGE_SHIFT))
			next = end;
		else
			next = addr + (ARRAY_SIZE(pages) << PAGE_SHIFT);

		ret = make_device_exclusive_range(mm, addr, next, pages, NULL);
		/*
		 * Do dmirror_atomic_map() iff all pages are marked for
		 * exclusive access to avoid accessing uninitialized
		 * fields of pages.
		 */
		if (ret == (next - addr) >> PAGE_SHIFT)
			mapped = dmirror_atomic_map(addr, next, pages, dmirror);
		for (i = 0; i < ret; i++) {
			if (pages[i]) {
				unlock_page(pages[i]);
				put_page(pages[i]);
			}
		}

		if (addr + (mapped << PAGE_SHIFT) < next) {
			mmap_read_unlock(mm);
			mmput(mm);
			return -EBUSY;
		}
	}
	mmap_read_unlock(mm);
	mmput(mm);

	/* Return the migrated data for verification. */
	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	mutex_lock(&dmirror->mutex);
	ret = dmirror_do_read(dmirror, start, end, &bounce);
	mutex_unlock(&dmirror->mutex);
	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}

	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}
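/*
 * dmirror_migrate() below walks the requested range in 64-page batches and
 * drives the standard migrate_vma sequence for each batch:
 * migrate_vma_setup() collects and isolates the source pages,
 * dmirror_migrate_alloc_and_copy() allocates device private pages and copies
 * the data, migrate_vma_pages() installs the new CPU page table entries,
 * dmirror_migrate_finalize_and_map() records the pages in the mirror, and
 * migrate_vma_finalize() releases everything.
 */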
static int dmirror_migrate(struct dmirror *dmirror,
			   struct hmm_dmirror_cmd *cmd)
{
	unsigned long start, end, addr;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	struct mm_struct *mm = dmirror->notifier.mm;
	struct vm_area_struct *vma;
	unsigned long src_pfns[64];
	unsigned long dst_pfns[64];
	struct dmirror_bounce bounce;
	struct migrate_vma args;
	unsigned long next;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	mmap_read_lock(mm);
	for (addr = start; addr < end; addr = next) {
		vma = vma_lookup(mm, addr);
		if (!vma || !(vma->vm_flags & VM_READ)) {
			ret = -EINVAL;
			goto out;
		}
		next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
		if (next > vma->vm_end)
			next = vma->vm_end;

		args.vma = vma;
		args.src = src_pfns;
		args.dst = dst_pfns;
		args.start = addr;
		args.end = next;
		args.pgmap_owner = dmirror->mdevice;
		args.flags = MIGRATE_VMA_SELECT_SYSTEM;
		ret = migrate_vma_setup(&args);
		if (ret)
			goto out;

		dmirror_migrate_alloc_and_copy(&args, dmirror);
		migrate_vma_pages(&args);
		dmirror_migrate_finalize_and_map(&args, dmirror);
		migrate_vma_finalize(&args);
	}
	mmap_read_unlock(mm);
	mmput(mm);

	/* Return the migrated data for verification. */
	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	mutex_lock(&dmirror->mutex);
	ret = dmirror_do_read(dmirror, start, end, &bounce);
	mutex_unlock(&dmirror->mutex);
	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;

out:
	mmap_read_unlock(mm);
	mmput(mm);
	return ret;
}

static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
			    unsigned char *perm, unsigned long entry)
{
	struct page *page;

	if (entry & HMM_PFN_ERROR) {
		*perm = HMM_DMIRROR_PROT_ERROR;
		return;
	}
	if (!(entry & HMM_PFN_VALID)) {
		*perm = HMM_DMIRROR_PROT_NONE;
		return;
	}

	page = hmm_pfn_to_page(entry);
	if (is_device_private_page(page)) {
		/* Is the page migrated to this device or some other? */
		if (dmirror->mdevice == dmirror_page_to_device(page))
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
		else
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
	} else if (is_zero_pfn(page_to_pfn(page)))
		*perm = HMM_DMIRROR_PROT_ZERO;
	else
		*perm = HMM_DMIRROR_PROT_NONE;
	if (entry & HMM_PFN_WRITE)
		*perm |= HMM_DMIRROR_PROT_WRITE;
	else
		*perm |= HMM_DMIRROR_PROT_READ;
	if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PMD_SHIFT)
		*perm |= HMM_DMIRROR_PROT_PMD;
	else if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PUD_SHIFT)
		*perm |= HMM_DMIRROR_PROT_PUD;
}
static bool dmirror_snapshot_invalidate(struct mmu_interval_notifier *mni,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct dmirror_interval *dmi =
		container_of(mni, struct dmirror_interval, notifier);
	struct dmirror *dmirror = dmi->dmirror;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	/*
	 * Snapshots only need to set the sequence number since any
	 * invalidation in the interval invalidates the whole snapshot.
	 */
	mmu_interval_set_seq(mni, cur_seq);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_mrn_ops = {
	.invalidate = dmirror_snapshot_invalidate,
};

static int dmirror_range_snapshot(struct dmirror *dmirror,
				  struct hmm_range *range,
				  unsigned char *perm)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	struct dmirror_interval notifier;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	unsigned long i;
	unsigned long n;
	int ret = 0;

	notifier.dmirror = dmirror;
	range->notifier = &notifier.notifier;

	ret = mmu_interval_notifier_insert(range->notifier, mm,
			range->start, range->end - range->start,
			&dmirror_mrn_ops);
	if (ret)
		return ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);

		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	n = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0; i < n; i++)
		dmirror_mkentry(dmirror, range, perm + i, range->hmm_pfns[i]);

	mutex_unlock(&dmirror->mutex);
out:
	mmu_interval_notifier_remove(range->notifier);
	return ret;
}
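/*
 * dmirror_snapshot() below fills one permission byte per page for userspace
 * to inspect. A minimal sketch of how a test might consume it, assuming a
 * /dev/hmm_dmirror0 node and a page-aligned, 16-page buffer "buf"
 * (illustrative only, not part of this module):
 *
 *	unsigned char perm[16];
 *	struct hmm_dmirror_cmd cmd = {
 *		.addr = (uintptr_t)buf,
 *		.ptr = (uintptr_t)perm,
 *		.npages = 16,
 *	};
 *
 *	if (ioctl(fd, HMM_DMIRROR_SNAPSHOT, &cmd) == 0 &&
 *	    (perm[0] & HMM_DMIRROR_PROT_WRITE))
 *		printf("page 0 is mapped writable\n");
 */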
static int dmirror_snapshot(struct dmirror *dmirror,
			    struct hmm_dmirror_cmd *cmd)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	unsigned long addr;
	unsigned long next;
	unsigned long pfns[64];
	unsigned char perm[64];
	char __user *uptr;
	struct hmm_range range = {
		.hmm_pfns = pfns,
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	/*
	 * Register a temporary notifier to detect invalidations even if it
	 * overlaps with other mmu_interval_notifiers.
	 */
	uptr = u64_to_user_ptr(cmd->ptr);
	for (addr = start; addr < end; addr = next) {
		unsigned long n;

		next = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);
		range.start = addr;
		range.end = next;

		ret = dmirror_range_snapshot(dmirror, &range, perm);
		if (ret)
			break;

		n = (range.end - range.start) >> PAGE_SHIFT;
		if (copy_to_user(uptr, perm, n)) {
			ret = -EFAULT;
			break;
		}

		cmd->cpages += n;
		uptr += n;
	}
	mmput(mm);

	return ret;
}

static long dmirror_fops_unlocked_ioctl(struct file *filp,
					unsigned int command,
					unsigned long arg)
{
	void __user *uarg = (void __user *)arg;
	struct hmm_dmirror_cmd cmd;
	struct dmirror *dmirror;
	int ret;

	dmirror = filp->private_data;
	if (!dmirror)
		return -EINVAL;

	if (copy_from_user(&cmd, uarg, sizeof(cmd)))
		return -EFAULT;

	if (cmd.addr & ~PAGE_MASK)
		return -EINVAL;
	if (cmd.addr >= (cmd.addr + (cmd.npages << PAGE_SHIFT)))
		return -EINVAL;

	cmd.cpages = 0;
	cmd.faults = 0;

	switch (command) {
	case HMM_DMIRROR_READ:
		ret = dmirror_read(dmirror, &cmd);
		break;

	case HMM_DMIRROR_WRITE:
		ret = dmirror_write(dmirror, &cmd);
		break;

	case HMM_DMIRROR_MIGRATE:
		ret = dmirror_migrate(dmirror, &cmd);
		break;

	case HMM_DMIRROR_EXCLUSIVE:
		ret = dmirror_exclusive(dmirror, &cmd);
		break;

	case HMM_DMIRROR_CHECK_EXCLUSIVE:
		ret = dmirror_check_atomic(dmirror, cmd.addr,
					cmd.addr + (cmd.npages << PAGE_SHIFT));
		break;

	case HMM_DMIRROR_SNAPSHOT:
		ret = dmirror_snapshot(dmirror, &cmd);
		break;

	default:
		return -EINVAL;
	}
	if (ret)
		return ret;

	if (copy_to_user(uarg, &cmd, sizeof(cmd)))
		return -EFAULT;

	return 0;
}

static int dmirror_fops_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
		struct page *page;
		int ret;

		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!page)
			return -ENOMEM;

		ret = vm_insert_page(vma, addr, page);
		if (ret) {
			__free_page(page);
			return ret;
		}
		put_page(page);
	}

	return 0;
}

static const struct file_operations dmirror_fops = {
	.open		= dmirror_fops_open,
	.release	= dmirror_fops_release,
	.mmap		= dmirror_fops_mmap,
	.unlocked_ioctl = dmirror_fops_unlocked_ioctl,
	.llseek		= default_llseek,
	.owner		= THIS_MODULE,
};

static void dmirror_devmem_free(struct page *page)
{
	struct page *rpage = page->zone_device_data;
	struct dmirror_device *mdevice;

	if (rpage)
		__free_page(rpage);

	mdevice = dmirror_page_to_device(page);

	spin_lock(&mdevice->lock);
	mdevice->cfree++;
	page->zone_device_data = mdevice->free_pages;
	mdevice->free_pages = page;
	spin_unlock(&mdevice->lock);
}
static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
						      struct dmirror *dmirror)
{
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long start = args->start;
	unsigned long end = args->end;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE,
				       src++, dst++) {
		struct page *dpage, *spage;

		spage = migrate_pfn_to_page(*src);
		if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
			continue;
		spage = spage->zone_device_data;

		dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
		if (!dpage)
			continue;

		lock_page(dpage);
		xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
		copy_highpage(dpage, spage);
		*dst = migrate_pfn(page_to_pfn(dpage));
		if (*src & MIGRATE_PFN_WRITE)
			*dst |= MIGRATE_PFN_WRITE;
	}
	return 0;
}

static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
{
	struct migrate_vma args;
	unsigned long src_pfns;
	unsigned long dst_pfns;
	struct page *rpage;
	struct dmirror *dmirror;
	vm_fault_t ret;

	/*
	 * Normally, a device would use the page->zone_device_data to point to
	 * the mirror but here we use it to hold the page for the simulated
	 * device memory and that page holds the pointer to the mirror.
	 */
	rpage = vmf->page->zone_device_data;
	dmirror = rpage->zone_device_data;

	/* FIXME demonstrate how we can adjust migrate range */
	args.vma = vmf->vma;
	args.start = vmf->address;
	args.end = args.start + PAGE_SIZE;
	args.src = &src_pfns;
	args.dst = &dst_pfns;
	args.pgmap_owner = dmirror->mdevice;
	args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;

	if (migrate_vma_setup(&args))
		return VM_FAULT_SIGBUS;

	ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
	if (ret)
		return ret;
	migrate_vma_pages(&args);
	/*
	 * No device finalize step is needed since
	 * dmirror_devmem_fault_alloc_and_copy() will have already
	 * invalidated the device page table.
	 */
	migrate_vma_finalize(&args);
	return 0;
}

static const struct dev_pagemap_ops dmirror_devmem_ops = {
	.page_free	= dmirror_devmem_free,
	.migrate_to_ram	= dmirror_devmem_fault,
};

static int dmirror_device_init(struct dmirror_device *mdevice, int id)
{
	dev_t dev;
	int ret;

	dev = MKDEV(MAJOR(dmirror_dev), id);
	mutex_init(&mdevice->devmem_lock);
	spin_lock_init(&mdevice->lock);

	cdev_init(&mdevice->cdevice, &dmirror_fops);
	mdevice->cdevice.owner = THIS_MODULE;
	ret = cdev_add(&mdevice->cdevice, dev, 1);
	if (ret)
		return ret;

	/* Build a list of free ZONE_DEVICE private struct pages */
	dmirror_allocate_chunk(mdevice, NULL);

	return 0;
}

static void dmirror_device_remove(struct dmirror_device *mdevice)
{
	unsigned int i;

	if (mdevice->devmem_chunks) {
		for (i = 0; i < mdevice->devmem_count; i++) {
			struct dmirror_chunk *devmem =
				mdevice->devmem_chunks[i];

			memunmap_pages(&devmem->pagemap);
			release_mem_region(devmem->pagemap.range.start,
					   range_len(&devmem->pagemap.range));
			kfree(devmem);
		}
		kfree(mdevice->devmem_chunks);
	}

	cdev_del(&mdevice->cdevice);
}
static int __init hmm_dmirror_init(void)
{
	int ret;
	int id;

	ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
				  "HMM_DMIRROR");
	if (ret)
		goto err_unreg;

	for (id = 0; id < DMIRROR_NDEVICES; id++) {
		ret = dmirror_device_init(dmirror_devices + id, id);
		if (ret)
			goto err_chrdev;
	}

	pr_info("HMM test module loaded. This is only for testing HMM.\n");
	return 0;

err_chrdev:
	while (--id >= 0)
		dmirror_device_remove(dmirror_devices + id);
	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
err_unreg:
	return ret;
}

static void __exit hmm_dmirror_exit(void)
{
	int id;

	for (id = 0; id < DMIRROR_NDEVICES; id++)
		dmirror_device_remove(dmirror_devices + id);
	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
}

module_init(hmm_dmirror_init);
module_exit(hmm_dmirror_exit);
MODULE_LICENSE("GPL");
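/*
 * The usual consumers of this module are the HMM selftests (the hmm-tests
 * program, typically driven by test_hmm.sh under tools/testing/selftests/vm/,
 * which loads the module and creates the character device nodes). Nothing
 * here depends on that harness: any userspace program can open the devices
 * and issue the ioctls sketched in the comments above.
 */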