// SPDX-License-Identifier: GPL-2.0
/*
 * This is a module to test the HMM (Heterogeneous Memory Management)
 * mirror and zone device private memory migration APIs of the kernel.
 * Userspace programs can register with the driver to mirror their own address
 * space and can use the device to read/write any valid virtual address.
 */
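/*
 * Illustrative userspace usage (a minimal sketch, not part of this module):
 * open the device, point a struct hmm_dmirror_cmd at a page-aligned range of
 * the process's own address space and issue one of the HMM_DMIRROR_* ioctls
 * declared in test_hmm_uapi.h. The device node name below is an assumption;
 * it depends on how the test environment creates nodes for this char device.
 *
 *	struct hmm_dmirror_cmd cmd = { 0 };
 *	long pagesize = sysconf(_SC_PAGESIZE);
 *	void *src = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	void *dst = malloc(pagesize);
 *	int fd = open("/dev/hmm_dmirror0", O_RDWR);
 *
 *	memset(src, 0xaa, pagesize);
 *	cmd.addr = (uintptr_t)src;
 *	cmd.ptr = (uintptr_t)dst;
 *	cmd.npages = 1;
 *	ioctl(fd, HMM_DMIRROR_READ, &cmd);
 *
 * cmd.addr must be page aligned (the ioctl rejects unaligned addresses) and
 * cmd.ptr is the buffer the driver copies the mirrored data into. On success,
 * cmd.cpages reports how many pages were mirrored and cmd.faults how many
 * device faults were needed to do it.
 */
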
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/delay.h>
#include <linux/pagemap.h>
#include <linux/hmm.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/sched/mm.h>
#include <linux/platform_device.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>

#include "test_hmm_uapi.h"

#define DMIRROR_NDEVICES		2
#define DMIRROR_RANGE_FAULT_TIMEOUT	1000
#define DEVMEM_CHUNK_SIZE		(256 * 1024 * 1024U)
#define DEVMEM_CHUNKS_RESERVE		16

static const struct dev_pagemap_ops dmirror_devmem_ops;
static const struct mmu_interval_notifier_ops dmirror_min_ops;
static dev_t dmirror_dev;

struct dmirror_device;

struct dmirror_bounce {
	void			*ptr;
	unsigned long		size;
	unsigned long		addr;
	unsigned long		cpages;
};

#define DPT_XA_TAG_ATOMIC 1UL
#define DPT_XA_TAG_WRITE 3UL

/*
 * Data structure to track address ranges and register for mmu interval
 * notifier updates.
 */
struct dmirror_interval {
	struct mmu_interval_notifier	notifier;
	struct dmirror			*dmirror;
};

/*
 * Data attached to the open device file.
 * Note that it might be shared after a fork().
 */
struct dmirror {
	struct dmirror_device		*mdevice;
	struct xarray			pt;
	struct mmu_interval_notifier	notifier;
	struct mutex			mutex;
};

/*
 * ZONE_DEVICE pages for migration and simulating device memory.
 */
struct dmirror_chunk {
	struct dev_pagemap	pagemap;
	struct dmirror_device	*mdevice;
};

/*
 * Per device data.
 */
struct dmirror_device {
	struct cdev		cdevice;
	struct hmm_devmem	*devmem;

	unsigned int		devmem_capacity;
	unsigned int		devmem_count;
	struct dmirror_chunk	**devmem_chunks;
	struct mutex		devmem_lock;	/* protects the above */

	unsigned long		calloc;
	unsigned long		cfree;
	struct page		*free_pages;
	spinlock_t		lock;		/* protects the above */
};

static struct dmirror_device dmirror_devices[DMIRROR_NDEVICES];

static int dmirror_bounce_init(struct dmirror_bounce *bounce,
			       unsigned long addr,
			       unsigned long size)
{
	bounce->addr = addr;
	bounce->size = size;
	bounce->cpages = 0;
	bounce->ptr = vmalloc(size);
	if (!bounce->ptr)
		return -ENOMEM;
	return 0;
}

static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
{
	vfree(bounce->ptr);
}

static int dmirror_fops_open(struct inode *inode, struct file *filp)
{
	struct cdev *cdev = inode->i_cdev;
	struct dmirror *dmirror;
	int ret;

	/* Mirror this process address space */
	dmirror = kzalloc(sizeof(*dmirror), GFP_KERNEL);
	if (dmirror == NULL)
		return -ENOMEM;

	dmirror->mdevice = container_of(cdev, struct dmirror_device, cdevice);
	mutex_init(&dmirror->mutex);
	xa_init(&dmirror->pt);

	ret = mmu_interval_notifier_insert(&dmirror->notifier, current->mm,
				0, ULONG_MAX & PAGE_MASK, &dmirror_min_ops);
	if (ret) {
		kfree(dmirror);
		return ret;
	}

	filp->private_data = dmirror;
	return 0;
}

static int dmirror_fops_release(struct inode *inode, struct file *filp)
{
	struct dmirror *dmirror = filp->private_data;

	mmu_interval_notifier_remove(&dmirror->notifier);
	xa_destroy(&dmirror->pt);
	kfree(dmirror);
	return 0;
}

static struct dmirror_device *dmirror_page_to_device(struct page *page)
{
	return container_of(page->pgmap, struct dmirror_chunk,
			    pagemap)->mdevice;
}

static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range)
{
	unsigned long *pfns = range->hmm_pfns;
	unsigned long pfn;

	for (pfn = (range->start >> PAGE_SHIFT);
	     pfn < (range->end >> PAGE_SHIFT);
	     pfn++, pfns++) {
		struct page *page;
		void *entry;

		/*
		 * Since we asked for hmm_range_fault() to populate pages,
		 * it shouldn't return an error entry on success.
		 */
		WARN_ON(*pfns & HMM_PFN_ERROR);
		WARN_ON(!(*pfns & HMM_PFN_VALID));

		page = hmm_pfn_to_page(*pfns);
		WARN_ON(!page);

		entry = page;
		if (*pfns & HMM_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		else if (WARN_ON(range->default_flags & HMM_PFN_WRITE))
			return -EFAULT;
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry))
			return xa_err(entry);
	}

	return 0;
}

static void dmirror_do_update(struct dmirror *dmirror, unsigned long start,
			      unsigned long end)
{
	unsigned long pfn;
	void *entry;

	/*
	 * The XArray doesn't hold references to pages since it relies on
	 * the mmu notifier to clear page pointers when they become stale.
	 * Therefore, it is OK to just clear the entry.
	 */
	xa_for_each_range(&dmirror->pt, pfn, entry, start >> PAGE_SHIFT,
			  end >> PAGE_SHIFT)
		xa_erase(&dmirror->pt, pfn);
}

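/*
 * mmu interval notifier callback: when the CPU page tables change, drop the
 * stale entries from the device page table (the pt XArray), unless the
 * invalidation is for a migration this device itself started.
 */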
static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni,
					const struct mmu_notifier_range *range,
					unsigned long cur_seq)
{
	struct dmirror *dmirror = container_of(mni, struct dmirror, notifier);

	/*
	 * Ignore invalidation callbacks for device private pages since
	 * the invalidation is handled as part of the migration process.
	 */
	if (range->event == MMU_NOTIFY_MIGRATE &&
	    range->owner == dmirror->mdevice)
		return true;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	mmu_interval_set_seq(mni, cur_seq);
	dmirror_do_update(dmirror, range->start, range->end);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_min_ops = {
	.invalidate = dmirror_interval_invalidate,
};

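/*
 * Fault in one range and mirror it, using the usual hmm_range_fault() retry
 * pattern: sample the notifier sequence, fault with the mmap lock held, then
 * recheck the sequence under the device mutex and retry if the range was
 * invalidated in the meantime.
 */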
static int dmirror_range_fault(struct dmirror *dmirror,
			       struct hmm_range *range)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	int ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);
		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	ret = dmirror_do_fault(dmirror, range);

	mutex_unlock(&dmirror->mutex);
out:
	return ret;
}

static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
			 unsigned long end, bool write)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long addr;
	unsigned long pfns[64];
	struct hmm_range range = {
		.notifier = &dmirror->notifier,
		.hmm_pfns = pfns,
		.pfn_flags_mask = 0,
		.default_flags =
			HMM_PFN_REQ_FAULT | (write ? HMM_PFN_REQ_WRITE : 0),
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return 0;

	for (addr = start; addr < end; addr = range.end) {
		range.start = addr;
		range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);

		ret = dmirror_range_fault(dmirror, &range);
		if (ret)
			break;
	}

	mmput(mm);
	return ret;
}

static int dmirror_do_read(struct dmirror *dmirror, unsigned long start,
			   unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;
		void *tmp;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page)
			return -ENOENT;

		tmp = kmap(page);
		memcpy(ptr, tmp, PAGE_SIZE);
		kunmap(page);

		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}

static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_read(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, false);
		if (ret)
			break;
		cmd->faults++;
	}

	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

static int dmirror_do_write(struct dmirror *dmirror, unsigned long start,
			    unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;
		void *tmp;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page || xa_pointer_tag(entry) != DPT_XA_TAG_WRITE)
			return -ENOENT;

		tmp = kmap(page);
		memcpy(tmp, ptr, PAGE_SIZE);
		kunmap(page);

		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}

static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	if (copy_from_user(bounce.ptr, u64_to_user_ptr(cmd->ptr),
			   bounce.size)) {
		ret = -EFAULT;
		goto fini;
	}

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_write(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, true);
		if (ret)
			break;
		cmd->faults++;
	}

fini:
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

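/*
 * Add a new chunk of device private (ZONE_DEVICE) memory to the device:
 * reserve a physical address range, memremap it as MEMORY_DEVICE_PRIVATE,
 * put the new pages on the device's free list and, if @ppage is given,
 * hand one of them straight back to the caller.
 */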
static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
				   struct page **ppage)
{
	struct dmirror_chunk *devmem;
	struct resource *res;
	unsigned long pfn;
	unsigned long pfn_first;
	unsigned long pfn_last;
	void *ptr;

	devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
	if (!devmem)
		return false;

	res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
				      "hmm_dmirror");
	if (IS_ERR(res))
		goto err_devmem;

	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
	devmem->pagemap.range.start = res->start;
	devmem->pagemap.range.end = res->end;
	devmem->pagemap.nr_range = 1;
	devmem->pagemap.ops = &dmirror_devmem_ops;
	devmem->pagemap.owner = mdevice;

	mutex_lock(&mdevice->devmem_lock);

	if (mdevice->devmem_count == mdevice->devmem_capacity) {
		struct dmirror_chunk **new_chunks;
		unsigned int new_capacity;

		new_capacity = mdevice->devmem_capacity +
				DEVMEM_CHUNKS_RESERVE;
		new_chunks = krealloc(mdevice->devmem_chunks,
				sizeof(new_chunks[0]) * new_capacity,
				GFP_KERNEL);
		if (!new_chunks)
			goto err_release;
		mdevice->devmem_capacity = new_capacity;
		mdevice->devmem_chunks = new_chunks;
	}

	ptr = memremap_pages(&devmem->pagemap, numa_node_id());
	if (IS_ERR(ptr))
		goto err_release;

	devmem->mdevice = mdevice;
	pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT;
	pfn_last = pfn_first + (range_len(&devmem->pagemap.range) >> PAGE_SHIFT);
	mdevice->devmem_chunks[mdevice->devmem_count++] = devmem;

	mutex_unlock(&mdevice->devmem_lock);

	pr_info("added new %u MB chunk (total %u chunks, %u MB) PFNs [0x%lx 0x%lx)\n",
		DEVMEM_CHUNK_SIZE / (1024 * 1024),
		mdevice->devmem_count,
		mdevice->devmem_count * (DEVMEM_CHUNK_SIZE / (1024 * 1024)),
		pfn_first, pfn_last);

	spin_lock(&mdevice->lock);
	for (pfn = pfn_first; pfn < pfn_last; pfn++) {
		struct page *page = pfn_to_page(pfn);

		page->zone_device_data = mdevice->free_pages;
		mdevice->free_pages = page;
	}
	if (ppage) {
		*ppage = mdevice->free_pages;
		mdevice->free_pages = (*ppage)->zone_device_data;
		mdevice->calloc++;
	}
	spin_unlock(&mdevice->lock);

	return true;

err_release:
	mutex_unlock(&mdevice->devmem_lock);
	release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
err_devmem:
	kfree(devmem);

	return false;
}

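/*
 * Take a device private page off the device's free list, allocating a new
 * chunk if the list is empty. The returned page is backed by a regular
 * system page (stored in zone_device_data) that holds the actual data.
 */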
static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
{
	struct page *dpage = NULL;
	struct page *rpage;

	/*
	 * This is a fake device so we alloc real system memory to store
	 * our device memory.
	 */
	rpage = alloc_page(GFP_HIGHUSER);
	if (!rpage)
		return NULL;

	spin_lock(&mdevice->lock);

	if (mdevice->free_pages) {
		dpage = mdevice->free_pages;
		mdevice->free_pages = dpage->zone_device_data;
		mdevice->calloc++;
		spin_unlock(&mdevice->lock);
	} else {
		spin_unlock(&mdevice->lock);
		if (!dmirror_allocate_chunk(mdevice, &dpage))
			goto error;
	}

	dpage->zone_device_data = rpage;
	get_page(dpage);
	lock_page(dpage);
	return dpage;

error:
	__free_page(rpage);
	return NULL;
}

static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
					   struct dmirror *dmirror)
{
	struct dmirror_device *mdevice = dmirror->mdevice;
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long addr;

	for (addr = args->start; addr < args->end; addr += PAGE_SIZE,
						   src++, dst++) {
		struct page *spage;
		struct page *dpage;
		struct page *rpage;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		/*
		 * Note that spage might be NULL which is OK since it is an
		 * unallocated pte_none() or read-only zero page.
		 */
		spage = migrate_pfn_to_page(*src);

		dpage = dmirror_devmem_alloc_page(mdevice);
		if (!dpage)
			continue;

		rpage = dpage->zone_device_data;
		if (spage)
			copy_highpage(rpage, spage);
		else
			clear_highpage(rpage);

		/*
		 * Normally, a device would use the page->zone_device_data to
		 * point to the mirror but here we use it to hold the page for
		 * the simulated device memory and that page holds the pointer
		 * to the mirror.
		 */
		rpage->zone_device_data = dmirror;

		*dst = migrate_pfn(page_to_pfn(dpage));
		if ((*src & MIGRATE_PFN_WRITE) ||
		    (!spage && args->vma->vm_flags & VM_WRITE))
			*dst |= MIGRATE_PFN_WRITE;
	}
}

static int dmirror_check_atomic(struct dmirror *dmirror, unsigned long start,
				unsigned long end)
{
	unsigned long pfn;

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;

		entry = xa_load(&dmirror->pt, pfn);
		if (xa_pointer_tag(entry) == DPT_XA_TAG_ATOMIC)
			return -EPERM;
	}

	return 0;
}

static int dmirror_atomic_map(unsigned long start, unsigned long end,
			      struct page **pages, struct dmirror *dmirror)
{
	unsigned long pfn, mapped = 0;
	int i;

	/* Map the migrated pages into the device's page tables. */
	mutex_lock(&dmirror->mutex);

	for (i = 0, pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++, i++) {
		void *entry;

		if (!pages[i])
			continue;

		entry = pages[i];
		entry = xa_tag_pointer(entry, DPT_XA_TAG_ATOMIC);
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry)) {
			mutex_unlock(&dmirror->mutex);
			return xa_err(entry);
		}

		mapped++;
	}

	mutex_unlock(&dmirror->mutex);
	return mapped;
}

static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
					    struct dmirror *dmirror)
{
	unsigned long start = args->start;
	unsigned long end = args->end;
	const unsigned long *src = args->src;
	const unsigned long *dst = args->dst;
	unsigned long pfn;

	/* Map the migrated pages into the device's page tables. */
	mutex_lock(&dmirror->mutex);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++,
								src++, dst++) {
		struct page *dpage;
		void *entry;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = migrate_pfn_to_page(*dst);
		if (!dpage)
			continue;

		/*
		 * Store the page that holds the data so the page table
		 * doesn't have to deal with ZONE_DEVICE private pages.
		 */
		entry = dpage->zone_device_data;
		if (*dst & MIGRATE_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry)) {
			mutex_unlock(&dmirror->mutex);
			return xa_err(entry);
		}
	}

	mutex_unlock(&dmirror->mutex);
	return 0;
}

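/*
 * HMM_DMIRROR_EXCLUSIVE: mark the range for exclusive (atomic) access by the
 * device with make_device_exclusive_range(), map the returned pages into the
 * mirror tagged DPT_XA_TAG_ATOMIC, then read the data back for verification.
 */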
static int dmirror_exclusive(struct dmirror *dmirror,
			     struct hmm_dmirror_cmd *cmd)
{
	unsigned long start, end, addr;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	struct mm_struct *mm = dmirror->notifier.mm;
	struct page *pages[64];
	struct dmirror_bounce bounce;
	unsigned long next;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	mmap_read_lock(mm);
	for (addr = start; addr < end; addr = next) {
		unsigned long mapped;
		int i;

		if (end < addr + (ARRAY_SIZE(pages) << PAGE_SHIFT))
			next = end;
		else
			next = addr + (ARRAY_SIZE(pages) << PAGE_SHIFT);

		ret = make_device_exclusive_range(mm, addr, next, pages, NULL);
		mapped = dmirror_atomic_map(addr, next, pages, dmirror);
		for (i = 0; i < ret; i++) {
			if (pages[i]) {
				unlock_page(pages[i]);
				put_page(pages[i]);
			}
		}

		if (addr + (mapped << PAGE_SHIFT) < next) {
			mmap_read_unlock(mm);
			mmput(mm);
			return -EBUSY;
		}
	}
	mmap_read_unlock(mm);
	mmput(mm);

	/* Return the migrated data for verification. */
	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	mutex_lock(&dmirror->mutex);
	ret = dmirror_do_read(dmirror, start, end, &bounce);
	mutex_unlock(&dmirror->mutex);
	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}

	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

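/*
 * HMM_DMIRROR_MIGRATE: migrate system memory in the given range to the
 * device's private memory, 64 pages at a time, then read the migrated data
 * back through the mirror for verification.
 */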
static int dmirror_migrate(struct dmirror *dmirror,
			   struct hmm_dmirror_cmd *cmd)
{
	unsigned long start, end, addr;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	struct mm_struct *mm = dmirror->notifier.mm;
	struct vm_area_struct *vma;
	unsigned long src_pfns[64];
	unsigned long dst_pfns[64];
	struct dmirror_bounce bounce;
	struct migrate_vma args;
	unsigned long next;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	mmap_read_lock(mm);
	for (addr = start; addr < end; addr = next) {
		vma = vma_lookup(mm, addr);
		if (!vma || !(vma->vm_flags & VM_READ)) {
			ret = -EINVAL;
			goto out;
		}
		next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
		if (next > vma->vm_end)
			next = vma->vm_end;

		args.vma = vma;
		args.src = src_pfns;
		args.dst = dst_pfns;
		args.start = addr;
		args.end = next;
		args.pgmap_owner = dmirror->mdevice;
		args.flags = MIGRATE_VMA_SELECT_SYSTEM;
		ret = migrate_vma_setup(&args);
		if (ret)
			goto out;

		dmirror_migrate_alloc_and_copy(&args, dmirror);
		migrate_vma_pages(&args);
		dmirror_migrate_finalize_and_map(&args, dmirror);
		migrate_vma_finalize(&args);
	}
	mmap_read_unlock(mm);
	mmput(mm);

	/* Return the migrated data for verification. */
	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	mutex_lock(&dmirror->mutex);
	ret = dmirror_do_read(dmirror, start, end, &bounce);
	mutex_unlock(&dmirror->mutex);
	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;

out:
	mmap_read_unlock(mm);
	mmput(mm);
	return ret;
}

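/*
 * Encode the state of one hmm_range_fault() PFN entry as a
 * HMM_DMIRROR_PROT_* value for the snapshot ioctl.
 */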
static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
			    unsigned char *perm, unsigned long entry)
{
	struct page *page;

	if (entry & HMM_PFN_ERROR) {
		*perm = HMM_DMIRROR_PROT_ERROR;
		return;
	}
	if (!(entry & HMM_PFN_VALID)) {
		*perm = HMM_DMIRROR_PROT_NONE;
		return;
	}

	page = hmm_pfn_to_page(entry);
	if (is_device_private_page(page)) {
		/* Is the page migrated to this device or some other? */
		if (dmirror->mdevice == dmirror_page_to_device(page))
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
		else
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
	} else if (is_zero_pfn(page_to_pfn(page)))
		*perm = HMM_DMIRROR_PROT_ZERO;
	else
		*perm = HMM_DMIRROR_PROT_NONE;
	if (entry & HMM_PFN_WRITE)
		*perm |= HMM_DMIRROR_PROT_WRITE;
	else
		*perm |= HMM_DMIRROR_PROT_READ;
	if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PMD_SHIFT)
		*perm |= HMM_DMIRROR_PROT_PMD;
	else if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PUD_SHIFT)
		*perm |= HMM_DMIRROR_PROT_PUD;
}

static bool dmirror_snapshot_invalidate(struct mmu_interval_notifier *mni,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct dmirror_interval *dmi =
		container_of(mni, struct dmirror_interval, notifier);
	struct dmirror *dmirror = dmi->dmirror;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	/*
	 * Snapshots only need to set the sequence number since any
	 * invalidation in the interval invalidates the whole snapshot.
	 */
	mmu_interval_set_seq(mni, cur_seq);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_mrn_ops = {
	.invalidate = dmirror_snapshot_invalidate,
};

static int dmirror_range_snapshot(struct dmirror *dmirror,
				  struct hmm_range *range,
				  unsigned char *perm)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	struct dmirror_interval notifier;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	unsigned long i;
	unsigned long n;
	int ret = 0;

	notifier.dmirror = dmirror;
	range->notifier = &notifier.notifier;

	ret = mmu_interval_notifier_insert(range->notifier, mm,
			range->start, range->end - range->start,
			&dmirror_mrn_ops);
	if (ret)
		return ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);

		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	n = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0; i < n; i++)
		dmirror_mkentry(dmirror, range, perm + i, range->hmm_pfns[i]);

	mutex_unlock(&dmirror->mutex);
out:
	mmu_interval_notifier_remove(range->notifier);
	return ret;
}

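/*
 * HMM_DMIRROR_SNAPSHOT: report the current state of each page in the range
 * as a HMM_DMIRROR_PROT_* byte, without faulting anything in (no
 * HMM_PFN_REQ_* flags are requested).
 */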
static int dmirror_snapshot(struct dmirror *dmirror,
			    struct hmm_dmirror_cmd *cmd)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	unsigned long addr;
	unsigned long next;
	unsigned long pfns[64];
	unsigned char perm[64];
	char __user *uptr;
	struct hmm_range range = {
		.hmm_pfns = pfns,
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	/*
	 * Register a temporary notifier to detect invalidations even if it
	 * overlaps with other mmu_interval_notifiers.
	 */
	uptr = u64_to_user_ptr(cmd->ptr);
	for (addr = start; addr < end; addr = next) {
		unsigned long n;

		next = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);
		range.start = addr;
		range.end = next;

		ret = dmirror_range_snapshot(dmirror, &range, perm);
		if (ret)
			break;

		n = (range.end - range.start) >> PAGE_SHIFT;
		if (copy_to_user(uptr, perm, n)) {
			ret = -EFAULT;
			break;
		}

		cmd->cpages += n;
		uptr += n;
	}
	mmput(mm);

	return ret;
}

static long dmirror_fops_unlocked_ioctl(struct file *filp,
					unsigned int command,
					unsigned long arg)
{
	void __user *uarg = (void __user *)arg;
	struct hmm_dmirror_cmd cmd;
	struct dmirror *dmirror;
	int ret;

	dmirror = filp->private_data;
	if (!dmirror)
		return -EINVAL;

	if (copy_from_user(&cmd, uarg, sizeof(cmd)))
		return -EFAULT;

	if (cmd.addr & ~PAGE_MASK)
		return -EINVAL;
	if (cmd.addr >= (cmd.addr + (cmd.npages << PAGE_SHIFT)))
		return -EINVAL;

	cmd.cpages = 0;
	cmd.faults = 0;

	switch (command) {
	case HMM_DMIRROR_READ:
		ret = dmirror_read(dmirror, &cmd);
		break;

	case HMM_DMIRROR_WRITE:
		ret = dmirror_write(dmirror, &cmd);
		break;

	case HMM_DMIRROR_MIGRATE:
		ret = dmirror_migrate(dmirror, &cmd);
		break;

	case HMM_DMIRROR_EXCLUSIVE:
		ret = dmirror_exclusive(dmirror, &cmd);
		break;

	case HMM_DMIRROR_CHECK_EXCLUSIVE:
		ret = dmirror_check_atomic(dmirror, cmd.addr,
					cmd.addr + (cmd.npages << PAGE_SHIFT));
		break;

	case HMM_DMIRROR_SNAPSHOT:
		ret = dmirror_snapshot(dmirror, &cmd);
		break;

	default:
		return -EINVAL;
	}
	if (ret)
		return ret;

	if (copy_to_user(uarg, &cmd, sizeof(cmd)))
		return -EFAULT;

	return 0;
}

static int dmirror_fops_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
		struct page *page;
		int ret;

		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!page)
			return -ENOMEM;

		ret = vm_insert_page(vma, addr, page);
		if (ret) {
			__free_page(page);
			return ret;
		}
		put_page(page);
	}

	return 0;
}

static const struct file_operations dmirror_fops = {
	.open		= dmirror_fops_open,
	.release	= dmirror_fops_release,
	.mmap		= dmirror_fops_mmap,
	.unlocked_ioctl = dmirror_fops_unlocked_ioctl,
	.llseek		= default_llseek,
	.owner		= THIS_MODULE,
};

static void dmirror_devmem_free(struct page *page)
{
	struct page *rpage = page->zone_device_data;
	struct dmirror_device *mdevice;

	if (rpage)
		__free_page(rpage);

	mdevice = dmirror_page_to_device(page);

	spin_lock(&mdevice->lock);
	mdevice->cfree++;
	page->zone_device_data = mdevice->free_pages;
	mdevice->free_pages = page;
	spin_unlock(&mdevice->lock);
}

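/*
 * A CPU fault on a device private page migrates the data back to system
 * memory: allocate the destination pages and copy from the backing pages
 * here, dropping the mirror's entries for the range as we go.
 */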
static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
						      struct dmirror *dmirror)
{
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long start = args->start;
	unsigned long end = args->end;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE,
				       src++, dst++) {
		struct page *dpage, *spage;

		spage = migrate_pfn_to_page(*src);
		if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
			continue;
		spage = spage->zone_device_data;

		dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
		if (!dpage)
			continue;

		lock_page(dpage);
		xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
		copy_highpage(dpage, spage);
		*dst = migrate_pfn(page_to_pfn(dpage));
		if (*src & MIGRATE_PFN_WRITE)
			*dst |= MIGRATE_PFN_WRITE;
	}
	return 0;
}

static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
{
	struct migrate_vma args;
	unsigned long src_pfns;
	unsigned long dst_pfns;
	struct page *rpage;
	struct dmirror *dmirror;
	vm_fault_t ret;

	/*
	 * Normally, a device would use the page->zone_device_data to point to
	 * the mirror but here we use it to hold the page for the simulated
	 * device memory and that page holds the pointer to the mirror.
	 */
	rpage = vmf->page->zone_device_data;
	dmirror = rpage->zone_device_data;

	/* FIXME demonstrate how we can adjust migrate range */
	args.vma = vmf->vma;
	args.start = vmf->address;
	args.end = args.start + PAGE_SIZE;
	args.src = &src_pfns;
	args.dst = &dst_pfns;
	args.pgmap_owner = dmirror->mdevice;
	args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;

	if (migrate_vma_setup(&args))
		return VM_FAULT_SIGBUS;

	ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
	if (ret)
		return ret;
	migrate_vma_pages(&args);
	/*
	 * No device finalize step is needed since
	 * dmirror_devmem_fault_alloc_and_copy() will have already
	 * invalidated the device page table.
	 */
	migrate_vma_finalize(&args);
	return 0;
}

static const struct dev_pagemap_ops dmirror_devmem_ops = {
	.page_free	= dmirror_devmem_free,
	.migrate_to_ram	= dmirror_devmem_fault,
};

static int dmirror_device_init(struct dmirror_device *mdevice, int id)
{
	dev_t dev;
	int ret;

	dev = MKDEV(MAJOR(dmirror_dev), id);
	mutex_init(&mdevice->devmem_lock);
	spin_lock_init(&mdevice->lock);

	cdev_init(&mdevice->cdevice, &dmirror_fops);
	mdevice->cdevice.owner = THIS_MODULE;
	ret = cdev_add(&mdevice->cdevice, dev, 1);
	if (ret)
		return ret;

	/* Build a list of free ZONE_DEVICE private struct pages */
	dmirror_allocate_chunk(mdevice, NULL);

	return 0;
}

static void dmirror_device_remove(struct dmirror_device *mdevice)
{
	unsigned int i;

	if (mdevice->devmem_chunks) {
		for (i = 0; i < mdevice->devmem_count; i++) {
			struct dmirror_chunk *devmem =
				mdevice->devmem_chunks[i];

			memunmap_pages(&devmem->pagemap);
			release_mem_region(devmem->pagemap.range.start,
					   range_len(&devmem->pagemap.range));
			kfree(devmem);
		}
		kfree(mdevice->devmem_chunks);
	}

	cdev_del(&mdevice->cdevice);
}

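/*
 * Module init: allocate the char device region and register one
 * dmirror_device per minor number.
 */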
static int __init hmm_dmirror_init(void)
{
	int ret;
	int id;

	ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
				  "HMM_DMIRROR");
	if (ret)
		goto err_unreg;

	for (id = 0; id < DMIRROR_NDEVICES; id++) {
		ret = dmirror_device_init(dmirror_devices + id, id);
		if (ret)
			goto err_chrdev;
	}

	pr_info("HMM test module loaded. This is only for testing HMM.\n");
	return 0;

err_chrdev:
	while (--id >= 0)
		dmirror_device_remove(dmirror_devices + id);
	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
err_unreg:
	return ret;
}

static void __exit hmm_dmirror_exit(void)
{
	int id;

	for (id = 0; id < DMIRROR_NDEVICES; id++)
		dmirror_device_remove(dmirror_devices + id);
	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
}

module_init(hmm_dmirror_init);
module_exit(hmm_dmirror_exit);
MODULE_LICENSE("GPL");