// SPDX-License-Identifier: GPL-2.0
/*
 * This is a module to test the HMM (Heterogeneous Memory Management)
 * mirror and zone device private memory migration APIs of the kernel.
 * Userspace programs can register with the driver to mirror their own address
 * space and can use the device to read/write any valid virtual address.
 */
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/delay.h>
#include <linux/pagemap.h>
#include <linux/hmm.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/sched/mm.h>
#include <linux/platform_device.h>
#include <linux/rmap.h>

#include "test_hmm_uapi.h"

#define DMIRROR_NDEVICES 2
#define DMIRROR_RANGE_FAULT_TIMEOUT 1000
#define DEVMEM_CHUNK_SIZE (256 * 1024 * 1024U)
#define DEVMEM_CHUNKS_RESERVE 16

static const struct dev_pagemap_ops dmirror_devmem_ops;
static const struct mmu_interval_notifier_ops dmirror_min_ops;
static dev_t dmirror_dev;

struct dmirror_device;

struct dmirror_bounce {
	void *ptr;
	unsigned long size;
	unsigned long addr;
	unsigned long cpages;
};

#define DPT_XA_TAG_ATOMIC 1UL
#define DPT_XA_TAG_WRITE 3UL

/*
 * Data structure to track address ranges and register for mmu interval
 * notifier updates.
 */
struct dmirror_interval {
	struct mmu_interval_notifier notifier;
	struct dmirror *dmirror;
};

/*
 * Data attached to the open device file.
 * Note that it might be shared after a fork().
 */
struct dmirror {
	struct dmirror_device *mdevice;
	struct xarray pt;
	struct mmu_interval_notifier notifier;
	struct mutex mutex;
};

/*
 * ZONE_DEVICE pages for migration and simulating device memory.
 */
struct dmirror_chunk {
	struct dev_pagemap pagemap;
	struct dmirror_device *mdevice;
};

/*
 * Per device data.
 */
struct dmirror_device {
	struct cdev cdevice;
	struct hmm_devmem *devmem;

	unsigned int devmem_capacity;
	unsigned int devmem_count;
	struct dmirror_chunk **devmem_chunks;
	struct mutex devmem_lock;	/* protects the above */

	unsigned long calloc;
	unsigned long cfree;
	struct page *free_pages;
	spinlock_t lock;		/* protects the above */
};

static struct dmirror_device dmirror_devices[DMIRROR_NDEVICES];

static int dmirror_bounce_init(struct dmirror_bounce *bounce,
			       unsigned long addr,
			       unsigned long size)
{
	bounce->addr = addr;
	bounce->size = size;
	bounce->cpages = 0;
	bounce->ptr = vmalloc(size);
	if (!bounce->ptr)
		return -ENOMEM;
	return 0;
}

static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
{
	vfree(bounce->ptr);
}

static int dmirror_fops_open(struct inode *inode, struct file *filp)
{
	struct cdev *cdev = inode->i_cdev;
	struct dmirror *dmirror;
	int ret;

	/* Mirror this process address space */
	dmirror = kzalloc(sizeof(*dmirror), GFP_KERNEL);
	if (dmirror == NULL)
		return -ENOMEM;

	dmirror->mdevice = container_of(cdev, struct dmirror_device, cdevice);
	mutex_init(&dmirror->mutex);
	xa_init(&dmirror->pt);

	ret = mmu_interval_notifier_insert(&dmirror->notifier, current->mm,
				0, ULONG_MAX & PAGE_MASK, &dmirror_min_ops);
	if (ret) {
		kfree(dmirror);
		return ret;
	}

	filp->private_data = dmirror;
	return 0;
}

static int dmirror_fops_release(struct inode *inode, struct file *filp)
{
	struct dmirror *dmirror = filp->private_data;

	mmu_interval_notifier_remove(&dmirror->notifier);
	xa_destroy(&dmirror->pt);
	kfree(dmirror);
	return 0;
}

static struct dmirror_device *dmirror_page_to_device(struct page *page)
{
	return container_of(page->pgmap, struct dmirror_chunk,
			    pagemap)->mdevice;
}

static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range)
{
	unsigned long *pfns = range->hmm_pfns;
	unsigned long pfn;

	for (pfn = (range->start >> PAGE_SHIFT);
	     pfn < (range->end >> PAGE_SHIFT);
	     pfn++, pfns++) {
		struct page *page;
		void *entry;

		/*
		 * Since we asked for hmm_range_fault() to populate pages,
		 * it shouldn't return an error entry on success.
		 */
		WARN_ON(*pfns & HMM_PFN_ERROR);
		WARN_ON(!(*pfns & HMM_PFN_VALID));

		page = hmm_pfn_to_page(*pfns);
		WARN_ON(!page);

		entry = page;
		if (*pfns & HMM_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		else if (WARN_ON(range->default_flags & HMM_PFN_WRITE))
			return -EFAULT;
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry))
			return xa_err(entry);
	}

	return 0;
}

static void dmirror_do_update(struct dmirror *dmirror, unsigned long start,
			      unsigned long end)
{
	unsigned long pfn;
	void *entry;

	/*
	 * The XArray doesn't hold references to pages since it relies on
	 * the mmu notifier to clear page pointers when they become stale.
	 * Therefore, it is OK to just clear the entry.
	 */
	xa_for_each_range(&dmirror->pt, pfn, entry, start >> PAGE_SHIFT,
			  end >> PAGE_SHIFT)
		xa_erase(&dmirror->pt, pfn);
}

static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct dmirror *dmirror = container_of(mni, struct dmirror, notifier);

	/*
	 * Ignore invalidation callbacks for device private pages since
	 * the invalidation is handled as part of the migration process.
	 */
	if (range->event == MMU_NOTIFY_MIGRATE &&
	    range->owner == dmirror->mdevice)
		return true;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	mmu_interval_set_seq(mni, cur_seq);
	dmirror_do_update(dmirror, range->start, range->end);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_min_ops = {
	.invalidate = dmirror_interval_invalidate,
};

static int dmirror_range_fault(struct dmirror *dmirror,
			       struct hmm_range *range)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	int ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);
		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	ret = dmirror_do_fault(dmirror, range);

	mutex_unlock(&dmirror->mutex);
out:
	return ret;
}

static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
			 unsigned long end, bool write)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long addr;
	unsigned long pfns[64];
	struct hmm_range range = {
		.notifier = &dmirror->notifier,
		.hmm_pfns = pfns,
		.pfn_flags_mask = 0,
		.default_flags =
			HMM_PFN_REQ_FAULT | (write ? HMM_PFN_REQ_WRITE : 0),
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return 0;

	for (addr = start; addr < end; addr = range.end) {
		range.start = addr;
		range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);

		ret = dmirror_range_fault(dmirror, &range);
		if (ret)
			break;
	}

	mmput(mm);
	return ret;
}

static int dmirror_do_read(struct dmirror *dmirror, unsigned long start,
			   unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;
		void *tmp;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page)
			return -ENOENT;

		tmp = kmap(page);
		memcpy(ptr, tmp, PAGE_SIZE);
		kunmap(page);

		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}

static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_read(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, false);
		if (ret)
			break;
		cmd->faults++;
	}

	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

static int dmirror_do_write(struct dmirror *dmirror, unsigned long start,
			    unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;
		void *tmp;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page || xa_pointer_tag(entry) != DPT_XA_TAG_WRITE)
			return -ENOENT;

		tmp = kmap(page);
		memcpy(tmp, ptr, PAGE_SIZE);
		kunmap(page);

		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}

static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	if (copy_from_user(bounce.ptr, u64_to_user_ptr(cmd->ptr),
			   bounce.size)) {
		ret = -EFAULT;
		goto fini;
	}

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_write(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, true);
		if (ret)
			break;
		cmd->faults++;
	}

fini:
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
				   struct page **ppage)
{
	struct dmirror_chunk *devmem;
	struct resource *res;
	unsigned long pfn;
	unsigned long pfn_first;
	unsigned long pfn_last;
	void *ptr;

	devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
	if (!devmem)
		return false;

	res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
				      "hmm_dmirror");
	if (IS_ERR(res))
		goto err_devmem;

	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
	devmem->pagemap.range.start = res->start;
	devmem->pagemap.range.end = res->end;
	devmem->pagemap.nr_range = 1;
	devmem->pagemap.ops = &dmirror_devmem_ops;
	devmem->pagemap.owner = mdevice;

	mutex_lock(&mdevice->devmem_lock);

	if (mdevice->devmem_count == mdevice->devmem_capacity) {
		struct dmirror_chunk **new_chunks;
		unsigned int new_capacity;

		new_capacity = mdevice->devmem_capacity +
				DEVMEM_CHUNKS_RESERVE;
		new_chunks = krealloc(mdevice->devmem_chunks,
				sizeof(new_chunks[0]) * new_capacity,
				GFP_KERNEL);
		if (!new_chunks)
			goto err_release;
		mdevice->devmem_capacity = new_capacity;
		mdevice->devmem_chunks = new_chunks;
	}

	ptr = memremap_pages(&devmem->pagemap, numa_node_id());
	if (IS_ERR(ptr))
		goto err_release;

	devmem->mdevice = mdevice;
	pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT;
	pfn_last = pfn_first + (range_len(&devmem->pagemap.range) >> PAGE_SHIFT);
	mdevice->devmem_chunks[mdevice->devmem_count++] = devmem;

	mutex_unlock(&mdevice->devmem_lock);

	pr_info("added new %u MB chunk (total %u chunks, %u MB) PFNs [0x%lx 0x%lx)\n",
		DEVMEM_CHUNK_SIZE / (1024 * 1024),
		mdevice->devmem_count,
		mdevice->devmem_count * (DEVMEM_CHUNK_SIZE / (1024 * 1024)),
		pfn_first, pfn_last);

	spin_lock(&mdevice->lock);
	for (pfn = pfn_first; pfn < pfn_last; pfn++) {
		struct page *page = pfn_to_page(pfn);

		page->zone_device_data = mdevice->free_pages;
		mdevice->free_pages = page;
	}
	if (ppage) {
		*ppage = mdevice->free_pages;
		mdevice->free_pages = (*ppage)->zone_device_data;
		mdevice->calloc++;
	}
	spin_unlock(&mdevice->lock);

	return true;

err_release:
	mutex_unlock(&mdevice->devmem_lock);
	release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
err_devmem:
	kfree(devmem);

	return false;
}

static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
{
	struct page *dpage = NULL;
	struct page *rpage;

	/*
	 * This is a fake device so we alloc real system memory to store
	 * our device memory.
	 */
	rpage = alloc_page(GFP_HIGHUSER);
	if (!rpage)
		return NULL;

	spin_lock(&mdevice->lock);

	if (mdevice->free_pages) {
		dpage = mdevice->free_pages;
		mdevice->free_pages = dpage->zone_device_data;
		mdevice->calloc++;
		spin_unlock(&mdevice->lock);
	} else {
		spin_unlock(&mdevice->lock);
		if (!dmirror_allocate_chunk(mdevice, &dpage))
			goto error;
	}

	dpage->zone_device_data = rpage;
	get_page(dpage);
	lock_page(dpage);
	return dpage;

error:
	__free_page(rpage);
	return NULL;
}

static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
					   struct dmirror *dmirror)
{
	struct dmirror_device *mdevice = dmirror->mdevice;
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long addr;

	for (addr = args->start; addr < args->end; addr += PAGE_SIZE,
						   src++, dst++) {
		struct page *spage;
		struct page *dpage;
		struct page *rpage;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		/*
		 * Note that spage might be NULL which is OK since it is an
		 * unallocated pte_none() or read-only zero page.
		 */
		spage = migrate_pfn_to_page(*src);

		dpage = dmirror_devmem_alloc_page(mdevice);
		if (!dpage)
			continue;

		rpage = dpage->zone_device_data;
		if (spage)
			copy_highpage(rpage, spage);
		else
			clear_highpage(rpage);

		/*
		 * Normally, a device would use the page->zone_device_data to
		 * point to the mirror but here we use it to hold the page for
		 * the simulated device memory and that page holds the pointer
		 * to the mirror.
		 */
		rpage->zone_device_data = dmirror;

		*dst = migrate_pfn(page_to_pfn(dpage));
		if ((*src & MIGRATE_PFN_WRITE) ||
		    (!spage && args->vma->vm_flags & VM_WRITE))
			*dst |= MIGRATE_PFN_WRITE;
	}
}

static int dmirror_check_atomic(struct dmirror *dmirror, unsigned long start,
				unsigned long end)
{
	unsigned long pfn;

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;

		entry = xa_load(&dmirror->pt, pfn);
		if (xa_pointer_tag(entry) == DPT_XA_TAG_ATOMIC)
			return -EPERM;
	}

	return 0;
}

static int dmirror_atomic_map(unsigned long start, unsigned long end,
			      struct page **pages, struct dmirror *dmirror)
{
	unsigned long pfn, mapped = 0;
	int i;

	/* Map the migrated pages into the device's page tables. */
	mutex_lock(&dmirror->mutex);

	for (i = 0, pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++, i++) {
		void *entry;

		if (!pages[i])
			continue;

		entry = pages[i];
		entry = xa_tag_pointer(entry, DPT_XA_TAG_ATOMIC);
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry)) {
			mutex_unlock(&dmirror->mutex);
			return xa_err(entry);
		}

		mapped++;
	}

	mutex_unlock(&dmirror->mutex);
	return mapped;
}

static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
					    struct dmirror *dmirror)
{
	unsigned long start = args->start;
	unsigned long end = args->end;
	const unsigned long *src = args->src;
	const unsigned long *dst = args->dst;
	unsigned long pfn;

	/* Map the migrated pages into the device's page tables. */
	mutex_lock(&dmirror->mutex);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++,
								src++, dst++) {
		struct page *dpage;
		void *entry;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = migrate_pfn_to_page(*dst);
		if (!dpage)
			continue;

		/*
		 * Store the page that holds the data so the page table
		 * doesn't have to deal with ZONE_DEVICE private pages.
		 */
		entry = dpage->zone_device_data;
		if (*dst & MIGRATE_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry)) {
			mutex_unlock(&dmirror->mutex);
			return xa_err(entry);
		}
	}

	mutex_unlock(&dmirror->mutex);
	return 0;
}

static int dmirror_exclusive(struct dmirror *dmirror,
			     struct hmm_dmirror_cmd *cmd)
{
	unsigned long start, end, addr;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	struct mm_struct *mm = dmirror->notifier.mm;
	struct page *pages[64];
	struct dmirror_bounce bounce;
	unsigned long next;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	mmap_read_lock(mm);
	for (addr = start; addr < end; addr = next) {
		unsigned long mapped;
		int i;

		if (end < addr + (ARRAY_SIZE(pages) << PAGE_SHIFT))
			next = end;
		else
			next = addr + (ARRAY_SIZE(pages) << PAGE_SHIFT);

		ret = make_device_exclusive_range(mm, addr, next, pages, NULL);
		mapped = dmirror_atomic_map(addr, next, pages, dmirror);
		for (i = 0; i < ret; i++) {
			if (pages[i]) {
				unlock_page(pages[i]);
				put_page(pages[i]);
			}
		}

		if (addr + (mapped << PAGE_SHIFT) < next) {
			mmap_read_unlock(mm);
			mmput(mm);
			return -EBUSY;
		}
	}
	mmap_read_unlock(mm);
	mmput(mm);

	/* Return the migrated data for verification. */
	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	mutex_lock(&dmirror->mutex);
	ret = dmirror_do_read(dmirror, start, end, &bounce);
	mutex_unlock(&dmirror->mutex);
	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}

	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

static int dmirror_migrate(struct dmirror *dmirror,
			   struct hmm_dmirror_cmd *cmd)
{
	unsigned long start, end, addr;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	struct mm_struct *mm = dmirror->notifier.mm;
	struct vm_area_struct *vma;
	unsigned long src_pfns[64];
	unsigned long dst_pfns[64];
	struct dmirror_bounce bounce;
	struct migrate_vma args;
	unsigned long next;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	mmap_read_lock(mm);
	for (addr = start; addr < end; addr = next) {
		vma = vma_lookup(mm, addr);
		if (!vma || !(vma->vm_flags & VM_READ)) {
			ret = -EINVAL;
			goto out;
		}
		next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
		if (next > vma->vm_end)
			next = vma->vm_end;

		args.vma = vma;
		args.src = src_pfns;
		args.dst = dst_pfns;
		args.start = addr;
		args.end = next;
		args.pgmap_owner = dmirror->mdevice;
		args.flags = MIGRATE_VMA_SELECT_SYSTEM;
		ret = migrate_vma_setup(&args);
		if (ret)
			goto out;

		dmirror_migrate_alloc_and_copy(&args, dmirror);
		migrate_vma_pages(&args);
		dmirror_migrate_finalize_and_map(&args, dmirror);
		migrate_vma_finalize(&args);
	}
	mmap_read_unlock(mm);
	mmput(mm);

	/* Return the migrated data for verification. */
	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	mutex_lock(&dmirror->mutex);
	ret = dmirror_do_read(dmirror, start, end, &bounce);
	mutex_unlock(&dmirror->mutex);
	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;

out:
	mmap_read_unlock(mm);
	mmput(mm);
	return ret;
}

static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
			    unsigned char *perm, unsigned long entry)
{
	struct page *page;

	if (entry & HMM_PFN_ERROR) {
		*perm = HMM_DMIRROR_PROT_ERROR;
		return;
	}
	if (!(entry & HMM_PFN_VALID)) {
		*perm = HMM_DMIRROR_PROT_NONE;
		return;
	}

	page = hmm_pfn_to_page(entry);
	if (is_device_private_page(page)) {
		/* Is the page migrated to this device or some other? */
		if (dmirror->mdevice == dmirror_page_to_device(page))
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
		else
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
	} else if (is_zero_pfn(page_to_pfn(page)))
		*perm = HMM_DMIRROR_PROT_ZERO;
	else
		*perm = HMM_DMIRROR_PROT_NONE;
	if (entry & HMM_PFN_WRITE)
		*perm |= HMM_DMIRROR_PROT_WRITE;
	else
		*perm |= HMM_DMIRROR_PROT_READ;
	if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PMD_SHIFT)
		*perm |= HMM_DMIRROR_PROT_PMD;
	else if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PUD_SHIFT)
		*perm |= HMM_DMIRROR_PROT_PUD;
}

static bool dmirror_snapshot_invalidate(struct mmu_interval_notifier *mni,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct dmirror_interval *dmi =
		container_of(mni, struct dmirror_interval, notifier);
	struct dmirror *dmirror = dmi->dmirror;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	/*
	 * Snapshots only need to set the sequence number since any
	 * invalidation in the interval invalidates the whole snapshot.
	 */
	mmu_interval_set_seq(mni, cur_seq);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_mrn_ops = {
	.invalidate = dmirror_snapshot_invalidate,
};

static int dmirror_range_snapshot(struct dmirror *dmirror,
				  struct hmm_range *range,
				  unsigned char *perm)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	struct dmirror_interval notifier;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	unsigned long i;
	unsigned long n;
	int ret = 0;

	notifier.dmirror = dmirror;
	range->notifier = &notifier.notifier;

	ret = mmu_interval_notifier_insert(range->notifier, mm,
			range->start, range->end - range->start,
			&dmirror_mrn_ops);
	if (ret)
		return ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);

		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	n = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0; i < n; i++)
		dmirror_mkentry(dmirror, range, perm + i, range->hmm_pfns[i]);

	mutex_unlock(&dmirror->mutex);
out:
	mmu_interval_notifier_remove(range->notifier);
	return ret;
}

static int dmirror_snapshot(struct dmirror *dmirror,
			    struct hmm_dmirror_cmd *cmd)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	unsigned long addr;
	unsigned long next;
	unsigned long pfns[64];
	unsigned char perm[64];
	char __user *uptr;
	struct hmm_range range = {
		.hmm_pfns = pfns,
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	/*
	 * Register a temporary notifier to detect invalidations even if it
	 * overlaps with other mmu_interval_notifiers.
	 */
	uptr = u64_to_user_ptr(cmd->ptr);
	for (addr = start; addr < end; addr = next) {
		unsigned long n;

		next = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);
		range.start = addr;
		range.end = next;

		ret = dmirror_range_snapshot(dmirror, &range, perm);
		if (ret)
			break;

		n = (range.end - range.start) >> PAGE_SHIFT;
		if (copy_to_user(uptr, perm, n)) {
			ret = -EFAULT;
			break;
		}

		cmd->cpages += n;
		uptr += n;
	}
	mmput(mm);

	return ret;
}

static long dmirror_fops_unlocked_ioctl(struct file *filp,
					unsigned int command,
					unsigned long arg)
{
	void __user *uarg = (void __user *)arg;
	struct hmm_dmirror_cmd cmd;
	struct dmirror *dmirror;
	int ret;

	dmirror = filp->private_data;
	if (!dmirror)
		return -EINVAL;

	if (copy_from_user(&cmd, uarg, sizeof(cmd)))
		return -EFAULT;

	if (cmd.addr & ~PAGE_MASK)
		return -EINVAL;
	if (cmd.addr >= (cmd.addr + (cmd.npages << PAGE_SHIFT)))
		return -EINVAL;

	cmd.cpages = 0;
	cmd.faults = 0;

	switch (command) {
	case HMM_DMIRROR_READ:
		ret = dmirror_read(dmirror, &cmd);
		break;

	case HMM_DMIRROR_WRITE:
		ret = dmirror_write(dmirror, &cmd);
		break;

	case HMM_DMIRROR_MIGRATE:
		ret = dmirror_migrate(dmirror, &cmd);
		break;

	case HMM_DMIRROR_EXCLUSIVE:
		ret = dmirror_exclusive(dmirror, &cmd);
		break;

	case HMM_DMIRROR_CHECK_EXCLUSIVE:
		ret = dmirror_check_atomic(dmirror, cmd.addr,
					cmd.addr + (cmd.npages << PAGE_SHIFT));
		break;

	case HMM_DMIRROR_SNAPSHOT:
		ret = dmirror_snapshot(dmirror, &cmd);
		break;

	default:
		return -EINVAL;
	}
	if (ret)
		return ret;

	if (copy_to_user(uarg, &cmd, sizeof(cmd)))
		return -EFAULT;

	return 0;
}

static int dmirror_fops_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
		struct page *page;
		int ret;

		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!page)
			return -ENOMEM;

		ret = vm_insert_page(vma, addr, page);
		if (ret) {
			__free_page(page);
			return ret;
		}
		put_page(page);
	}

	return 0;
}

static const struct file_operations dmirror_fops = {
	.open = dmirror_fops_open,
	.release = dmirror_fops_release,
	.mmap = dmirror_fops_mmap,
	.unlocked_ioctl = dmirror_fops_unlocked_ioctl,
	.llseek = default_llseek,
	.owner = THIS_MODULE,
};

static void dmirror_devmem_free(struct page *page)
{
	struct page *rpage = page->zone_device_data;
	struct dmirror_device *mdevice;

	if (rpage)
		__free_page(rpage);

	mdevice = dmirror_page_to_device(page);

	spin_lock(&mdevice->lock);
	mdevice->cfree++;
	page->zone_device_data = mdevice->free_pages;
	mdevice->free_pages = page;
	spin_unlock(&mdevice->lock);
}

static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
						      struct dmirror *dmirror)
{
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long start = args->start;
	unsigned long end = args->end;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE,
				       src++, dst++) {
		struct page *dpage, *spage;

		spage = migrate_pfn_to_page(*src);
		if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
			continue;
		spage = spage->zone_device_data;

		dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
		if (!dpage)
			continue;

		lock_page(dpage);
		xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
		copy_highpage(dpage, spage);
		*dst = migrate_pfn(page_to_pfn(dpage));
		if (*src & MIGRATE_PFN_WRITE)
			*dst |= MIGRATE_PFN_WRITE;
	}
	return 0;
}

static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
{
	struct migrate_vma args;
	unsigned long src_pfns;
	unsigned long dst_pfns;
	struct page *rpage;
	struct dmirror *dmirror;
	vm_fault_t ret;

	/*
	 * Normally, a device would use the page->zone_device_data to point to
	 * the mirror but here we use it to hold the page for the simulated
	 * device memory and that page holds the pointer to the mirror.
	 */
	rpage = vmf->page->zone_device_data;
	dmirror = rpage->zone_device_data;

	/* FIXME demonstrate how we can adjust migrate range */
	args.vma = vmf->vma;
	args.start = vmf->address;
	args.end = args.start + PAGE_SIZE;
	args.src = &src_pfns;
	args.dst = &dst_pfns;
	args.pgmap_owner = dmirror->mdevice;
	args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;

	if (migrate_vma_setup(&args))
		return VM_FAULT_SIGBUS;

	ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
	if (ret)
		return ret;
	migrate_vma_pages(&args);
	/*
	 * No device finalize step is needed since
	 * dmirror_devmem_fault_alloc_and_copy() will have already
	 * invalidated the device page table.
	 */
	migrate_vma_finalize(&args);
	return 0;
}

static const struct dev_pagemap_ops dmirror_devmem_ops = {
	.page_free = dmirror_devmem_free,
	.migrate_to_ram = dmirror_devmem_fault,
};

static int dmirror_device_init(struct dmirror_device *mdevice, int id)
{
	dev_t dev;
	int ret;

	dev = MKDEV(MAJOR(dmirror_dev), id);
	mutex_init(&mdevice->devmem_lock);
	spin_lock_init(&mdevice->lock);

	cdev_init(&mdevice->cdevice, &dmirror_fops);
	mdevice->cdevice.owner = THIS_MODULE;
	ret = cdev_add(&mdevice->cdevice, dev, 1);
	if (ret)
		return ret;

	/* Build a list of free ZONE_DEVICE private struct pages */
	dmirror_allocate_chunk(mdevice, NULL);

	return 0;
}

static void dmirror_device_remove(struct dmirror_device *mdevice)
{
	unsigned int i;

	if (mdevice->devmem_chunks) {
		for (i = 0; i < mdevice->devmem_count; i++) {
			struct dmirror_chunk *devmem =
				mdevice->devmem_chunks[i];

			memunmap_pages(&devmem->pagemap);
			release_mem_region(devmem->pagemap.range.start,
					   range_len(&devmem->pagemap.range));
			kfree(devmem);
		}
		kfree(mdevice->devmem_chunks);
	}

	cdev_del(&mdevice->cdevice);
}

static int __init hmm_dmirror_init(void)
{
	int ret;
	int id;

	ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
				  "HMM_DMIRROR");
	if (ret)
		goto err_unreg;

	for (id = 0; id < DMIRROR_NDEVICES; id++) {
		ret = dmirror_device_init(dmirror_devices + id, id);
		if (ret)
			goto err_chrdev;
	}

	pr_info("HMM test module loaded. This is only for testing HMM.\n");
	return 0;

err_chrdev:
	while (--id >= 0)
		dmirror_device_remove(dmirror_devices + id);
	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
err_unreg:
	return ret;
}

static void __exit hmm_dmirror_exit(void)
{
	int id;

	for (id = 0; id < DMIRROR_NDEVICES; id++)
		dmirror_device_remove(dmirror_devices + id);
	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
}

module_init(hmm_dmirror_init);
module_exit(hmm_dmirror_exit);
MODULE_LICENSE("GPL");
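
/*
 * Illustrative userspace usage (not part of the module): a test program
 * drives this driver through the ioctl interface declared in
 * test_hmm_uapi.h, using the struct hmm_dmirror_cmd fields (addr, ptr,
 * npages, cpages, faults) handled by dmirror_fops_unlocked_ioctl() above.
 * This sketch assumes a character device node such as /dev/hmm_dmirror0
 * has been created for one of the registered minors; node creation is left
 * to the test setup and is not done by this module.  some_page_aligned_va
 * stands for any mapped, page-aligned address in the calling process.
 *
 *	struct hmm_dmirror_cmd cmd = { 0 };
 *	long pagesize = sysconf(_SC_PAGESIZE);
 *	char *buf = aligned_alloc(pagesize, pagesize);
 *	int fd = open("/dev/hmm_dmirror0", O_RDWR);
 *
 *	cmd.addr = (uintptr_t)some_page_aligned_va;	// VA to mirror
 *	cmd.ptr = (uintptr_t)buf;			// result buffer
 *	cmd.npages = 1;
 *	ioctl(fd, HMM_DMIRROR_READ, &cmd);	// faults and mirrors the page,
 *						// then copies its contents to buf
 *
 * On return, cmd.cpages reports how many pages were mirrored and cmd.faults
 * how many device faults were needed.  HMM_DMIRROR_WRITE, HMM_DMIRROR_MIGRATE
 * and HMM_DMIRROR_SNAPSHOT take the same structure.
 */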