1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * This is a module to test the HMM (Heterogeneous Memory Management) 4 * mirror and zone device private memory migration APIs of the kernel. 5 * Userspace programs can register with the driver to mirror their own address 6 * space and can use the device to read/write any valid virtual address. 7 */ 8 #include <linux/init.h> 9 #include <linux/fs.h> 10 #include <linux/mm.h> 11 #include <linux/module.h> 12 #include <linux/kernel.h> 13 #include <linux/cdev.h> 14 #include <linux/device.h> 15 #include <linux/memremap.h> 16 #include <linux/mutex.h> 17 #include <linux/rwsem.h> 18 #include <linux/sched.h> 19 #include <linux/slab.h> 20 #include <linux/highmem.h> 21 #include <linux/delay.h> 22 #include <linux/pagemap.h> 23 #include <linux/hmm.h> 24 #include <linux/vmalloc.h> 25 #include <linux/swap.h> 26 #include <linux/swapops.h> 27 #include <linux/sched/mm.h> 28 #include <linux/platform_device.h> 29 #include <linux/rmap.h> 30 #include <linux/mmu_notifier.h> 31 #include <linux/migrate.h> 32 33 #include "test_hmm_uapi.h" 34 35 #define DMIRROR_NDEVICES 2 36 #define DMIRROR_RANGE_FAULT_TIMEOUT 1000 37 #define DEVMEM_CHUNK_SIZE (256 * 1024 * 1024U) 38 #define DEVMEM_CHUNKS_RESERVE 16 39 40 static const struct dev_pagemap_ops dmirror_devmem_ops; 41 static const struct mmu_interval_notifier_ops dmirror_min_ops; 42 static dev_t dmirror_dev; 43 44 struct dmirror_device; 45 46 struct dmirror_bounce { 47 void *ptr; 48 unsigned long size; 49 unsigned long addr; 50 unsigned long cpages; 51 }; 52 53 #define DPT_XA_TAG_ATOMIC 1UL 54 #define DPT_XA_TAG_WRITE 3UL 55 56 /* 57 * Data structure to track address ranges and register for mmu interval 58 * notifier updates. 59 */ 60 struct dmirror_interval { 61 struct mmu_interval_notifier notifier; 62 struct dmirror *dmirror; 63 }; 64 65 /* 66 * Data attached to the open device file. 67 * Note that it might be shared after a fork(). 68 */ 69 struct dmirror { 70 struct dmirror_device *mdevice; 71 struct xarray pt; 72 struct mmu_interval_notifier notifier; 73 struct mutex mutex; 74 }; 75 76 /* 77 * ZONE_DEVICE pages for migration and simulating device memory. 78 */ 79 struct dmirror_chunk { 80 struct dev_pagemap pagemap; 81 struct dmirror_device *mdevice; 82 }; 83 84 /* 85 * Per device data. 86 */ 87 struct dmirror_device { 88 struct cdev cdevice; 89 struct hmm_devmem *devmem; 90 91 unsigned int devmem_capacity; 92 unsigned int devmem_count; 93 struct dmirror_chunk **devmem_chunks; 94 struct mutex devmem_lock; /* protects the above */ 95 96 unsigned long calloc; 97 unsigned long cfree; 98 struct page *free_pages; 99 spinlock_t lock; /* protects the above */ 100 }; 101 102 static struct dmirror_device dmirror_devices[DMIRROR_NDEVICES]; 103 104 static int dmirror_bounce_init(struct dmirror_bounce *bounce, 105 unsigned long addr, 106 unsigned long size) 107 { 108 bounce->addr = addr; 109 bounce->size = size; 110 bounce->cpages = 0; 111 bounce->ptr = vmalloc(size); 112 if (!bounce->ptr) 113 return -ENOMEM; 114 return 0; 115 } 116 117 static void dmirror_bounce_fini(struct dmirror_bounce *bounce) 118 { 119 vfree(bounce->ptr); 120 } 121 122 static int dmirror_fops_open(struct inode *inode, struct file *filp) 123 { 124 struct cdev *cdev = inode->i_cdev; 125 struct dmirror *dmirror; 126 int ret; 127 128 /* Mirror this process address space */ 129 dmirror = kzalloc(sizeof(*dmirror), GFP_KERNEL); 130 if (dmirror == NULL) 131 return -ENOMEM; 132 133 dmirror->mdevice = container_of(cdev, struct dmirror_device, cdevice); 134 mutex_init(&dmirror->mutex); 135 xa_init(&dmirror->pt); 136 137 ret = mmu_interval_notifier_insert(&dmirror->notifier, current->mm, 138 0, ULONG_MAX & PAGE_MASK, &dmirror_min_ops); 139 if (ret) { 140 kfree(dmirror); 141 return ret; 142 } 143 144 filp->private_data = dmirror; 145 return 0; 146 } 147 148 static int dmirror_fops_release(struct inode *inode, struct file *filp) 149 { 150 struct dmirror *dmirror = filp->private_data; 151 152 mmu_interval_notifier_remove(&dmirror->notifier); 153 xa_destroy(&dmirror->pt); 154 kfree(dmirror); 155 return 0; 156 } 157 158 static struct dmirror_device *dmirror_page_to_device(struct page *page) 159 160 { 161 return container_of(page->pgmap, struct dmirror_chunk, 162 pagemap)->mdevice; 163 } 164 165 static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range) 166 { 167 unsigned long *pfns = range->hmm_pfns; 168 unsigned long pfn; 169 170 for (pfn = (range->start >> PAGE_SHIFT); 171 pfn < (range->end >> PAGE_SHIFT); 172 pfn++, pfns++) { 173 struct page *page; 174 void *entry; 175 176 /* 177 * Since we asked for hmm_range_fault() to populate pages, 178 * it shouldn't return an error entry on success. 179 */ 180 WARN_ON(*pfns & HMM_PFN_ERROR); 181 WARN_ON(!(*pfns & HMM_PFN_VALID)); 182 183 page = hmm_pfn_to_page(*pfns); 184 WARN_ON(!page); 185 186 entry = page; 187 if (*pfns & HMM_PFN_WRITE) 188 entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE); 189 else if (WARN_ON(range->default_flags & HMM_PFN_WRITE)) 190 return -EFAULT; 191 entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC); 192 if (xa_is_err(entry)) 193 return xa_err(entry); 194 } 195 196 return 0; 197 } 198 199 static void dmirror_do_update(struct dmirror *dmirror, unsigned long start, 200 unsigned long end) 201 { 202 unsigned long pfn; 203 void *entry; 204 205 /* 206 * The XArray doesn't hold references to pages since it relies on 207 * the mmu notifier to clear page pointers when they become stale. 208 * Therefore, it is OK to just clear the entry. 209 */ 210 xa_for_each_range(&dmirror->pt, pfn, entry, start >> PAGE_SHIFT, 211 end >> PAGE_SHIFT) 212 xa_erase(&dmirror->pt, pfn); 213 } 214 215 static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni, 216 const struct mmu_notifier_range *range, 217 unsigned long cur_seq) 218 { 219 struct dmirror *dmirror = container_of(mni, struct dmirror, notifier); 220 221 /* 222 * Ignore invalidation callbacks for device private pages since 223 * the invalidation is handled as part of the migration process. 224 */ 225 if (range->event == MMU_NOTIFY_MIGRATE && 226 range->owner == dmirror->mdevice) 227 return true; 228 229 if (mmu_notifier_range_blockable(range)) 230 mutex_lock(&dmirror->mutex); 231 else if (!mutex_trylock(&dmirror->mutex)) 232 return false; 233 234 mmu_interval_set_seq(mni, cur_seq); 235 dmirror_do_update(dmirror, range->start, range->end); 236 237 mutex_unlock(&dmirror->mutex); 238 return true; 239 } 240 241 static const struct mmu_interval_notifier_ops dmirror_min_ops = { 242 .invalidate = dmirror_interval_invalidate, 243 }; 244 245 static int dmirror_range_fault(struct dmirror *dmirror, 246 struct hmm_range *range) 247 { 248 struct mm_struct *mm = dmirror->notifier.mm; 249 unsigned long timeout = 250 jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); 251 int ret; 252 253 while (true) { 254 if (time_after(jiffies, timeout)) { 255 ret = -EBUSY; 256 goto out; 257 } 258 259 range->notifier_seq = mmu_interval_read_begin(range->notifier); 260 mmap_read_lock(mm); 261 ret = hmm_range_fault(range); 262 mmap_read_unlock(mm); 263 if (ret) { 264 if (ret == -EBUSY) 265 continue; 266 goto out; 267 } 268 269 mutex_lock(&dmirror->mutex); 270 if (mmu_interval_read_retry(range->notifier, 271 range->notifier_seq)) { 272 mutex_unlock(&dmirror->mutex); 273 continue; 274 } 275 break; 276 } 277 278 ret = dmirror_do_fault(dmirror, range); 279 280 mutex_unlock(&dmirror->mutex); 281 out: 282 return ret; 283 } 284 285 static int dmirror_fault(struct dmirror *dmirror, unsigned long start, 286 unsigned long end, bool write) 287 { 288 struct mm_struct *mm = dmirror->notifier.mm; 289 unsigned long addr; 290 unsigned long pfns[64]; 291 struct hmm_range range = { 292 .notifier = &dmirror->notifier, 293 .hmm_pfns = pfns, 294 .pfn_flags_mask = 0, 295 .default_flags = 296 HMM_PFN_REQ_FAULT | (write ? HMM_PFN_REQ_WRITE : 0), 297 .dev_private_owner = dmirror->mdevice, 298 }; 299 int ret = 0; 300 301 /* Since the mm is for the mirrored process, get a reference first. */ 302 if (!mmget_not_zero(mm)) 303 return 0; 304 305 for (addr = start; addr < end; addr = range.end) { 306 range.start = addr; 307 range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end); 308 309 ret = dmirror_range_fault(dmirror, &range); 310 if (ret) 311 break; 312 } 313 314 mmput(mm); 315 return ret; 316 } 317 318 static int dmirror_do_read(struct dmirror *dmirror, unsigned long start, 319 unsigned long end, struct dmirror_bounce *bounce) 320 { 321 unsigned long pfn; 322 void *ptr; 323 324 ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK); 325 326 for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) { 327 void *entry; 328 struct page *page; 329 void *tmp; 330 331 entry = xa_load(&dmirror->pt, pfn); 332 page = xa_untag_pointer(entry); 333 if (!page) 334 return -ENOENT; 335 336 tmp = kmap(page); 337 memcpy(ptr, tmp, PAGE_SIZE); 338 kunmap(page); 339 340 ptr += PAGE_SIZE; 341 bounce->cpages++; 342 } 343 344 return 0; 345 } 346 347 static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd) 348 { 349 struct dmirror_bounce bounce; 350 unsigned long start, end; 351 unsigned long size = cmd->npages << PAGE_SHIFT; 352 int ret; 353 354 start = cmd->addr; 355 end = start + size; 356 if (end < start) 357 return -EINVAL; 358 359 ret = dmirror_bounce_init(&bounce, start, size); 360 if (ret) 361 return ret; 362 363 while (1) { 364 mutex_lock(&dmirror->mutex); 365 ret = dmirror_do_read(dmirror, start, end, &bounce); 366 mutex_unlock(&dmirror->mutex); 367 if (ret != -ENOENT) 368 break; 369 370 start = cmd->addr + (bounce.cpages << PAGE_SHIFT); 371 ret = dmirror_fault(dmirror, start, end, false); 372 if (ret) 373 break; 374 cmd->faults++; 375 } 376 377 if (ret == 0) { 378 if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr, 379 bounce.size)) 380 ret = -EFAULT; 381 } 382 cmd->cpages = bounce.cpages; 383 dmirror_bounce_fini(&bounce); 384 return ret; 385 } 386 387 static int dmirror_do_write(struct dmirror *dmirror, unsigned long start, 388 unsigned long end, struct dmirror_bounce *bounce) 389 { 390 unsigned long pfn; 391 void *ptr; 392 393 ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK); 394 395 for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) { 396 void *entry; 397 struct page *page; 398 void *tmp; 399 400 entry = xa_load(&dmirror->pt, pfn); 401 page = xa_untag_pointer(entry); 402 if (!page || xa_pointer_tag(entry) != DPT_XA_TAG_WRITE) 403 return -ENOENT; 404 405 tmp = kmap(page); 406 memcpy(tmp, ptr, PAGE_SIZE); 407 kunmap(page); 408 409 ptr += PAGE_SIZE; 410 bounce->cpages++; 411 } 412 413 return 0; 414 } 415 416 static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd) 417 { 418 struct dmirror_bounce bounce; 419 unsigned long start, end; 420 unsigned long size = cmd->npages << PAGE_SHIFT; 421 int ret; 422 423 start = cmd->addr; 424 end = start + size; 425 if (end < start) 426 return -EINVAL; 427 428 ret = dmirror_bounce_init(&bounce, start, size); 429 if (ret) 430 return ret; 431 if (copy_from_user(bounce.ptr, u64_to_user_ptr(cmd->ptr), 432 bounce.size)) { 433 ret = -EFAULT; 434 goto fini; 435 } 436 437 while (1) { 438 mutex_lock(&dmirror->mutex); 439 ret = dmirror_do_write(dmirror, start, end, &bounce); 440 mutex_unlock(&dmirror->mutex); 441 if (ret != -ENOENT) 442 break; 443 444 start = cmd->addr + (bounce.cpages << PAGE_SHIFT); 445 ret = dmirror_fault(dmirror, start, end, true); 446 if (ret) 447 break; 448 cmd->faults++; 449 } 450 451 fini: 452 cmd->cpages = bounce.cpages; 453 dmirror_bounce_fini(&bounce); 454 return ret; 455 } 456 457 static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, 458 struct page **ppage) 459 { 460 struct dmirror_chunk *devmem; 461 struct resource *res; 462 unsigned long pfn; 463 unsigned long pfn_first; 464 unsigned long pfn_last; 465 void *ptr; 466 467 devmem = kzalloc(sizeof(*devmem), GFP_KERNEL); 468 if (!devmem) 469 return false; 470 471 res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE, 472 "hmm_dmirror"); 473 if (IS_ERR(res)) 474 goto err_devmem; 475 476 devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; 477 devmem->pagemap.range.start = res->start; 478 devmem->pagemap.range.end = res->end; 479 devmem->pagemap.nr_range = 1; 480 devmem->pagemap.ops = &dmirror_devmem_ops; 481 devmem->pagemap.owner = mdevice; 482 483 mutex_lock(&mdevice->devmem_lock); 484 485 if (mdevice->devmem_count == mdevice->devmem_capacity) { 486 struct dmirror_chunk **new_chunks; 487 unsigned int new_capacity; 488 489 new_capacity = mdevice->devmem_capacity + 490 DEVMEM_CHUNKS_RESERVE; 491 new_chunks = krealloc(mdevice->devmem_chunks, 492 sizeof(new_chunks[0]) * new_capacity, 493 GFP_KERNEL); 494 if (!new_chunks) 495 goto err_release; 496 mdevice->devmem_capacity = new_capacity; 497 mdevice->devmem_chunks = new_chunks; 498 } 499 500 ptr = memremap_pages(&devmem->pagemap, numa_node_id()); 501 if (IS_ERR(ptr)) 502 goto err_release; 503 504 devmem->mdevice = mdevice; 505 pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT; 506 pfn_last = pfn_first + (range_len(&devmem->pagemap.range) >> PAGE_SHIFT); 507 mdevice->devmem_chunks[mdevice->devmem_count++] = devmem; 508 509 mutex_unlock(&mdevice->devmem_lock); 510 511 pr_info("added new %u MB chunk (total %u chunks, %u MB) PFNs [0x%lx 0x%lx)\n", 512 DEVMEM_CHUNK_SIZE / (1024 * 1024), 513 mdevice->devmem_count, 514 mdevice->devmem_count * (DEVMEM_CHUNK_SIZE / (1024 * 1024)), 515 pfn_first, pfn_last); 516 517 spin_lock(&mdevice->lock); 518 for (pfn = pfn_first; pfn < pfn_last; pfn++) { 519 struct page *page = pfn_to_page(pfn); 520 521 page->zone_device_data = mdevice->free_pages; 522 mdevice->free_pages = page; 523 } 524 if (ppage) { 525 *ppage = mdevice->free_pages; 526 mdevice->free_pages = (*ppage)->zone_device_data; 527 mdevice->calloc++; 528 } 529 spin_unlock(&mdevice->lock); 530 531 return true; 532 533 err_release: 534 mutex_unlock(&mdevice->devmem_lock); 535 release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range)); 536 err_devmem: 537 kfree(devmem); 538 539 return false; 540 } 541 542 static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) 543 { 544 struct page *dpage = NULL; 545 struct page *rpage; 546 547 /* 548 * This is a fake device so we alloc real system memory to store 549 * our device memory. 550 */ 551 rpage = alloc_page(GFP_HIGHUSER); 552 if (!rpage) 553 return NULL; 554 555 spin_lock(&mdevice->lock); 556 557 if (mdevice->free_pages) { 558 dpage = mdevice->free_pages; 559 mdevice->free_pages = dpage->zone_device_data; 560 mdevice->calloc++; 561 spin_unlock(&mdevice->lock); 562 } else { 563 spin_unlock(&mdevice->lock); 564 if (!dmirror_allocate_chunk(mdevice, &dpage)) 565 goto error; 566 } 567 568 dpage->zone_device_data = rpage; 569 get_page(dpage); 570 lock_page(dpage); 571 return dpage; 572 573 error: 574 __free_page(rpage); 575 return NULL; 576 } 577 578 static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, 579 struct dmirror *dmirror) 580 { 581 struct dmirror_device *mdevice = dmirror->mdevice; 582 const unsigned long *src = args->src; 583 unsigned long *dst = args->dst; 584 unsigned long addr; 585 586 for (addr = args->start; addr < args->end; addr += PAGE_SIZE, 587 src++, dst++) { 588 struct page *spage; 589 struct page *dpage; 590 struct page *rpage; 591 592 if (!(*src & MIGRATE_PFN_MIGRATE)) 593 continue; 594 595 /* 596 * Note that spage might be NULL which is OK since it is an 597 * unallocated pte_none() or read-only zero page. 598 */ 599 spage = migrate_pfn_to_page(*src); 600 601 dpage = dmirror_devmem_alloc_page(mdevice); 602 if (!dpage) 603 continue; 604 605 rpage = dpage->zone_device_data; 606 if (spage) 607 copy_highpage(rpage, spage); 608 else 609 clear_highpage(rpage); 610 611 /* 612 * Normally, a device would use the page->zone_device_data to 613 * point to the mirror but here we use it to hold the page for 614 * the simulated device memory and that page holds the pointer 615 * to the mirror. 616 */ 617 rpage->zone_device_data = dmirror; 618 619 *dst = migrate_pfn(page_to_pfn(dpage)); 620 if ((*src & MIGRATE_PFN_WRITE) || 621 (!spage && args->vma->vm_flags & VM_WRITE)) 622 *dst |= MIGRATE_PFN_WRITE; 623 } 624 } 625 626 static int dmirror_check_atomic(struct dmirror *dmirror, unsigned long start, 627 unsigned long end) 628 { 629 unsigned long pfn; 630 631 for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) { 632 void *entry; 633 634 entry = xa_load(&dmirror->pt, pfn); 635 if (xa_pointer_tag(entry) == DPT_XA_TAG_ATOMIC) 636 return -EPERM; 637 } 638 639 return 0; 640 } 641 642 static int dmirror_atomic_map(unsigned long start, unsigned long end, 643 struct page **pages, struct dmirror *dmirror) 644 { 645 unsigned long pfn, mapped = 0; 646 int i; 647 648 /* Map the migrated pages into the device's page tables. */ 649 mutex_lock(&dmirror->mutex); 650 651 for (i = 0, pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++, i++) { 652 void *entry; 653 654 if (!pages[i]) 655 continue; 656 657 entry = pages[i]; 658 entry = xa_tag_pointer(entry, DPT_XA_TAG_ATOMIC); 659 entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC); 660 if (xa_is_err(entry)) { 661 mutex_unlock(&dmirror->mutex); 662 return xa_err(entry); 663 } 664 665 mapped++; 666 } 667 668 mutex_unlock(&dmirror->mutex); 669 return mapped; 670 } 671 672 static int dmirror_migrate_finalize_and_map(struct migrate_vma *args, 673 struct dmirror *dmirror) 674 { 675 unsigned long start = args->start; 676 unsigned long end = args->end; 677 const unsigned long *src = args->src; 678 const unsigned long *dst = args->dst; 679 unsigned long pfn; 680 681 /* Map the migrated pages into the device's page tables. */ 682 mutex_lock(&dmirror->mutex); 683 684 for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++, 685 src++, dst++) { 686 struct page *dpage; 687 void *entry; 688 689 if (!(*src & MIGRATE_PFN_MIGRATE)) 690 continue; 691 692 dpage = migrate_pfn_to_page(*dst); 693 if (!dpage) 694 continue; 695 696 /* 697 * Store the page that holds the data so the page table 698 * doesn't have to deal with ZONE_DEVICE private pages. 699 */ 700 entry = dpage->zone_device_data; 701 if (*dst & MIGRATE_PFN_WRITE) 702 entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE); 703 entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC); 704 if (xa_is_err(entry)) { 705 mutex_unlock(&dmirror->mutex); 706 return xa_err(entry); 707 } 708 } 709 710 mutex_unlock(&dmirror->mutex); 711 return 0; 712 } 713 714 static int dmirror_exclusive(struct dmirror *dmirror, 715 struct hmm_dmirror_cmd *cmd) 716 { 717 unsigned long start, end, addr; 718 unsigned long size = cmd->npages << PAGE_SHIFT; 719 struct mm_struct *mm = dmirror->notifier.mm; 720 struct page *pages[64]; 721 struct dmirror_bounce bounce; 722 unsigned long next; 723 int ret; 724 725 start = cmd->addr; 726 end = start + size; 727 if (end < start) 728 return -EINVAL; 729 730 /* Since the mm is for the mirrored process, get a reference first. */ 731 if (!mmget_not_zero(mm)) 732 return -EINVAL; 733 734 mmap_read_lock(mm); 735 for (addr = start; addr < end; addr = next) { 736 unsigned long mapped; 737 int i; 738 739 if (end < addr + (ARRAY_SIZE(pages) << PAGE_SHIFT)) 740 next = end; 741 else 742 next = addr + (ARRAY_SIZE(pages) << PAGE_SHIFT); 743 744 ret = make_device_exclusive_range(mm, addr, next, pages, NULL); 745 mapped = dmirror_atomic_map(addr, next, pages, dmirror); 746 for (i = 0; i < ret; i++) { 747 if (pages[i]) { 748 unlock_page(pages[i]); 749 put_page(pages[i]); 750 } 751 } 752 753 if (addr + (mapped << PAGE_SHIFT) < next) { 754 mmap_read_unlock(mm); 755 mmput(mm); 756 return -EBUSY; 757 } 758 } 759 mmap_read_unlock(mm); 760 mmput(mm); 761 762 /* Return the migrated data for verification. */ 763 ret = dmirror_bounce_init(&bounce, start, size); 764 if (ret) 765 return ret; 766 mutex_lock(&dmirror->mutex); 767 ret = dmirror_do_read(dmirror, start, end, &bounce); 768 mutex_unlock(&dmirror->mutex); 769 if (ret == 0) { 770 if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr, 771 bounce.size)) 772 ret = -EFAULT; 773 } 774 775 cmd->cpages = bounce.cpages; 776 dmirror_bounce_fini(&bounce); 777 return ret; 778 } 779 780 static int dmirror_migrate(struct dmirror *dmirror, 781 struct hmm_dmirror_cmd *cmd) 782 { 783 unsigned long start, end, addr; 784 unsigned long size = cmd->npages << PAGE_SHIFT; 785 struct mm_struct *mm = dmirror->notifier.mm; 786 struct vm_area_struct *vma; 787 unsigned long src_pfns[64]; 788 unsigned long dst_pfns[64]; 789 struct dmirror_bounce bounce; 790 struct migrate_vma args; 791 unsigned long next; 792 int ret; 793 794 start = cmd->addr; 795 end = start + size; 796 if (end < start) 797 return -EINVAL; 798 799 /* Since the mm is for the mirrored process, get a reference first. */ 800 if (!mmget_not_zero(mm)) 801 return -EINVAL; 802 803 mmap_read_lock(mm); 804 for (addr = start; addr < end; addr = next) { 805 vma = vma_lookup(mm, addr); 806 if (!vma || !(vma->vm_flags & VM_READ)) { 807 ret = -EINVAL; 808 goto out; 809 } 810 next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT)); 811 if (next > vma->vm_end) 812 next = vma->vm_end; 813 814 args.vma = vma; 815 args.src = src_pfns; 816 args.dst = dst_pfns; 817 args.start = addr; 818 args.end = next; 819 args.pgmap_owner = dmirror->mdevice; 820 args.flags = MIGRATE_VMA_SELECT_SYSTEM; 821 ret = migrate_vma_setup(&args); 822 if (ret) 823 goto out; 824 825 dmirror_migrate_alloc_and_copy(&args, dmirror); 826 migrate_vma_pages(&args); 827 dmirror_migrate_finalize_and_map(&args, dmirror); 828 migrate_vma_finalize(&args); 829 } 830 mmap_read_unlock(mm); 831 mmput(mm); 832 833 /* Return the migrated data for verification. */ 834 ret = dmirror_bounce_init(&bounce, start, size); 835 if (ret) 836 return ret; 837 mutex_lock(&dmirror->mutex); 838 ret = dmirror_do_read(dmirror, start, end, &bounce); 839 mutex_unlock(&dmirror->mutex); 840 if (ret == 0) { 841 if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr, 842 bounce.size)) 843 ret = -EFAULT; 844 } 845 cmd->cpages = bounce.cpages; 846 dmirror_bounce_fini(&bounce); 847 return ret; 848 849 out: 850 mmap_read_unlock(mm); 851 mmput(mm); 852 return ret; 853 } 854 855 static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range, 856 unsigned char *perm, unsigned long entry) 857 { 858 struct page *page; 859 860 if (entry & HMM_PFN_ERROR) { 861 *perm = HMM_DMIRROR_PROT_ERROR; 862 return; 863 } 864 if (!(entry & HMM_PFN_VALID)) { 865 *perm = HMM_DMIRROR_PROT_NONE; 866 return; 867 } 868 869 page = hmm_pfn_to_page(entry); 870 if (is_device_private_page(page)) { 871 /* Is the page migrated to this device or some other? */ 872 if (dmirror->mdevice == dmirror_page_to_device(page)) 873 *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL; 874 else 875 *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE; 876 } else if (is_zero_pfn(page_to_pfn(page))) 877 *perm = HMM_DMIRROR_PROT_ZERO; 878 else 879 *perm = HMM_DMIRROR_PROT_NONE; 880 if (entry & HMM_PFN_WRITE) 881 *perm |= HMM_DMIRROR_PROT_WRITE; 882 else 883 *perm |= HMM_DMIRROR_PROT_READ; 884 if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PMD_SHIFT) 885 *perm |= HMM_DMIRROR_PROT_PMD; 886 else if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PUD_SHIFT) 887 *perm |= HMM_DMIRROR_PROT_PUD; 888 } 889 890 static bool dmirror_snapshot_invalidate(struct mmu_interval_notifier *mni, 891 const struct mmu_notifier_range *range, 892 unsigned long cur_seq) 893 { 894 struct dmirror_interval *dmi = 895 container_of(mni, struct dmirror_interval, notifier); 896 struct dmirror *dmirror = dmi->dmirror; 897 898 if (mmu_notifier_range_blockable(range)) 899 mutex_lock(&dmirror->mutex); 900 else if (!mutex_trylock(&dmirror->mutex)) 901 return false; 902 903 /* 904 * Snapshots only need to set the sequence number since any 905 * invalidation in the interval invalidates the whole snapshot. 906 */ 907 mmu_interval_set_seq(mni, cur_seq); 908 909 mutex_unlock(&dmirror->mutex); 910 return true; 911 } 912 913 static const struct mmu_interval_notifier_ops dmirror_mrn_ops = { 914 .invalidate = dmirror_snapshot_invalidate, 915 }; 916 917 static int dmirror_range_snapshot(struct dmirror *dmirror, 918 struct hmm_range *range, 919 unsigned char *perm) 920 { 921 struct mm_struct *mm = dmirror->notifier.mm; 922 struct dmirror_interval notifier; 923 unsigned long timeout = 924 jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); 925 unsigned long i; 926 unsigned long n; 927 int ret = 0; 928 929 notifier.dmirror = dmirror; 930 range->notifier = ¬ifier.notifier; 931 932 ret = mmu_interval_notifier_insert(range->notifier, mm, 933 range->start, range->end - range->start, 934 &dmirror_mrn_ops); 935 if (ret) 936 return ret; 937 938 while (true) { 939 if (time_after(jiffies, timeout)) { 940 ret = -EBUSY; 941 goto out; 942 } 943 944 range->notifier_seq = mmu_interval_read_begin(range->notifier); 945 946 mmap_read_lock(mm); 947 ret = hmm_range_fault(range); 948 mmap_read_unlock(mm); 949 if (ret) { 950 if (ret == -EBUSY) 951 continue; 952 goto out; 953 } 954 955 mutex_lock(&dmirror->mutex); 956 if (mmu_interval_read_retry(range->notifier, 957 range->notifier_seq)) { 958 mutex_unlock(&dmirror->mutex); 959 continue; 960 } 961 break; 962 } 963 964 n = (range->end - range->start) >> PAGE_SHIFT; 965 for (i = 0; i < n; i++) 966 dmirror_mkentry(dmirror, range, perm + i, range->hmm_pfns[i]); 967 968 mutex_unlock(&dmirror->mutex); 969 out: 970 mmu_interval_notifier_remove(range->notifier); 971 return ret; 972 } 973 974 static int dmirror_snapshot(struct dmirror *dmirror, 975 struct hmm_dmirror_cmd *cmd) 976 { 977 struct mm_struct *mm = dmirror->notifier.mm; 978 unsigned long start, end; 979 unsigned long size = cmd->npages << PAGE_SHIFT; 980 unsigned long addr; 981 unsigned long next; 982 unsigned long pfns[64]; 983 unsigned char perm[64]; 984 char __user *uptr; 985 struct hmm_range range = { 986 .hmm_pfns = pfns, 987 .dev_private_owner = dmirror->mdevice, 988 }; 989 int ret = 0; 990 991 start = cmd->addr; 992 end = start + size; 993 if (end < start) 994 return -EINVAL; 995 996 /* Since the mm is for the mirrored process, get a reference first. */ 997 if (!mmget_not_zero(mm)) 998 return -EINVAL; 999 1000 /* 1001 * Register a temporary notifier to detect invalidations even if it 1002 * overlaps with other mmu_interval_notifiers. 1003 */ 1004 uptr = u64_to_user_ptr(cmd->ptr); 1005 for (addr = start; addr < end; addr = next) { 1006 unsigned long n; 1007 1008 next = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end); 1009 range.start = addr; 1010 range.end = next; 1011 1012 ret = dmirror_range_snapshot(dmirror, &range, perm); 1013 if (ret) 1014 break; 1015 1016 n = (range.end - range.start) >> PAGE_SHIFT; 1017 if (copy_to_user(uptr, perm, n)) { 1018 ret = -EFAULT; 1019 break; 1020 } 1021 1022 cmd->cpages += n; 1023 uptr += n; 1024 } 1025 mmput(mm); 1026 1027 return ret; 1028 } 1029 1030 static long dmirror_fops_unlocked_ioctl(struct file *filp, 1031 unsigned int command, 1032 unsigned long arg) 1033 { 1034 void __user *uarg = (void __user *)arg; 1035 struct hmm_dmirror_cmd cmd; 1036 struct dmirror *dmirror; 1037 int ret; 1038 1039 dmirror = filp->private_data; 1040 if (!dmirror) 1041 return -EINVAL; 1042 1043 if (copy_from_user(&cmd, uarg, sizeof(cmd))) 1044 return -EFAULT; 1045 1046 if (cmd.addr & ~PAGE_MASK) 1047 return -EINVAL; 1048 if (cmd.addr >= (cmd.addr + (cmd.npages << PAGE_SHIFT))) 1049 return -EINVAL; 1050 1051 cmd.cpages = 0; 1052 cmd.faults = 0; 1053 1054 switch (command) { 1055 case HMM_DMIRROR_READ: 1056 ret = dmirror_read(dmirror, &cmd); 1057 break; 1058 1059 case HMM_DMIRROR_WRITE: 1060 ret = dmirror_write(dmirror, &cmd); 1061 break; 1062 1063 case HMM_DMIRROR_MIGRATE: 1064 ret = dmirror_migrate(dmirror, &cmd); 1065 break; 1066 1067 case HMM_DMIRROR_EXCLUSIVE: 1068 ret = dmirror_exclusive(dmirror, &cmd); 1069 break; 1070 1071 case HMM_DMIRROR_CHECK_EXCLUSIVE: 1072 ret = dmirror_check_atomic(dmirror, cmd.addr, 1073 cmd.addr + (cmd.npages << PAGE_SHIFT)); 1074 break; 1075 1076 case HMM_DMIRROR_SNAPSHOT: 1077 ret = dmirror_snapshot(dmirror, &cmd); 1078 break; 1079 1080 default: 1081 return -EINVAL; 1082 } 1083 if (ret) 1084 return ret; 1085 1086 if (copy_to_user(uarg, &cmd, sizeof(cmd))) 1087 return -EFAULT; 1088 1089 return 0; 1090 } 1091 1092 static int dmirror_fops_mmap(struct file *file, struct vm_area_struct *vma) 1093 { 1094 unsigned long addr; 1095 1096 for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { 1097 struct page *page; 1098 int ret; 1099 1100 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 1101 if (!page) 1102 return -ENOMEM; 1103 1104 ret = vm_insert_page(vma, addr, page); 1105 if (ret) { 1106 __free_page(page); 1107 return ret; 1108 } 1109 put_page(page); 1110 } 1111 1112 return 0; 1113 } 1114 1115 static const struct file_operations dmirror_fops = { 1116 .open = dmirror_fops_open, 1117 .release = dmirror_fops_release, 1118 .mmap = dmirror_fops_mmap, 1119 .unlocked_ioctl = dmirror_fops_unlocked_ioctl, 1120 .llseek = default_llseek, 1121 .owner = THIS_MODULE, 1122 }; 1123 1124 static void dmirror_devmem_free(struct page *page) 1125 { 1126 struct page *rpage = page->zone_device_data; 1127 struct dmirror_device *mdevice; 1128 1129 if (rpage) 1130 __free_page(rpage); 1131 1132 mdevice = dmirror_page_to_device(page); 1133 1134 spin_lock(&mdevice->lock); 1135 mdevice->cfree++; 1136 page->zone_device_data = mdevice->free_pages; 1137 mdevice->free_pages = page; 1138 spin_unlock(&mdevice->lock); 1139 } 1140 1141 static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, 1142 struct dmirror *dmirror) 1143 { 1144 const unsigned long *src = args->src; 1145 unsigned long *dst = args->dst; 1146 unsigned long start = args->start; 1147 unsigned long end = args->end; 1148 unsigned long addr; 1149 1150 for (addr = start; addr < end; addr += PAGE_SIZE, 1151 src++, dst++) { 1152 struct page *dpage, *spage; 1153 1154 spage = migrate_pfn_to_page(*src); 1155 if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) 1156 continue; 1157 spage = spage->zone_device_data; 1158 1159 dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); 1160 if (!dpage) 1161 continue; 1162 1163 lock_page(dpage); 1164 xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); 1165 copy_highpage(dpage, spage); 1166 *dst = migrate_pfn(page_to_pfn(dpage)); 1167 if (*src & MIGRATE_PFN_WRITE) 1168 *dst |= MIGRATE_PFN_WRITE; 1169 } 1170 return 0; 1171 } 1172 1173 static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) 1174 { 1175 struct migrate_vma args; 1176 unsigned long src_pfns; 1177 unsigned long dst_pfns; 1178 struct page *rpage; 1179 struct dmirror *dmirror; 1180 vm_fault_t ret; 1181 1182 /* 1183 * Normally, a device would use the page->zone_device_data to point to 1184 * the mirror but here we use it to hold the page for the simulated 1185 * device memory and that page holds the pointer to the mirror. 1186 */ 1187 rpage = vmf->page->zone_device_data; 1188 dmirror = rpage->zone_device_data; 1189 1190 /* FIXME demonstrate how we can adjust migrate range */ 1191 args.vma = vmf->vma; 1192 args.start = vmf->address; 1193 args.end = args.start + PAGE_SIZE; 1194 args.src = &src_pfns; 1195 args.dst = &dst_pfns; 1196 args.pgmap_owner = dmirror->mdevice; 1197 args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; 1198 1199 if (migrate_vma_setup(&args)) 1200 return VM_FAULT_SIGBUS; 1201 1202 ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror); 1203 if (ret) 1204 return ret; 1205 migrate_vma_pages(&args); 1206 /* 1207 * No device finalize step is needed since 1208 * dmirror_devmem_fault_alloc_and_copy() will have already 1209 * invalidated the device page table. 1210 */ 1211 migrate_vma_finalize(&args); 1212 return 0; 1213 } 1214 1215 static const struct dev_pagemap_ops dmirror_devmem_ops = { 1216 .page_free = dmirror_devmem_free, 1217 .migrate_to_ram = dmirror_devmem_fault, 1218 }; 1219 1220 static int dmirror_device_init(struct dmirror_device *mdevice, int id) 1221 { 1222 dev_t dev; 1223 int ret; 1224 1225 dev = MKDEV(MAJOR(dmirror_dev), id); 1226 mutex_init(&mdevice->devmem_lock); 1227 spin_lock_init(&mdevice->lock); 1228 1229 cdev_init(&mdevice->cdevice, &dmirror_fops); 1230 mdevice->cdevice.owner = THIS_MODULE; 1231 ret = cdev_add(&mdevice->cdevice, dev, 1); 1232 if (ret) 1233 return ret; 1234 1235 /* Build a list of free ZONE_DEVICE private struct pages */ 1236 dmirror_allocate_chunk(mdevice, NULL); 1237 1238 return 0; 1239 } 1240 1241 static void dmirror_device_remove(struct dmirror_device *mdevice) 1242 { 1243 unsigned int i; 1244 1245 if (mdevice->devmem_chunks) { 1246 for (i = 0; i < mdevice->devmem_count; i++) { 1247 struct dmirror_chunk *devmem = 1248 mdevice->devmem_chunks[i]; 1249 1250 memunmap_pages(&devmem->pagemap); 1251 release_mem_region(devmem->pagemap.range.start, 1252 range_len(&devmem->pagemap.range)); 1253 kfree(devmem); 1254 } 1255 kfree(mdevice->devmem_chunks); 1256 } 1257 1258 cdev_del(&mdevice->cdevice); 1259 } 1260 1261 static int __init hmm_dmirror_init(void) 1262 { 1263 int ret; 1264 int id; 1265 1266 ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES, 1267 "HMM_DMIRROR"); 1268 if (ret) 1269 goto err_unreg; 1270 1271 for (id = 0; id < DMIRROR_NDEVICES; id++) { 1272 ret = dmirror_device_init(dmirror_devices + id, id); 1273 if (ret) 1274 goto err_chrdev; 1275 } 1276 1277 pr_info("HMM test module loaded. This is only for testing HMM.\n"); 1278 return 0; 1279 1280 err_chrdev: 1281 while (--id >= 0) 1282 dmirror_device_remove(dmirror_devices + id); 1283 unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES); 1284 err_unreg: 1285 return ret; 1286 } 1287 1288 static void __exit hmm_dmirror_exit(void) 1289 { 1290 int id; 1291 1292 for (id = 0; id < DMIRROR_NDEVICES; id++) 1293 dmirror_device_remove(dmirror_devices + id); 1294 unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES); 1295 } 1296 1297 module_init(hmm_dmirror_init); 1298 module_exit(hmm_dmirror_exit); 1299 MODULE_LICENSE("GPL"); 1300