// SPDX-License-Identifier: GPL-2.0
/*
 * This is a module to test the HMM (Heterogeneous Memory Management)
 * mirror and zone device private memory migration APIs of the kernel.
 * Userspace programs can register with the driver to mirror their own address
 * space and can use the device to read/write any valid virtual address.
 */
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/delay.h>
#include <linux/pagemap.h>
#include <linux/hmm.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/sched/mm.h>
#include <linux/platform_device.h>

#include "test_hmm_uapi.h"
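
/*
 * Illustrative sketch (not part of the driver) of how a userspace test might
 * drive this module.  It assumes a character device node such as
 * /dev/hmm_dmirror0 has been created for the major number registered below,
 * and that test_hmm_uapi.h defines struct hmm_dmirror_cmd with the fields
 * used in this file (addr, ptr, npages, cpages, faults) plus the
 * HMM_DMIRROR_* ioctl numbers:
 *
 *	int fd = open("/dev/hmm_dmirror0", O_RDWR);
 *	unsigned long npages = 4;
 *	size_t size = npages * getpagesize();
 *	void *src = mmap(NULL, size, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	void *buf = malloc(size);
 *	struct hmm_dmirror_cmd cmd = { 0 };
 *
 *	memset(src, 0xaa, size);
 *	cmd.addr = (uintptr_t)src;	// range in this process to mirror
 *	cmd.ptr = (uintptr_t)buf;	// bounce buffer filled by the driver
 *	cmd.npages = npages;
 *	ioctl(fd, HMM_DMIRROR_READ, &cmd);
 *	// On success, buf holds a copy of src read through the mirror;
 *	// cmd.cpages reports pages copied, cmd.faults the device faults taken.
 */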

#define DMIRROR_NDEVICES		2
#define DMIRROR_RANGE_FAULT_TIMEOUT	1000
#define DEVMEM_CHUNK_SIZE		(256 * 1024 * 1024U)
#define DEVMEM_CHUNKS_RESERVE		16

static const struct dev_pagemap_ops dmirror_devmem_ops;
static const struct mmu_interval_notifier_ops dmirror_min_ops;
static dev_t dmirror_dev;

struct dmirror_device;

struct dmirror_bounce {
	void *ptr;
	unsigned long size;
	unsigned long addr;
	unsigned long cpages;
};

#define DPT_XA_TAG_WRITE 3UL

/*
 * Data structure to track address ranges and register for mmu interval
 * notifier updates.
 */
struct dmirror_interval {
	struct mmu_interval_notifier notifier;
	struct dmirror *dmirror;
};

/*
 * Data attached to the open device file.
 * Note that it might be shared after a fork().
 */
struct dmirror {
	struct dmirror_device *mdevice;
	struct xarray pt;
	struct mmu_interval_notifier notifier;
	struct mutex mutex;
};

/*
 * ZONE_DEVICE pages for migration and simulating device memory.
 */
struct dmirror_chunk {
	struct dev_pagemap pagemap;
	struct dmirror_device *mdevice;
};

/*
 * Per device data.
 */
struct dmirror_device {
	struct cdev cdevice;
	struct hmm_devmem *devmem;

	unsigned int devmem_capacity;
	unsigned int devmem_count;
	struct dmirror_chunk **devmem_chunks;
	struct mutex devmem_lock;	/* protects the above */

	unsigned long calloc;
	unsigned long cfree;
	struct page *free_pages;
	spinlock_t lock;		/* protects the above */
};

static struct dmirror_device dmirror_devices[DMIRROR_NDEVICES];

static int dmirror_bounce_init(struct dmirror_bounce *bounce,
			       unsigned long addr,
			       unsigned long size)
{
	bounce->addr = addr;
	bounce->size = size;
	bounce->cpages = 0;
	bounce->ptr = vmalloc(size);
	if (!bounce->ptr)
		return -ENOMEM;
	return 0;
}

static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
{
	vfree(bounce->ptr);
}

static int dmirror_fops_open(struct inode *inode, struct file *filp)
{
	struct cdev *cdev = inode->i_cdev;
	struct dmirror *dmirror;
	int ret;

	/* Mirror this process address space */
	dmirror = kzalloc(sizeof(*dmirror), GFP_KERNEL);
	if (dmirror == NULL)
		return -ENOMEM;

	dmirror->mdevice = container_of(cdev, struct dmirror_device, cdevice);
	mutex_init(&dmirror->mutex);
	xa_init(&dmirror->pt);

	ret = mmu_interval_notifier_insert(&dmirror->notifier, current->mm,
				0, ULONG_MAX & PAGE_MASK, &dmirror_min_ops);
	if (ret) {
		kfree(dmirror);
		return ret;
	}

	filp->private_data = dmirror;
	return 0;
}

static int dmirror_fops_release(struct inode *inode, struct file *filp)
{
	struct dmirror *dmirror = filp->private_data;

	mmu_interval_notifier_remove(&dmirror->notifier);
	xa_destroy(&dmirror->pt);
	kfree(dmirror);
	return 0;
}

static struct dmirror_device *dmirror_page_to_device(struct page *page)

{
	return container_of(page->pgmap, struct dmirror_chunk,
			    pagemap)->mdevice;
}

static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range)
{
	unsigned long *pfns = range->hmm_pfns;
	unsigned long pfn;

	for (pfn = (range->start >> PAGE_SHIFT);
	     pfn < (range->end >> PAGE_SHIFT);
	     pfn++, pfns++) {
		struct page *page;
		void *entry;

		/*
		 * Since we asked for hmm_range_fault() to populate pages,
		 * it shouldn't return an error entry on success.
		 */
		WARN_ON(*pfns & HMM_PFN_ERROR);
		WARN_ON(!(*pfns & HMM_PFN_VALID));

		page = hmm_pfn_to_page(*pfns);
		WARN_ON(!page);

		entry = page;
		if (*pfns & HMM_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		else if (WARN_ON(range->default_flags & HMM_PFN_WRITE))
			return -EFAULT;
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry))
			return xa_err(entry);
	}

	return 0;
}
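
/*
 * Drop the device page table entries for an invalidated range.  Called with
 * dmirror->mutex held from the mmu interval notifier callback below.
 */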
static void dmirror_do_update(struct dmirror *dmirror, unsigned long start,
			      unsigned long end)
{
	unsigned long pfn;
	void *entry;

	/*
	 * The XArray doesn't hold references to pages since it relies on
	 * the mmu notifier to clear page pointers when they become stale.
	 * Therefore, it is OK to just clear the entry.
	 */
	xa_for_each_range(&dmirror->pt, pfn, entry, start >> PAGE_SHIFT,
			  end >> PAGE_SHIFT)
		xa_erase(&dmirror->pt, pfn);
}

static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct dmirror *dmirror = container_of(mni, struct dmirror, notifier);

	/*
	 * Ignore invalidation callbacks for device private pages since
	 * the invalidation is handled as part of the migration process.
	 */
	if (range->event == MMU_NOTIFY_MIGRATE &&
	    range->migrate_pgmap_owner == dmirror->mdevice)
		return true;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	mmu_interval_set_seq(mni, cur_seq);
	dmirror_do_update(dmirror, range->start, range->end);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_min_ops = {
	.invalidate = dmirror_interval_invalidate,
};

static int dmirror_range_fault(struct dmirror *dmirror,
			       struct hmm_range *range)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	int ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);
		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	ret = dmirror_do_fault(dmirror, range);

	mutex_unlock(&dmirror->mutex);
out:
	return ret;
}
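
/*
 * Simulate a device fault: mirror [start, end) into the device page table,
 * calling hmm_range_fault() in batches of up to 64 pages.
 */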
static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
			 unsigned long end, bool write)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long addr;
	unsigned long pfns[64];
	struct hmm_range range = {
		.notifier = &dmirror->notifier,
		.hmm_pfns = pfns,
		.pfn_flags_mask = 0,
		.default_flags =
			HMM_PFN_REQ_FAULT | (write ? HMM_PFN_REQ_WRITE : 0),
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return 0;

	for (addr = start; addr < end; addr = range.end) {
		range.start = addr;
		range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);

		ret = dmirror_range_fault(dmirror, &range);
		if (ret)
			break;
	}

	mmput(mm);
	return ret;
}

static int dmirror_do_read(struct dmirror *dmirror, unsigned long start,
			   unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;
		void *tmp;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page)
			return -ENOENT;

		tmp = kmap(page);
		memcpy(ptr, tmp, PAGE_SIZE);
		kunmap(page);

		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}

static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_read(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, false);
		if (ret)
			break;
		cmd->faults++;
	}

	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

static int dmirror_do_write(struct dmirror *dmirror, unsigned long start,
			    unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;
		void *tmp;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page || xa_pointer_tag(entry) != DPT_XA_TAG_WRITE)
			return -ENOENT;

		tmp = kmap(page);
		memcpy(tmp, ptr, PAGE_SIZE);
		kunmap(page);

		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}

static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	if (copy_from_user(bounce.ptr, u64_to_user_ptr(cmd->ptr),
			   bounce.size)) {
		ret = -EFAULT;
		goto fini;
	}

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_write(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, true);
		if (ret)
			break;
		cmd->faults++;
	}

fini:
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}
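
/*
 * Reserve another DEVMEM_CHUNK_SIZE chunk of physical address space, create
 * ZONE_DEVICE private struct pages for it with memremap_pages() and thread
 * the new pages onto mdevice->free_pages.  If ppage is non-NULL, one page is
 * handed straight back to the caller.
 */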
static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
				   struct page **ppage)
{
	struct dmirror_chunk *devmem;
	struct resource *res;
	unsigned long pfn;
	unsigned long pfn_first;
	unsigned long pfn_last;
	void *ptr;

	devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
	if (!devmem)
		return false;

	res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
				      "hmm_dmirror");
	if (IS_ERR(res))
		goto err_devmem;

	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
	devmem->pagemap.range.start = res->start;
	devmem->pagemap.range.end = res->end;
	devmem->pagemap.nr_range = 1;
	devmem->pagemap.ops = &dmirror_devmem_ops;
	devmem->pagemap.owner = mdevice;

	mutex_lock(&mdevice->devmem_lock);

	if (mdevice->devmem_count == mdevice->devmem_capacity) {
		struct dmirror_chunk **new_chunks;
		unsigned int new_capacity;

		new_capacity = mdevice->devmem_capacity +
				DEVMEM_CHUNKS_RESERVE;
		new_chunks = krealloc(mdevice->devmem_chunks,
				sizeof(new_chunks[0]) * new_capacity,
				GFP_KERNEL);
		if (!new_chunks)
			goto err_release;
		mdevice->devmem_capacity = new_capacity;
		mdevice->devmem_chunks = new_chunks;
	}

	ptr = memremap_pages(&devmem->pagemap, numa_node_id());
	if (IS_ERR(ptr))
		goto err_release;

	devmem->mdevice = mdevice;
	pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT;
	pfn_last = pfn_first + (range_len(&devmem->pagemap.range) >> PAGE_SHIFT);
	mdevice->devmem_chunks[mdevice->devmem_count++] = devmem;

	mutex_unlock(&mdevice->devmem_lock);

	pr_info("added new %u MB chunk (total %u chunks, %u MB) PFNs [0x%lx 0x%lx)\n",
		DEVMEM_CHUNK_SIZE / (1024 * 1024),
		mdevice->devmem_count,
		mdevice->devmem_count * (DEVMEM_CHUNK_SIZE / (1024 * 1024)),
		pfn_first, pfn_last);

	spin_lock(&mdevice->lock);
	for (pfn = pfn_first; pfn < pfn_last; pfn++) {
		struct page *page = pfn_to_page(pfn);

		page->zone_device_data = mdevice->free_pages;
		mdevice->free_pages = page;
	}
	if (ppage) {
		*ppage = mdevice->free_pages;
		mdevice->free_pages = (*ppage)->zone_device_data;
		mdevice->calloc++;
	}
	spin_unlock(&mdevice->lock);

	return true;

err_release:
	mutex_unlock(&mdevice->devmem_lock);
	release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
err_devmem:
	kfree(devmem);

	return false;
}
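
/*
 * Allocate one device private page, taking it from the free list or from a
 * freshly allocated chunk.  The page is returned locked, with an extra
 * reference, and with zone_device_data pointing at the system page that
 * backs the simulated device memory.
 */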
static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
{
	struct page *dpage = NULL;
	struct page *rpage;

	/*
	 * This is a fake device so we alloc real system memory to store
	 * our device memory.
	 */
	rpage = alloc_page(GFP_HIGHUSER);
	if (!rpage)
		return NULL;

	spin_lock(&mdevice->lock);

	if (mdevice->free_pages) {
		dpage = mdevice->free_pages;
		mdevice->free_pages = dpage->zone_device_data;
		mdevice->calloc++;
		spin_unlock(&mdevice->lock);
	} else {
		spin_unlock(&mdevice->lock);
		if (!dmirror_allocate_chunk(mdevice, &dpage))
			goto error;
	}

	dpage->zone_device_data = rpage;
	get_page(dpage);
	lock_page(dpage);
	return dpage;

error:
	__free_page(rpage);
	return NULL;
}

static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
					   struct dmirror *dmirror)
{
	struct dmirror_device *mdevice = dmirror->mdevice;
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long addr;

	for (addr = args->start; addr < args->end; addr += PAGE_SIZE,
						   src++, dst++) {
		struct page *spage;
		struct page *dpage;
		struct page *rpage;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		/*
		 * Note that spage might be NULL which is OK since it is an
		 * unallocated pte_none() or read-only zero page.
		 */
		spage = migrate_pfn_to_page(*src);

		dpage = dmirror_devmem_alloc_page(mdevice);
		if (!dpage)
			continue;

		rpage = dpage->zone_device_data;
		if (spage)
			copy_highpage(rpage, spage);
		else
			clear_highpage(rpage);

		/*
		 * Normally, a device would use the page->zone_device_data to
		 * point to the mirror but here we use it to hold the page for
		 * the simulated device memory and that page holds the pointer
		 * to the mirror.
		 */
		rpage->zone_device_data = dmirror;

		*dst = migrate_pfn(page_to_pfn(dpage)) |
			    MIGRATE_PFN_LOCKED;
		if ((*src & MIGRATE_PFN_WRITE) ||
		    (!spage && args->vma->vm_flags & VM_WRITE))
			*dst |= MIGRATE_PFN_WRITE;
	}
}
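
/*
 * Runs after migrate_vma_pages(): record the pages that were just migrated
 * to device memory in the device page table.  The XArray stores the backing
 * system page so lookups never have to deal with ZONE_DEVICE private pages.
 */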
static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
					    struct dmirror *dmirror)
{
	unsigned long start = args->start;
	unsigned long end = args->end;
	const unsigned long *src = args->src;
	const unsigned long *dst = args->dst;
	unsigned long pfn;

	/* Map the migrated pages into the device's page tables. */
	mutex_lock(&dmirror->mutex);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++,
							src++, dst++) {
		struct page *dpage;
		void *entry;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = migrate_pfn_to_page(*dst);
		if (!dpage)
			continue;

		/*
		 * Store the page that holds the data so the page table
		 * doesn't have to deal with ZONE_DEVICE private pages.
		 */
		entry = dpage->zone_device_data;
		if (*dst & MIGRATE_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry)) {
			mutex_unlock(&dmirror->mutex);
			return xa_err(entry);
		}
	}

	mutex_unlock(&dmirror->mutex);
	return 0;
}

static int dmirror_migrate(struct dmirror *dmirror,
			   struct hmm_dmirror_cmd *cmd)
{
	unsigned long start, end, addr;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	struct mm_struct *mm = dmirror->notifier.mm;
	struct vm_area_struct *vma;
	unsigned long src_pfns[64];
	unsigned long dst_pfns[64];
	struct dmirror_bounce bounce;
	struct migrate_vma args;
	unsigned long next;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	mmap_read_lock(mm);
	for (addr = start; addr < end; addr = next) {
		vma = vma_lookup(mm, addr);
		if (!vma || !(vma->vm_flags & VM_READ)) {
			ret = -EINVAL;
			goto out;
		}
		next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
		if (next > vma->vm_end)
			next = vma->vm_end;

		args.vma = vma;
		args.src = src_pfns;
		args.dst = dst_pfns;
		args.start = addr;
		args.end = next;
		args.pgmap_owner = dmirror->mdevice;
		args.flags = MIGRATE_VMA_SELECT_SYSTEM;
		ret = migrate_vma_setup(&args);
		if (ret)
			goto out;

		dmirror_migrate_alloc_and_copy(&args, dmirror);
		migrate_vma_pages(&args);
		dmirror_migrate_finalize_and_map(&args, dmirror);
		migrate_vma_finalize(&args);
	}
	mmap_read_unlock(mm);
	mmput(mm);

	/* Return the migrated data for verification. */
	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	mutex_lock(&dmirror->mutex);
	ret = dmirror_do_read(dmirror, start, end, &bounce);
	mutex_unlock(&dmirror->mutex);
	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;

out:
	mmap_read_unlock(mm);
	mmput(mm);
	return ret;
}
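
/*
 * Translate one hmm_range_fault() result into the HMM_DMIRROR_PROT_* byte
 * that the HMM_DMIRROR_SNAPSHOT ioctl reports back to userspace.
 */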
static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
			    unsigned char *perm, unsigned long entry)
{
	struct page *page;

	if (entry & HMM_PFN_ERROR) {
		*perm = HMM_DMIRROR_PROT_ERROR;
		return;
	}
	if (!(entry & HMM_PFN_VALID)) {
		*perm = HMM_DMIRROR_PROT_NONE;
		return;
	}

	page = hmm_pfn_to_page(entry);
	if (is_device_private_page(page)) {
		/* Is the page migrated to this device or some other? */
		if (dmirror->mdevice == dmirror_page_to_device(page))
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
		else
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
	} else if (is_zero_pfn(page_to_pfn(page)))
		*perm = HMM_DMIRROR_PROT_ZERO;
	else
		*perm = HMM_DMIRROR_PROT_NONE;
	if (entry & HMM_PFN_WRITE)
		*perm |= HMM_DMIRROR_PROT_WRITE;
	else
		*perm |= HMM_DMIRROR_PROT_READ;
	if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PMD_SHIFT)
		*perm |= HMM_DMIRROR_PROT_PMD;
	else if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PUD_SHIFT)
		*perm |= HMM_DMIRROR_PROT_PUD;
}

static bool dmirror_snapshot_invalidate(struct mmu_interval_notifier *mni,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct dmirror_interval *dmi =
		container_of(mni, struct dmirror_interval, notifier);
	struct dmirror *dmirror = dmi->dmirror;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	/*
	 * Snapshots only need to set the sequence number since any
	 * invalidation in the interval invalidates the whole snapshot.
	 */
	mmu_interval_set_seq(mni, cur_seq);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_mrn_ops = {
	.invalidate = dmirror_snapshot_invalidate,
};

static int dmirror_range_snapshot(struct dmirror *dmirror,
				  struct hmm_range *range,
				  unsigned char *perm)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	struct dmirror_interval notifier;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	unsigned long i;
	unsigned long n;
	int ret = 0;

	notifier.dmirror = dmirror;
	range->notifier = &notifier.notifier;

	ret = mmu_interval_notifier_insert(range->notifier, mm,
			range->start, range->end - range->start,
			&dmirror_mrn_ops);
	if (ret)
		return ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);

		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	n = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0; i < n; i++)
		dmirror_mkentry(dmirror, range, perm + i, range->hmm_pfns[i]);

	mutex_unlock(&dmirror->mutex);
out:
	mmu_interval_notifier_remove(range->notifier);
	return ret;
}
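
/*
 * Handle HMM_DMIRROR_SNAPSHOT: walk cmd->npages pages starting at cmd->addr
 * in batches of 64 and copy one permission byte per page to userspace.
 */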
static int dmirror_snapshot(struct dmirror *dmirror,
			    struct hmm_dmirror_cmd *cmd)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	unsigned long addr;
	unsigned long next;
	unsigned long pfns[64];
	unsigned char perm[64];
	char __user *uptr;
	struct hmm_range range = {
		.hmm_pfns = pfns,
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	/*
	 * Register a temporary notifier to detect invalidations even if it
	 * overlaps with other mmu_interval_notifiers.
	 */
	uptr = u64_to_user_ptr(cmd->ptr);
	for (addr = start; addr < end; addr = next) {
		unsigned long n;

		next = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);
		range.start = addr;
		range.end = next;

		ret = dmirror_range_snapshot(dmirror, &range, perm);
		if (ret)
			break;

		n = (range.end - range.start) >> PAGE_SHIFT;
		if (copy_to_user(uptr, perm, n)) {
			ret = -EFAULT;
			break;
		}

		cmd->cpages += n;
		uptr += n;
	}
	mmput(mm);

	return ret;
}

static long dmirror_fops_unlocked_ioctl(struct file *filp,
					unsigned int command,
					unsigned long arg)
{
	void __user *uarg = (void __user *)arg;
	struct hmm_dmirror_cmd cmd;
	struct dmirror *dmirror;
	int ret;

	dmirror = filp->private_data;
	if (!dmirror)
		return -EINVAL;

	if (copy_from_user(&cmd, uarg, sizeof(cmd)))
		return -EFAULT;

	if (cmd.addr & ~PAGE_MASK)
		return -EINVAL;
	if (cmd.addr >= (cmd.addr + (cmd.npages << PAGE_SHIFT)))
		return -EINVAL;

	cmd.cpages = 0;
	cmd.faults = 0;

	switch (command) {
	case HMM_DMIRROR_READ:
		ret = dmirror_read(dmirror, &cmd);
		break;

	case HMM_DMIRROR_WRITE:
		ret = dmirror_write(dmirror, &cmd);
		break;

	case HMM_DMIRROR_MIGRATE:
		ret = dmirror_migrate(dmirror, &cmd);
		break;

	case HMM_DMIRROR_SNAPSHOT:
		ret = dmirror_snapshot(dmirror, &cmd);
		break;

	default:
		return -EINVAL;
	}
	if (ret)
		return ret;

	if (copy_to_user(uarg, &cmd, sizeof(cmd)))
		return -EFAULT;

	return 0;
}

static const struct file_operations dmirror_fops = {
	.open = dmirror_fops_open,
	.release = dmirror_fops_release,
	.unlocked_ioctl = dmirror_fops_unlocked_ioctl,
	.llseek = default_llseek,
	.owner = THIS_MODULE,
};

static void dmirror_devmem_free(struct page *page)
{
	struct page *rpage = page->zone_device_data;
	struct dmirror_device *mdevice;

	if (rpage)
		__free_page(rpage);

	mdevice = dmirror_page_to_device(page);

	spin_lock(&mdevice->lock);
	mdevice->cfree++;
	page->zone_device_data = mdevice->free_pages;
	mdevice->free_pages = page;
	spin_unlock(&mdevice->lock);
}

static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
						      struct dmirror *dmirror)
{
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long start = args->start;
	unsigned long end = args->end;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE,
				       src++, dst++) {
		struct page *dpage, *spage;

		spage = migrate_pfn_to_page(*src);
		if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
			continue;
		spage = spage->zone_device_data;

		dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
		if (!dpage)
			continue;

		lock_page(dpage);
		xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
		copy_highpage(dpage, spage);
		*dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
		if (*src & MIGRATE_PFN_WRITE)
			*dst |= MIGRATE_PFN_WRITE;
	}
	return 0;
}
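
/*
 * migrate_to_ram() callback: a CPU fault on a device private page migrates
 * the single faulting page back to a system memory page.
 */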
static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
{
	struct migrate_vma args;
	unsigned long src_pfns;
	unsigned long dst_pfns;
	struct page *rpage;
	struct dmirror *dmirror;
	vm_fault_t ret;

	/*
	 * Normally, a device would use the page->zone_device_data to point to
	 * the mirror but here we use it to hold the page for the simulated
	 * device memory and that page holds the pointer to the mirror.
	 */
	rpage = vmf->page->zone_device_data;
	dmirror = rpage->zone_device_data;

	/* FIXME demonstrate how we can adjust migrate range */
	args.vma = vmf->vma;
	args.start = vmf->address;
	args.end = args.start + PAGE_SIZE;
	args.src = &src_pfns;
	args.dst = &dst_pfns;
	args.pgmap_owner = dmirror->mdevice;
	args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;

	if (migrate_vma_setup(&args))
		return VM_FAULT_SIGBUS;

	ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
	if (ret)
		return ret;
	migrate_vma_pages(&args);
	/*
	 * No device finalize step is needed since
	 * dmirror_devmem_fault_alloc_and_copy() will have already
	 * invalidated the device page table.
	 */
	migrate_vma_finalize(&args);
	return 0;
}

static const struct dev_pagemap_ops dmirror_devmem_ops = {
	.page_free = dmirror_devmem_free,
	.migrate_to_ram = dmirror_devmem_fault,
};

static int dmirror_device_init(struct dmirror_device *mdevice, int id)
{
	dev_t dev;
	int ret;

	dev = MKDEV(MAJOR(dmirror_dev), id);
	mutex_init(&mdevice->devmem_lock);
	spin_lock_init(&mdevice->lock);

	cdev_init(&mdevice->cdevice, &dmirror_fops);
	mdevice->cdevice.owner = THIS_MODULE;
	ret = cdev_add(&mdevice->cdevice, dev, 1);
	if (ret)
		return ret;

	/* Build a list of free ZONE_DEVICE private struct pages */
	dmirror_allocate_chunk(mdevice, NULL);

	return 0;
}

static void dmirror_device_remove(struct dmirror_device *mdevice)
{
	unsigned int i;

	if (mdevice->devmem_chunks) {
		for (i = 0; i < mdevice->devmem_count; i++) {
			struct dmirror_chunk *devmem =
				mdevice->devmem_chunks[i];

			memunmap_pages(&devmem->pagemap);
			release_mem_region(devmem->pagemap.range.start,
					   range_len(&devmem->pagemap.range));
			kfree(devmem);
		}
		kfree(mdevice->devmem_chunks);
	}

	cdev_del(&mdevice->cdevice);
}

static int __init hmm_dmirror_init(void)
{
	int ret;
	int id;

	ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
				  "HMM_DMIRROR");
	if (ret)
		goto err_unreg;

	for (id = 0; id < DMIRROR_NDEVICES; id++) {
		ret = dmirror_device_init(dmirror_devices + id, id);
		if (ret)
			goto err_chrdev;
	}

	pr_info("HMM test module loaded. This is only for testing HMM.\n");
	return 0;

err_chrdev:
	while (--id >= 0)
		dmirror_device_remove(dmirror_devices + id);
	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
err_unreg:
	return ret;
}

static void __exit hmm_dmirror_exit(void)
{
	int id;

	for (id = 0; id < DMIRROR_NDEVICES; id++)
		dmirror_device_remove(dmirror_devices + id);
	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
}

module_init(hmm_dmirror_init);
module_exit(hmm_dmirror_exit);
MODULE_LICENSE("GPL");