// SPDX-License-Identifier: GPL-2.0
/*
 * This is a module to test the HMM (Heterogeneous Memory Management)
 * mirror and zone device private memory migration APIs of the kernel.
 * Userspace programs can register with the driver to mirror their own address
 * space and can use the device to read/write any valid virtual address.
 */
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/delay.h>
#include <linux/pagemap.h>
#include <linux/hmm.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/sched/mm.h>
#include <linux/platform_device.h>

#include "test_hmm_uapi.h"

#define DMIRROR_NDEVICES		2
#define DMIRROR_RANGE_FAULT_TIMEOUT	1000
#define DEVMEM_CHUNK_SIZE		(256 * 1024 * 1024U)
#define DEVMEM_CHUNKS_RESERVE		16

static const struct dev_pagemap_ops dmirror_devmem_ops;
static const struct mmu_interval_notifier_ops dmirror_min_ops;
static dev_t dmirror_dev;
static struct page *dmirror_zero_page;

struct dmirror_device;

struct dmirror_bounce {
	void			*ptr;
	unsigned long		size;
	unsigned long		addr;
	unsigned long		cpages;
};

#define DPT_XA_TAG_WRITE 3UL

/*
 * Data structure to track address ranges and register for mmu interval
 * notifier updates.
 */
struct dmirror_interval {
	struct mmu_interval_notifier	notifier;
	struct dmirror			*dmirror;
};

/*
 * Data attached to the open device file.
 * Note that it might be shared after a fork().
 */
struct dmirror {
	struct dmirror_device		*mdevice;
	struct xarray			pt;
	struct mmu_interval_notifier	notifier;
	struct mutex			mutex;
};

/*
 * ZONE_DEVICE pages for migration and simulating device memory.
 */
struct dmirror_chunk {
	struct dev_pagemap	pagemap;
	struct dmirror_device	*mdevice;
};

/*
 * Per device data.
 */
struct dmirror_device {
	struct cdev		cdevice;
	struct hmm_devmem	*devmem;

	unsigned int		devmem_capacity;
	unsigned int		devmem_count;
	struct dmirror_chunk	**devmem_chunks;
	struct mutex		devmem_lock;	/* protects the above */

	unsigned long		calloc;
	unsigned long		cfree;
	struct page		*free_pages;
	spinlock_t		lock;		/* protects the above */
};

static struct dmirror_device dmirror_devices[DMIRROR_NDEVICES];

static int dmirror_bounce_init(struct dmirror_bounce *bounce,
			       unsigned long addr,
			       unsigned long size)
{
	bounce->addr = addr;
	bounce->size = size;
	bounce->cpages = 0;
	bounce->ptr = vmalloc(size);
	if (!bounce->ptr)
		return -ENOMEM;
	return 0;
}

static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
{
	vfree(bounce->ptr);
}

static int dmirror_fops_open(struct inode *inode, struct file *filp)
{
	struct cdev *cdev = inode->i_cdev;
	struct dmirror *dmirror;
	int ret;

	/* Mirror this process address space */
	dmirror = kzalloc(sizeof(*dmirror), GFP_KERNEL);
	if (dmirror == NULL)
		return -ENOMEM;

	dmirror->mdevice = container_of(cdev, struct dmirror_device, cdevice);
	mutex_init(&dmirror->mutex);
	xa_init(&dmirror->pt);

	ret = mmu_interval_notifier_insert(&dmirror->notifier, current->mm,
				0, ULONG_MAX & PAGE_MASK, &dmirror_min_ops);
	if (ret) {
		kfree(dmirror);
		return ret;
	}

	filp->private_data = dmirror;
	return 0;
}

static int dmirror_fops_release(struct inode *inode, struct file *filp)
{
	struct dmirror *dmirror = filp->private_data;

	mmu_interval_notifier_remove(&dmirror->notifier);
	xa_destroy(&dmirror->pt);
	kfree(dmirror);
	return 0;
}

static struct dmirror_device *dmirror_page_to_device(struct page *page)
{
	return container_of(page->pgmap, struct dmirror_chunk,
			    pagemap)->mdevice;
}

static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range)
{
	unsigned long *pfns = range->hmm_pfns;
	unsigned long pfn;

	for (pfn = (range->start >> PAGE_SHIFT);
	     pfn < (range->end >> PAGE_SHIFT);
	     pfn++, pfns++) {
		struct page *page;
		void *entry;

		/*
		 * Since we asked for hmm_range_fault() to populate pages,
		 * it shouldn't return an error entry on success.
		 */
		WARN_ON(*pfns & HMM_PFN_ERROR);
		WARN_ON(!(*pfns & HMM_PFN_VALID));

		page = hmm_pfn_to_page(*pfns);
		WARN_ON(!page);

		entry = page;
		if (*pfns & HMM_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		else if (WARN_ON(range->default_flags & HMM_PFN_WRITE))
			return -EFAULT;
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry))
			return xa_err(entry);
	}

	return 0;
}

static void dmirror_do_update(struct dmirror *dmirror, unsigned long start,
			      unsigned long end)
{
	unsigned long pfn;
	void *entry;

	/*
	 * The XArray doesn't hold references to pages since it relies on
	 * the mmu notifier to clear page pointers when they become stale.
	 * Therefore, it is OK to just clear the entry.
	 */
	xa_for_each_range(&dmirror->pt, pfn, entry, start >> PAGE_SHIFT,
			  end >> PAGE_SHIFT)
		xa_erase(&dmirror->pt, pfn);
}

static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct dmirror *dmirror = container_of(mni, struct dmirror, notifier);

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	mmu_interval_set_seq(mni, cur_seq);
	dmirror_do_update(dmirror, range->start, range->end);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_min_ops = {
	.invalidate = dmirror_interval_invalidate,
};

static int dmirror_range_fault(struct dmirror *dmirror,
			       struct hmm_range *range)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	int ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);
		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	ret = dmirror_do_fault(dmirror, range);

	mutex_unlock(&dmirror->mutex);
out:
	return ret;
}

static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
			 unsigned long end, bool write)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long addr;
	unsigned long pfns[64];
	struct hmm_range range = {
		.notifier = &dmirror->notifier,
		.hmm_pfns = pfns,
		.pfn_flags_mask = 0,
		.default_flags =
			HMM_PFN_REQ_FAULT | (write ? HMM_PFN_REQ_WRITE : 0),
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return 0;

	for (addr = start; addr < end; addr = range.end) {
		range.start = addr;
		range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);

		ret = dmirror_range_fault(dmirror, &range);
		if (ret)
			break;
	}

	mmput(mm);
	return ret;
}

static int dmirror_do_read(struct dmirror *dmirror, unsigned long start,
			   unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;
		void *tmp;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page)
			return -ENOENT;

		tmp = kmap(page);
		memcpy(ptr, tmp, PAGE_SIZE);
		kunmap(page);

		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}

static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_read(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, false);
		if (ret)
			break;
		cmd->faults++;
	}

	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

static int dmirror_do_write(struct dmirror *dmirror, unsigned long start,
			    unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;
		void *tmp;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page || xa_pointer_tag(entry) != DPT_XA_TAG_WRITE)
			return -ENOENT;

		tmp = kmap(page);
		memcpy(tmp, ptr, PAGE_SIZE);
		kunmap(page);

		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}

static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	if (copy_from_user(bounce.ptr, u64_to_user_ptr(cmd->ptr),
			   bounce.size)) {
		ret = -EFAULT;
		goto fini;
	}

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_write(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, true);
		if (ret)
			break;
		cmd->faults++;
	}

fini:
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
				   struct page **ppage)
{
	struct dmirror_chunk *devmem;
	struct resource *res;
	unsigned long pfn;
	unsigned long pfn_first;
	unsigned long pfn_last;
	void *ptr;

	mutex_lock(&mdevice->devmem_lock);

	if (mdevice->devmem_count == mdevice->devmem_capacity) {
		struct dmirror_chunk **new_chunks;
		unsigned int new_capacity;

		new_capacity = mdevice->devmem_capacity +
				DEVMEM_CHUNKS_RESERVE;
		new_chunks = krealloc(mdevice->devmem_chunks,
				sizeof(new_chunks[0]) * new_capacity,
				GFP_KERNEL);
		if (!new_chunks)
			goto err;
		mdevice->devmem_capacity = new_capacity;
		mdevice->devmem_chunks = new_chunks;
	}

	res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
					"hmm_dmirror");
	if (IS_ERR(res))
		goto err;

	devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
	if (!devmem)
		goto err_release;

	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
	devmem->pagemap.res = *res;
	devmem->pagemap.ops = &dmirror_devmem_ops;
	devmem->pagemap.owner = mdevice;

	ptr = memremap_pages(&devmem->pagemap, numa_node_id());
	if (IS_ERR(ptr))
		goto err_free;

	devmem->mdevice = mdevice;
	pfn_first = devmem->pagemap.res.start >> PAGE_SHIFT;
	pfn_last = pfn_first +
		(resource_size(&devmem->pagemap.res) >> PAGE_SHIFT);
	mdevice->devmem_chunks[mdevice->devmem_count++] = devmem;

	mutex_unlock(&mdevice->devmem_lock);

	pr_info("added new %u MB chunk (total %u chunks, %u MB) PFNs [0x%lx 0x%lx)\n",
		DEVMEM_CHUNK_SIZE / (1024 * 1024),
		mdevice->devmem_count,
		mdevice->devmem_count * (DEVMEM_CHUNK_SIZE / (1024 * 1024)),
		pfn_first, pfn_last);

	spin_lock(&mdevice->lock);
	for (pfn = pfn_first; pfn < pfn_last; pfn++) {
		struct page *page = pfn_to_page(pfn);

		page->zone_device_data = mdevice->free_pages;
		mdevice->free_pages = page;
	}
	if (ppage) {
		*ppage = mdevice->free_pages;
		mdevice->free_pages = (*ppage)->zone_device_data;
		mdevice->calloc++;
	}
	spin_unlock(&mdevice->lock);

	return true;

err_free:
	kfree(devmem);
err_release:
	release_mem_region(res->start, resource_size(res));
err:
	mutex_unlock(&mdevice->devmem_lock);
	return false;
}

static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
{
	struct page *dpage = NULL;
	struct page *rpage;

	/*
	 * This is a fake device so we alloc real system memory to store
	 * our device memory.
	 */
	rpage = alloc_page(GFP_HIGHUSER);
	if (!rpage)
		return NULL;

	spin_lock(&mdevice->lock);

	if (mdevice->free_pages) {
		dpage = mdevice->free_pages;
		mdevice->free_pages = dpage->zone_device_data;
		mdevice->calloc++;
		spin_unlock(&mdevice->lock);
	} else {
		spin_unlock(&mdevice->lock);
		if (!dmirror_allocate_chunk(mdevice, &dpage))
			goto error;
	}

	dpage->zone_device_data = rpage;
	get_page(dpage);
	lock_page(dpage);
	return dpage;

error:
	__free_page(rpage);
	return NULL;
}

static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
					   struct dmirror *dmirror)
{
	struct dmirror_device *mdevice = dmirror->mdevice;
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long addr;

	for (addr = args->start; addr < args->end; addr += PAGE_SIZE,
						   src++, dst++) {
		struct page *spage;
		struct page *dpage;
		struct page *rpage;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		/*
		 * Note that spage might be NULL which is OK since it is an
		 * unallocated pte_none() or read-only zero page.
		 */
		spage = migrate_pfn_to_page(*src);

		/*
		 * Don't migrate device private pages from our own driver or
		 * others. For our own we would do a device private memory copy
		 * not a migration and for others, we would need to fault the
		 * other device's page into system memory first.
		 */
		if (spage && is_zone_device_page(spage))
			continue;

		dpage = dmirror_devmem_alloc_page(mdevice);
		if (!dpage)
			continue;

		rpage = dpage->zone_device_data;
		if (spage)
			copy_highpage(rpage, spage);
		else
			clear_highpage(rpage);

		/*
		 * Normally, a device would use the page->zone_device_data to
		 * point to the mirror but here we use it to hold the page for
		 * the simulated device memory and that page holds the pointer
		 * to the mirror.
		 */
		rpage->zone_device_data = dmirror;

		*dst = migrate_pfn(page_to_pfn(dpage)) |
			    MIGRATE_PFN_LOCKED;
		if ((*src & MIGRATE_PFN_WRITE) ||
		    (!spage && args->vma->vm_flags & VM_WRITE))
			*dst |= MIGRATE_PFN_WRITE;
	}
}

static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
					    struct dmirror *dmirror)
{
	unsigned long start = args->start;
	unsigned long end = args->end;
	const unsigned long *src = args->src;
	const unsigned long *dst = args->dst;
	unsigned long pfn;

	/* Map the migrated pages into the device's page tables. */
	mutex_lock(&dmirror->mutex);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++,
								src++, dst++) {
		struct page *dpage;
		void *entry;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = migrate_pfn_to_page(*dst);
		if (!dpage)
			continue;

		/*
		 * Store the page that holds the data so the page table
		 * doesn't have to deal with ZONE_DEVICE private pages.
		 */
		entry = dpage->zone_device_data;
		if (*dst & MIGRATE_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry)) {
			mutex_unlock(&dmirror->mutex);
			return xa_err(entry);
		}
	}

	mutex_unlock(&dmirror->mutex);
	return 0;
}

static int dmirror_migrate(struct dmirror *dmirror,
			   struct hmm_dmirror_cmd *cmd)
{
	unsigned long start, end, addr;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	struct mm_struct *mm = dmirror->notifier.mm;
	struct vm_area_struct *vma;
	unsigned long src_pfns[64];
	unsigned long dst_pfns[64];
	struct dmirror_bounce bounce;
	struct migrate_vma args;
	unsigned long next;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	mmap_read_lock(mm);
	for (addr = start; addr < end; addr = next) {
		vma = find_vma(mm, addr);
		if (!vma || addr < vma->vm_start ||
		    !(vma->vm_flags & VM_READ)) {
			ret = -EINVAL;
			goto out;
		}
		next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
		if (next > vma->vm_end)
			next = vma->vm_end;

		args.vma = vma;
		args.src = src_pfns;
		args.dst = dst_pfns;
		args.start = addr;
		args.end = next;
		args.src_owner = NULL;
		ret = migrate_vma_setup(&args);
		if (ret)
			goto out;

		dmirror_migrate_alloc_and_copy(&args, dmirror);
		migrate_vma_pages(&args);
		dmirror_migrate_finalize_and_map(&args, dmirror);
		migrate_vma_finalize(&args);
	}
	mmap_read_unlock(mm);
	mmput(mm);

	/* Return the migrated data for verification. */
	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	mutex_lock(&dmirror->mutex);
	ret = dmirror_do_read(dmirror, start, end, &bounce);
	mutex_unlock(&dmirror->mutex);
	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;

out:
	mmap_read_unlock(mm);
	mmput(mm);
	return ret;
}

static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
			    unsigned char *perm, unsigned long entry)
{
	struct page *page;

	if (entry & HMM_PFN_ERROR) {
		*perm = HMM_DMIRROR_PROT_ERROR;
		return;
	}
	if (!(entry & HMM_PFN_VALID)) {
		*perm = HMM_DMIRROR_PROT_NONE;
		return;
	}

	page = hmm_pfn_to_page(entry);
	if (is_device_private_page(page)) {
		/* Is the page migrated to this device or some other? */
		if (dmirror->mdevice == dmirror_page_to_device(page))
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
		else
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
	} else if (is_zero_pfn(page_to_pfn(page)))
		*perm = HMM_DMIRROR_PROT_ZERO;
	else
		*perm = HMM_DMIRROR_PROT_NONE;
	if (entry & HMM_PFN_WRITE)
		*perm |= HMM_DMIRROR_PROT_WRITE;
	else
		*perm |= HMM_DMIRROR_PROT_READ;
}

static bool dmirror_snapshot_invalidate(struct mmu_interval_notifier *mni,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct dmirror_interval *dmi =
		container_of(mni, struct dmirror_interval, notifier);
	struct dmirror *dmirror = dmi->dmirror;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	/*
	 * Snapshots only need to set the sequence number since any
	 * invalidation in the interval invalidates the whole snapshot.
	 */
	mmu_interval_set_seq(mni, cur_seq);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_mrn_ops = {
	.invalidate = dmirror_snapshot_invalidate,
};

static int dmirror_range_snapshot(struct dmirror *dmirror,
				  struct hmm_range *range,
				  unsigned char *perm)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	struct dmirror_interval notifier;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	unsigned long i;
	unsigned long n;
	int ret = 0;

	notifier.dmirror = dmirror;
	range->notifier = &notifier.notifier;

	ret = mmu_interval_notifier_insert(range->notifier, mm,
			range->start, range->end - range->start,
			&dmirror_mrn_ops);
	if (ret)
		return ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);

		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	n = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0; i < n; i++)
		dmirror_mkentry(dmirror, range, perm + i, range->hmm_pfns[i]);

	mutex_unlock(&dmirror->mutex);
out:
	mmu_interval_notifier_remove(range->notifier);
	return ret;
}

static int dmirror_snapshot(struct dmirror *dmirror,
			    struct hmm_dmirror_cmd *cmd)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	unsigned long addr;
	unsigned long next;
	unsigned long pfns[64];
	unsigned char perm[64];
	char __user *uptr;
	struct hmm_range range = {
		.hmm_pfns = pfns,
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	/*
	 * Register a temporary notifier to detect invalidations even if it
	 * overlaps with other mmu_interval_notifiers.
	 */
	uptr = u64_to_user_ptr(cmd->ptr);
	for (addr = start; addr < end; addr = next) {
		unsigned long n;

		next = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);
		range.start = addr;
		range.end = next;

		ret = dmirror_range_snapshot(dmirror, &range, perm);
		if (ret)
			break;

		n = (range.end - range.start) >> PAGE_SHIFT;
		if (copy_to_user(uptr, perm, n)) {
			ret = -EFAULT;
			break;
		}

		cmd->cpages += n;
		uptr += n;
	}
	mmput(mm);

	return ret;
}

static long dmirror_fops_unlocked_ioctl(struct file *filp,
					unsigned int command,
					unsigned long arg)
{
	void __user *uarg = (void __user *)arg;
	struct hmm_dmirror_cmd cmd;
	struct dmirror *dmirror;
	int ret;

	dmirror = filp->private_data;
	if (!dmirror)
		return -EINVAL;

	if (copy_from_user(&cmd, uarg, sizeof(cmd)))
		return -EFAULT;

	if (cmd.addr & ~PAGE_MASK)
		return -EINVAL;
	if (cmd.addr >= (cmd.addr + (cmd.npages << PAGE_SHIFT)))
		return -EINVAL;

	cmd.cpages = 0;
	cmd.faults = 0;

	switch (command) {
	case HMM_DMIRROR_READ:
		ret = dmirror_read(dmirror, &cmd);
		break;

	case HMM_DMIRROR_WRITE:
		ret = dmirror_write(dmirror, &cmd);
		break;

	case HMM_DMIRROR_MIGRATE:
		ret = dmirror_migrate(dmirror, &cmd);
		break;

	case HMM_DMIRROR_SNAPSHOT:
		ret = dmirror_snapshot(dmirror, &cmd);
		break;

	default:
		return -EINVAL;
	}
	if (ret)
		return ret;

	if (copy_to_user(uarg, &cmd, sizeof(cmd)))
		return -EFAULT;

	return 0;
}

static const struct file_operations dmirror_fops = {
	.open		= dmirror_fops_open,
	.release	= dmirror_fops_release,
	.unlocked_ioctl = dmirror_fops_unlocked_ioctl,
	.llseek		= default_llseek,
	.owner		= THIS_MODULE,
};

static void dmirror_devmem_free(struct page *page)
{
	struct page *rpage = page->zone_device_data;
	struct dmirror_device *mdevice;

	if (rpage)
		__free_page(rpage);

	mdevice = dmirror_page_to_device(page);

	spin_lock(&mdevice->lock);
	mdevice->cfree++;
	page->zone_device_data = mdevice->free_pages;
	mdevice->free_pages = page;
	spin_unlock(&mdevice->lock);
}

static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
						struct dmirror_device *mdevice)
{
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long start = args->start;
	unsigned long end = args->end;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE,
				       src++, dst++) {
		struct page *dpage, *spage;

		spage = migrate_pfn_to_page(*src);
		if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
			continue;
		spage = spage->zone_device_data;

		dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
		if (!dpage)
			continue;

		lock_page(dpage);
		copy_highpage(dpage, spage);
		*dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
		if (*src & MIGRATE_PFN_WRITE)
			*dst |= MIGRATE_PFN_WRITE;
	}
	return 0;
}

static void dmirror_devmem_fault_finalize_and_map(struct migrate_vma *args,
						  struct dmirror *dmirror)
{
	/* Invalidate the device's page table mapping. */
	mutex_lock(&dmirror->mutex);
	dmirror_do_update(dmirror, args->start, args->end);
	mutex_unlock(&dmirror->mutex);
}

static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
{
	struct migrate_vma args;
	unsigned long src_pfns;
	unsigned long dst_pfns;
	struct page *rpage;
	struct dmirror *dmirror;
	vm_fault_t ret;

	/*
	 * Normally, a device would use the page->zone_device_data to point to
	 * the mirror but here we use it to hold the page for the simulated
	 * device memory and that page holds the pointer to the mirror.
	 */
	rpage = vmf->page->zone_device_data;
	dmirror = rpage->zone_device_data;

	/* FIXME demonstrate how we can adjust migrate range */
	args.vma = vmf->vma;
	args.start = vmf->address;
	args.end = args.start + PAGE_SIZE;
	args.src = &src_pfns;
	args.dst = &dst_pfns;
	args.src_owner = dmirror->mdevice;

	if (migrate_vma_setup(&args))
		return VM_FAULT_SIGBUS;

	ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror->mdevice);
	if (ret)
		return ret;
	migrate_vma_pages(&args);
	dmirror_devmem_fault_finalize_and_map(&args, dmirror);
	migrate_vma_finalize(&args);
	return 0;
}

static const struct dev_pagemap_ops dmirror_devmem_ops = {
	.page_free	= dmirror_devmem_free,
	.migrate_to_ram	= dmirror_devmem_fault,
};

static int dmirror_device_init(struct dmirror_device *mdevice, int id)
{
	dev_t dev;
	int ret;

	dev = MKDEV(MAJOR(dmirror_dev), id);
	mutex_init(&mdevice->devmem_lock);
	spin_lock_init(&mdevice->lock);

	cdev_init(&mdevice->cdevice, &dmirror_fops);
	mdevice->cdevice.owner = THIS_MODULE;
	ret = cdev_add(&mdevice->cdevice, dev, 1);
	if (ret)
		return ret;

	/* Build a list of free ZONE_DEVICE private struct pages */
	dmirror_allocate_chunk(mdevice, NULL);

	return 0;
}

static void dmirror_device_remove(struct dmirror_device *mdevice)
{
	unsigned int i;

	if (mdevice->devmem_chunks) {
		for (i = 0; i < mdevice->devmem_count; i++) {
			struct dmirror_chunk *devmem =
				mdevice->devmem_chunks[i];

			memunmap_pages(&devmem->pagemap);
			release_mem_region(devmem->pagemap.res.start,
					   resource_size(&devmem->pagemap.res));
			kfree(devmem);
		}
		kfree(mdevice->devmem_chunks);
	}

	cdev_del(&mdevice->cdevice);
}

static int __init hmm_dmirror_init(void)
{
	int ret;
	int id;

	ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
				  "HMM_DMIRROR");
	if (ret)
		goto err_unreg;

	for (id = 0; id < DMIRROR_NDEVICES; id++) {
		ret = dmirror_device_init(dmirror_devices + id, id);
		if (ret)
			goto err_chrdev;
	}

	/*
	 * Allocate a zero page to simulate a reserved page of device private
	 * memory which is always zero. The zero_pfn page isn't used just to
	 * make the code here simpler (i.e., we need a struct page for it).
	 */
	dmirror_zero_page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
	if (!dmirror_zero_page) {
		ret = -ENOMEM;
		goto err_chrdev;
	}

	pr_info("HMM test module loaded. This is only for testing HMM.\n");
	return 0;

err_chrdev:
	while (--id >= 0)
		dmirror_device_remove(dmirror_devices + id);
	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
err_unreg:
	return ret;
}

static void __exit hmm_dmirror_exit(void)
{
	int id;

	if (dmirror_zero_page)
		__free_page(dmirror_zero_page);
	for (id = 0; id < DMIRROR_NDEVICES; id++)
		dmirror_device_remove(dmirror_devices + id);
	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
}

module_init(hmm_dmirror_init);
module_exit(hmm_dmirror_exit);
MODULE_LICENSE("GPL");
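
/*
 * Usage sketch (illustrative only, not compiled as part of this module):
 * the header comment above says userspace registers by opening the device
 * and then drives it through the ioctls dispatched in
 * dmirror_fops_unlocked_ioctl(). A minimal caller might look roughly like
 * the snippet below, assuming a character device node such as
 * /dev/hmm_dmirror0 has been created for the "HMM_DMIRROR" major allocated
 * in hmm_dmirror_init() (the node name is an assumption; this module does
 * not create it).
 *
 *	#include <stdlib.h>
 *	#include <unistd.h>
 *	#include <fcntl.h>
 *	#include <sys/ioctl.h>
 *	#include "test_hmm_uapi.h"
 *
 *	int fd = open("/dev/hmm_dmirror0", O_RDWR);	// mirrors current mm
 *	long psize = sysconf(_SC_PAGESIZE);
 *	void *buf = aligned_alloc(psize, psize);	// page-aligned source
 *	void *out = malloc(psize);			// READ destination
 *	struct hmm_dmirror_cmd cmd = {
 *		.addr = (unsigned long)buf,	// address in the mirrored mm
 *		.ptr = (unsigned long)out,	// where the driver copies data
 *		.npages = 1,
 *	};
 *	ioctl(fd, HMM_DMIRROR_READ, &cmd);	// faults, mirrors, then copies
 *	// cmd.cpages and cmd.faults report how much work the driver did
 */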