xref: /openbmc/linux/lib/test_hmm.c (revision 8a649e33f48e08be20c51541d9184645892ec370)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * This is a module to test the HMM (Heterogeneous Memory Management)
4  * mirror and zone device private memory migration APIs of the kernel.
5  * Userspace programs can register with the driver to mirror their own address
6  * space and can use the device to read/write any valid virtual address.
7  */
8 #include <linux/init.h>
9 #include <linux/fs.h>
10 #include <linux/mm.h>
11 #include <linux/module.h>
12 #include <linux/kernel.h>
13 #include <linux/cdev.h>
14 #include <linux/device.h>
15 #include <linux/memremap.h>
16 #include <linux/mutex.h>
17 #include <linux/rwsem.h>
18 #include <linux/sched.h>
19 #include <linux/slab.h>
20 #include <linux/highmem.h>
21 #include <linux/delay.h>
22 #include <linux/pagemap.h>
23 #include <linux/hmm.h>
24 #include <linux/vmalloc.h>
25 #include <linux/swap.h>
26 #include <linux/swapops.h>
27 #include <linux/sched/mm.h>
28 #include <linux/platform_device.h>
29 #include <linux/rmap.h>
30 #include <linux/mmu_notifier.h>
31 #include <linux/migrate.h>
32 
33 #include "test_hmm_uapi.h"
34 
/* Number of entries in the dmirror_devices[] array. */
#define DMIRROR_NDEVICES		4
/* NOTE(review): not referenced in the visible part of this file - confirm use. */
#define DMIRROR_RANGE_FAULT_TIMEOUT	1000
/* Size in bytes (256 MiB) of one simulated device memory chunk. */
#define DEVMEM_CHUNK_SIZE		(256 * 1024 * 1024U)
/* Number of slots added each time the devmem_chunks array has to grow. */
#define DEVMEM_CHUNKS_RESERVE		16
39 
/*
 * For device_private pages, dpage is just a dummy struct page
 * representing a piece of device memory. dmirror_devmem_alloc_page
 * allocates a real system memory page as backing storage to fake a
 * real device. zone_device_data points to that backing page. But
 * for device_coherent memory, the struct page represents real
 * physical CPU-accessible memory that we can use directly.
 *
 * BACKING_PAGE() therefore evaluates to page->zone_device_data for a
 * device private page and to the page itself otherwise. Note that the
 * argument is evaluated more than once.
 */
#define BACKING_PAGE(page) (is_device_private_page((page)) ? \
			   (page)->zone_device_data : (page))
50 
51 static unsigned long spm_addr_dev0;
52 module_param(spm_addr_dev0, long, 0644);
53 MODULE_PARM_DESC(spm_addr_dev0,
54 		"Specify start address for SPM (special purpose memory) used for device 0. By setting this Coherent device type will be used. Make sure spm_addr_dev1 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE.");
55 
56 static unsigned long spm_addr_dev1;
57 module_param(spm_addr_dev1, long, 0644);
58 MODULE_PARM_DESC(spm_addr_dev1,
59 		"Specify start address for SPM (special purpose memory) used for device 1. By setting this Coherent device type will be used. Make sure spm_addr_dev0 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE.");
60 
61 static const struct dev_pagemap_ops dmirror_devmem_ops;
62 static const struct mmu_interval_notifier_ops dmirror_min_ops;
63 static dev_t dmirror_dev;
64 
65 struct dmirror_device;
66 
/*
 * Kernel-side staging buffer used to carry page data between user space
 * and the mirrored pages.
 */
struct dmirror_bounce {
	void			*ptr;		/* vmalloc()ed staging buffer */
	unsigned long		size;		/* length of the buffer in bytes */
	unsigned long		addr;		/* virtual address the buffer starts at */
	unsigned long		cpages;		/* number of pages copied so far */
};
73 
/*
 * Tags attached with xa_tag_pointer() to the page pointers stored in
 * dmirror->pt: ATOMIC marks entries created by dmirror_atomic_map(),
 * WRITE marks entries the device may write to.
 */
#define DPT_XA_TAG_ATOMIC 1UL
#define DPT_XA_TAG_WRITE 3UL
76 
/*
 * Data structure to track address ranges and register for mmu interval
 * notifier updates.
 */
struct dmirror_interval {
	/* Embedded notifier; the invalidate callback uses container_of() on it. */
	struct mmu_interval_notifier	notifier;
	/* Mirror whose mutex the snapshot invalidate callback takes. */
	struct dmirror			*dmirror;
};
85 
/*
 * Data attached to the open device file.
 * Note that it might be shared after a fork().
 */
struct dmirror {
	/* Device whose character device node was opened. */
	struct dmirror_device		*mdevice;
	/* Mirrored "page table": tagged page pointers indexed by virtual pfn. */
	struct xarray			pt;
	/* Interval notifier registered on the opener's mm in open(). */
	struct mmu_interval_notifier	notifier;
	/* Taken around lookups and updates of @pt. */
	struct mutex			mutex;
};
96 
/*
 * ZONE_DEVICE pages for migration and simulating device memory.
 */
struct dmirror_chunk {
	/* Passed to memremap_pages(); page->pgmap points back at this member. */
	struct dev_pagemap	pagemap;
	/* Device that owns this chunk. */
	struct dmirror_device	*mdevice;
	/* NOTE(review): not used in the visible part of this file - confirm. */
	bool remove;
};
105 
/*
 * Per device data.
 */
struct dmirror_device {
	struct cdev		cdevice;
	/* One of the HMM_DMIRROR_MEMORY_DEVICE_* values. */
	unsigned int            zone_device_type;
	struct device		device;

	/* Allocated slots in @devmem_chunks. */
	unsigned int		devmem_capacity;
	/* Slots of @devmem_chunks currently in use. */
	unsigned int		devmem_count;
	struct dmirror_chunk	**devmem_chunks;
	struct mutex		devmem_lock;	/* protects the above */

	/* Incremented each time a page is taken off @free_pages. */
	unsigned long		calloc;
	/* NOTE(review): presumably the matching free counter - not updated in view. */
	unsigned long		cfree;
	/* Free device pages, singly linked through page->zone_device_data. */
	struct page		*free_pages;
	spinlock_t		lock;		/* protects the above */
};
124 
125 static struct dmirror_device dmirror_devices[DMIRROR_NDEVICES];
126 
127 static int dmirror_bounce_init(struct dmirror_bounce *bounce,
128 			       unsigned long addr,
129 			       unsigned long size)
130 {
131 	bounce->addr = addr;
132 	bounce->size = size;
133 	bounce->cpages = 0;
134 	bounce->ptr = vmalloc(size);
135 	if (!bounce->ptr)
136 		return -ENOMEM;
137 	return 0;
138 }
139 
140 static bool dmirror_is_private_zone(struct dmirror_device *mdevice)
141 {
142 	return (mdevice->zone_device_type ==
143 		HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? true : false;
144 }
145 
146 static enum migrate_vma_direction
147 dmirror_select_device(struct dmirror *dmirror)
148 {
149 	return (dmirror->mdevice->zone_device_type ==
150 		HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ?
151 		MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
152 		MIGRATE_VMA_SELECT_DEVICE_COHERENT;
153 }
154 
155 static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
156 {
157 	vfree(bounce->ptr);
158 }
159 
160 static int dmirror_fops_open(struct inode *inode, struct file *filp)
161 {
162 	struct cdev *cdev = inode->i_cdev;
163 	struct dmirror *dmirror;
164 	int ret;
165 
166 	/* Mirror this process address space */
167 	dmirror = kzalloc(sizeof(*dmirror), GFP_KERNEL);
168 	if (dmirror == NULL)
169 		return -ENOMEM;
170 
171 	dmirror->mdevice = container_of(cdev, struct dmirror_device, cdevice);
172 	mutex_init(&dmirror->mutex);
173 	xa_init(&dmirror->pt);
174 
175 	ret = mmu_interval_notifier_insert(&dmirror->notifier, current->mm,
176 				0, ULONG_MAX & PAGE_MASK, &dmirror_min_ops);
177 	if (ret) {
178 		kfree(dmirror);
179 		return ret;
180 	}
181 
182 	filp->private_data = dmirror;
183 	return 0;
184 }
185 
186 static int dmirror_fops_release(struct inode *inode, struct file *filp)
187 {
188 	struct dmirror *dmirror = filp->private_data;
189 
190 	mmu_interval_notifier_remove(&dmirror->notifier);
191 	xa_destroy(&dmirror->pt);
192 	kfree(dmirror);
193 	return 0;
194 }
195 
196 static struct dmirror_chunk *dmirror_page_to_chunk(struct page *page)
197 {
198 	return container_of(page->pgmap, struct dmirror_chunk, pagemap);
199 }
200 
201 static struct dmirror_device *dmirror_page_to_device(struct page *page)
202 
203 {
204 	return dmirror_page_to_chunk(page)->mdevice;
205 }
206 
207 static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range)
208 {
209 	unsigned long *pfns = range->hmm_pfns;
210 	unsigned long pfn;
211 
212 	for (pfn = (range->start >> PAGE_SHIFT);
213 	     pfn < (range->end >> PAGE_SHIFT);
214 	     pfn++, pfns++) {
215 		struct page *page;
216 		void *entry;
217 
218 		/*
219 		 * Since we asked for hmm_range_fault() to populate pages,
220 		 * it shouldn't return an error entry on success.
221 		 */
222 		WARN_ON(*pfns & HMM_PFN_ERROR);
223 		WARN_ON(!(*pfns & HMM_PFN_VALID));
224 
225 		page = hmm_pfn_to_page(*pfns);
226 		WARN_ON(!page);
227 
228 		entry = page;
229 		if (*pfns & HMM_PFN_WRITE)
230 			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
231 		else if (WARN_ON(range->default_flags & HMM_PFN_WRITE))
232 			return -EFAULT;
233 		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
234 		if (xa_is_err(entry))
235 			return xa_err(entry);
236 	}
237 
238 	return 0;
239 }
240 
241 static void dmirror_do_update(struct dmirror *dmirror, unsigned long start,
242 			      unsigned long end)
243 {
244 	unsigned long pfn;
245 	void *entry;
246 
247 	/*
248 	 * The XArray doesn't hold references to pages since it relies on
249 	 * the mmu notifier to clear page pointers when they become stale.
250 	 * Therefore, it is OK to just clear the entry.
251 	 */
252 	xa_for_each_range(&dmirror->pt, pfn, entry, start >> PAGE_SHIFT,
253 			  end >> PAGE_SHIFT)
254 		xa_erase(&dmirror->pt, pfn);
255 }
256 
257 static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni,
258 				const struct mmu_notifier_range *range,
259 				unsigned long cur_seq)
260 {
261 	struct dmirror *dmirror = container_of(mni, struct dmirror, notifier);
262 
263 	/*
264 	 * Ignore invalidation callbacks for device private pages since
265 	 * the invalidation is handled as part of the migration process.
266 	 */
267 	if (range->event == MMU_NOTIFY_MIGRATE &&
268 	    range->owner == dmirror->mdevice)
269 		return true;
270 
271 	if (mmu_notifier_range_blockable(range))
272 		mutex_lock(&dmirror->mutex);
273 	else if (!mutex_trylock(&dmirror->mutex))
274 		return false;
275 
276 	mmu_interval_set_seq(mni, cur_seq);
277 	dmirror_do_update(dmirror, range->start, range->end);
278 
279 	mutex_unlock(&dmirror->mutex);
280 	return true;
281 }
282 
/* Notifier ops for the per-file mirror registered in dmirror_fops_open(). */
static const struct mmu_interval_notifier_ops dmirror_min_ops = {
	.invalidate = dmirror_interval_invalidate,
};
286 
287 static int dmirror_range_fault(struct dmirror *dmirror,
288 				struct hmm_range *range)
289 {
290 	struct mm_struct *mm = dmirror->notifier.mm;
291 	unsigned long timeout =
292 		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
293 	int ret;
294 
295 	while (true) {
296 		if (time_after(jiffies, timeout)) {
297 			ret = -EBUSY;
298 			goto out;
299 		}
300 
301 		range->notifier_seq = mmu_interval_read_begin(range->notifier);
302 		mmap_read_lock(mm);
303 		ret = hmm_range_fault(range);
304 		mmap_read_unlock(mm);
305 		if (ret) {
306 			if (ret == -EBUSY)
307 				continue;
308 			goto out;
309 		}
310 
311 		mutex_lock(&dmirror->mutex);
312 		if (mmu_interval_read_retry(range->notifier,
313 					    range->notifier_seq)) {
314 			mutex_unlock(&dmirror->mutex);
315 			continue;
316 		}
317 		break;
318 	}
319 
320 	ret = dmirror_do_fault(dmirror, range);
321 
322 	mutex_unlock(&dmirror->mutex);
323 out:
324 	return ret;
325 }
326 
327 static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
328 			 unsigned long end, bool write)
329 {
330 	struct mm_struct *mm = dmirror->notifier.mm;
331 	unsigned long addr;
332 	unsigned long pfns[64];
333 	struct hmm_range range = {
334 		.notifier = &dmirror->notifier,
335 		.hmm_pfns = pfns,
336 		.pfn_flags_mask = 0,
337 		.default_flags =
338 			HMM_PFN_REQ_FAULT | (write ? HMM_PFN_REQ_WRITE : 0),
339 		.dev_private_owner = dmirror->mdevice,
340 	};
341 	int ret = 0;
342 
343 	/* Since the mm is for the mirrored process, get a reference first. */
344 	if (!mmget_not_zero(mm))
345 		return 0;
346 
347 	for (addr = start; addr < end; addr = range.end) {
348 		range.start = addr;
349 		range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);
350 
351 		ret = dmirror_range_fault(dmirror, &range);
352 		if (ret)
353 			break;
354 	}
355 
356 	mmput(mm);
357 	return ret;
358 }
359 
360 static int dmirror_do_read(struct dmirror *dmirror, unsigned long start,
361 			   unsigned long end, struct dmirror_bounce *bounce)
362 {
363 	unsigned long pfn;
364 	void *ptr;
365 
366 	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);
367 
368 	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
369 		void *entry;
370 		struct page *page;
371 		void *tmp;
372 
373 		entry = xa_load(&dmirror->pt, pfn);
374 		page = xa_untag_pointer(entry);
375 		if (!page)
376 			return -ENOENT;
377 
378 		tmp = kmap(page);
379 		memcpy(ptr, tmp, PAGE_SIZE);
380 		kunmap(page);
381 
382 		ptr += PAGE_SIZE;
383 		bounce->cpages++;
384 	}
385 
386 	return 0;
387 }
388 
389 static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
390 {
391 	struct dmirror_bounce bounce;
392 	unsigned long start, end;
393 	unsigned long size = cmd->npages << PAGE_SHIFT;
394 	int ret;
395 
396 	start = cmd->addr;
397 	end = start + size;
398 	if (end < start)
399 		return -EINVAL;
400 
401 	ret = dmirror_bounce_init(&bounce, start, size);
402 	if (ret)
403 		return ret;
404 
405 	while (1) {
406 		mutex_lock(&dmirror->mutex);
407 		ret = dmirror_do_read(dmirror, start, end, &bounce);
408 		mutex_unlock(&dmirror->mutex);
409 		if (ret != -ENOENT)
410 			break;
411 
412 		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
413 		ret = dmirror_fault(dmirror, start, end, false);
414 		if (ret)
415 			break;
416 		cmd->faults++;
417 	}
418 
419 	if (ret == 0) {
420 		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
421 				 bounce.size))
422 			ret = -EFAULT;
423 	}
424 	cmd->cpages = bounce.cpages;
425 	dmirror_bounce_fini(&bounce);
426 	return ret;
427 }
428 
429 static int dmirror_do_write(struct dmirror *dmirror, unsigned long start,
430 			    unsigned long end, struct dmirror_bounce *bounce)
431 {
432 	unsigned long pfn;
433 	void *ptr;
434 
435 	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);
436 
437 	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
438 		void *entry;
439 		struct page *page;
440 		void *tmp;
441 
442 		entry = xa_load(&dmirror->pt, pfn);
443 		page = xa_untag_pointer(entry);
444 		if (!page || xa_pointer_tag(entry) != DPT_XA_TAG_WRITE)
445 			return -ENOENT;
446 
447 		tmp = kmap(page);
448 		memcpy(tmp, ptr, PAGE_SIZE);
449 		kunmap(page);
450 
451 		ptr += PAGE_SIZE;
452 		bounce->cpages++;
453 	}
454 
455 	return 0;
456 }
457 
458 static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
459 {
460 	struct dmirror_bounce bounce;
461 	unsigned long start, end;
462 	unsigned long size = cmd->npages << PAGE_SHIFT;
463 	int ret;
464 
465 	start = cmd->addr;
466 	end = start + size;
467 	if (end < start)
468 		return -EINVAL;
469 
470 	ret = dmirror_bounce_init(&bounce, start, size);
471 	if (ret)
472 		return ret;
473 	if (copy_from_user(bounce.ptr, u64_to_user_ptr(cmd->ptr),
474 			   bounce.size)) {
475 		ret = -EFAULT;
476 		goto fini;
477 	}
478 
479 	while (1) {
480 		mutex_lock(&dmirror->mutex);
481 		ret = dmirror_do_write(dmirror, start, end, &bounce);
482 		mutex_unlock(&dmirror->mutex);
483 		if (ret != -ENOENT)
484 			break;
485 
486 		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
487 		ret = dmirror_fault(dmirror, start, end, true);
488 		if (ret)
489 			break;
490 		cmd->faults++;
491 	}
492 
493 fini:
494 	cmd->cpages = bounce.cpages;
495 	dmirror_bounce_fini(&bounce);
496 	return ret;
497 }
498 
499 static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
500 				   struct page **ppage)
501 {
502 	struct dmirror_chunk *devmem;
503 	struct resource *res = NULL;
504 	unsigned long pfn;
505 	unsigned long pfn_first;
506 	unsigned long pfn_last;
507 	void *ptr;
508 	int ret = -ENOMEM;
509 
510 	devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
511 	if (!devmem)
512 		return ret;
513 
514 	switch (mdevice->zone_device_type) {
515 	case HMM_DMIRROR_MEMORY_DEVICE_PRIVATE:
516 		res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
517 					      "hmm_dmirror");
518 		if (IS_ERR_OR_NULL(res))
519 			goto err_devmem;
520 		devmem->pagemap.range.start = res->start;
521 		devmem->pagemap.range.end = res->end;
522 		devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
523 		break;
524 	case HMM_DMIRROR_MEMORY_DEVICE_COHERENT:
525 		devmem->pagemap.range.start = (MINOR(mdevice->cdevice.dev) - 2) ?
526 							spm_addr_dev0 :
527 							spm_addr_dev1;
528 		devmem->pagemap.range.end = devmem->pagemap.range.start +
529 					    DEVMEM_CHUNK_SIZE - 1;
530 		devmem->pagemap.type = MEMORY_DEVICE_COHERENT;
531 		break;
532 	default:
533 		ret = -EINVAL;
534 		goto err_devmem;
535 	}
536 
537 	devmem->pagemap.nr_range = 1;
538 	devmem->pagemap.ops = &dmirror_devmem_ops;
539 	devmem->pagemap.owner = mdevice;
540 
541 	mutex_lock(&mdevice->devmem_lock);
542 
543 	if (mdevice->devmem_count == mdevice->devmem_capacity) {
544 		struct dmirror_chunk **new_chunks;
545 		unsigned int new_capacity;
546 
547 		new_capacity = mdevice->devmem_capacity +
548 				DEVMEM_CHUNKS_RESERVE;
549 		new_chunks = krealloc(mdevice->devmem_chunks,
550 				sizeof(new_chunks[0]) * new_capacity,
551 				GFP_KERNEL);
552 		if (!new_chunks)
553 			goto err_release;
554 		mdevice->devmem_capacity = new_capacity;
555 		mdevice->devmem_chunks = new_chunks;
556 	}
557 	ptr = memremap_pages(&devmem->pagemap, numa_node_id());
558 	if (IS_ERR_OR_NULL(ptr)) {
559 		if (ptr)
560 			ret = PTR_ERR(ptr);
561 		else
562 			ret = -EFAULT;
563 		goto err_release;
564 	}
565 
566 	devmem->mdevice = mdevice;
567 	pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT;
568 	pfn_last = pfn_first + (range_len(&devmem->pagemap.range) >> PAGE_SHIFT);
569 	mdevice->devmem_chunks[mdevice->devmem_count++] = devmem;
570 
571 	mutex_unlock(&mdevice->devmem_lock);
572 
573 	pr_info("added new %u MB chunk (total %u chunks, %u MB) PFNs [0x%lx 0x%lx)\n",
574 		DEVMEM_CHUNK_SIZE / (1024 * 1024),
575 		mdevice->devmem_count,
576 		mdevice->devmem_count * (DEVMEM_CHUNK_SIZE / (1024 * 1024)),
577 		pfn_first, pfn_last);
578 
579 	spin_lock(&mdevice->lock);
580 	for (pfn = pfn_first; pfn < pfn_last; pfn++) {
581 		struct page *page = pfn_to_page(pfn);
582 
583 		page->zone_device_data = mdevice->free_pages;
584 		mdevice->free_pages = page;
585 	}
586 	if (ppage) {
587 		*ppage = mdevice->free_pages;
588 		mdevice->free_pages = (*ppage)->zone_device_data;
589 		mdevice->calloc++;
590 	}
591 	spin_unlock(&mdevice->lock);
592 
593 	return 0;
594 
595 err_release:
596 	mutex_unlock(&mdevice->devmem_lock);
597 	if (res && devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
598 		release_mem_region(devmem->pagemap.range.start,
599 				   range_len(&devmem->pagemap.range));
600 err_devmem:
601 	kfree(devmem);
602 
603 	return ret;
604 }
605 
606 static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
607 {
608 	struct page *dpage = NULL;
609 	struct page *rpage = NULL;
610 
611 	/*
612 	 * For ZONE_DEVICE private type, this is a fake device so we allocate
613 	 * real system memory to store our device memory.
614 	 * For ZONE_DEVICE coherent type we use the actual dpage to store the
615 	 * data and ignore rpage.
616 	 */
617 	if (dmirror_is_private_zone(mdevice)) {
618 		rpage = alloc_page(GFP_HIGHUSER);
619 		if (!rpage)
620 			return NULL;
621 	}
622 	spin_lock(&mdevice->lock);
623 
624 	if (mdevice->free_pages) {
625 		dpage = mdevice->free_pages;
626 		mdevice->free_pages = dpage->zone_device_data;
627 		mdevice->calloc++;
628 		spin_unlock(&mdevice->lock);
629 	} else {
630 		spin_unlock(&mdevice->lock);
631 		if (dmirror_allocate_chunk(mdevice, &dpage))
632 			goto error;
633 	}
634 
635 	zone_device_page_init(dpage);
636 	dpage->zone_device_data = rpage;
637 	return dpage;
638 
639 error:
640 	if (rpage)
641 		__free_page(rpage);
642 	return NULL;
643 }
644 
645 static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
646 					   struct dmirror *dmirror)
647 {
648 	struct dmirror_device *mdevice = dmirror->mdevice;
649 	const unsigned long *src = args->src;
650 	unsigned long *dst = args->dst;
651 	unsigned long addr;
652 
653 	for (addr = args->start; addr < args->end; addr += PAGE_SIZE,
654 						   src++, dst++) {
655 		struct page *spage;
656 		struct page *dpage;
657 		struct page *rpage;
658 
659 		if (!(*src & MIGRATE_PFN_MIGRATE))
660 			continue;
661 
662 		/*
663 		 * Note that spage might be NULL which is OK since it is an
664 		 * unallocated pte_none() or read-only zero page.
665 		 */
666 		spage = migrate_pfn_to_page(*src);
667 		if (WARN(spage && is_zone_device_page(spage),
668 		     "page already in device spage pfn: 0x%lx\n",
669 		     page_to_pfn(spage)))
670 			continue;
671 
672 		dpage = dmirror_devmem_alloc_page(mdevice);
673 		if (!dpage)
674 			continue;
675 
676 		rpage = BACKING_PAGE(dpage);
677 		if (spage)
678 			copy_highpage(rpage, spage);
679 		else
680 			clear_highpage(rpage);
681 
682 		/*
683 		 * Normally, a device would use the page->zone_device_data to
684 		 * point to the mirror but here we use it to hold the page for
685 		 * the simulated device memory and that page holds the pointer
686 		 * to the mirror.
687 		 */
688 		rpage->zone_device_data = dmirror;
689 
690 		pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n",
691 			 page_to_pfn(spage), page_to_pfn(dpage));
692 		*dst = migrate_pfn(page_to_pfn(dpage));
693 		if ((*src & MIGRATE_PFN_WRITE) ||
694 		    (!spage && args->vma->vm_flags & VM_WRITE))
695 			*dst |= MIGRATE_PFN_WRITE;
696 	}
697 }
698 
699 static int dmirror_check_atomic(struct dmirror *dmirror, unsigned long start,
700 			     unsigned long end)
701 {
702 	unsigned long pfn;
703 
704 	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
705 		void *entry;
706 
707 		entry = xa_load(&dmirror->pt, pfn);
708 		if (xa_pointer_tag(entry) == DPT_XA_TAG_ATOMIC)
709 			return -EPERM;
710 	}
711 
712 	return 0;
713 }
714 
715 static int dmirror_atomic_map(unsigned long start, unsigned long end,
716 			      struct page **pages, struct dmirror *dmirror)
717 {
718 	unsigned long pfn, mapped = 0;
719 	int i;
720 
721 	/* Map the migrated pages into the device's page tables. */
722 	mutex_lock(&dmirror->mutex);
723 
724 	for (i = 0, pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++, i++) {
725 		void *entry;
726 
727 		if (!pages[i])
728 			continue;
729 
730 		entry = pages[i];
731 		entry = xa_tag_pointer(entry, DPT_XA_TAG_ATOMIC);
732 		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
733 		if (xa_is_err(entry)) {
734 			mutex_unlock(&dmirror->mutex);
735 			return xa_err(entry);
736 		}
737 
738 		mapped++;
739 	}
740 
741 	mutex_unlock(&dmirror->mutex);
742 	return mapped;
743 }
744 
745 static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
746 					    struct dmirror *dmirror)
747 {
748 	unsigned long start = args->start;
749 	unsigned long end = args->end;
750 	const unsigned long *src = args->src;
751 	const unsigned long *dst = args->dst;
752 	unsigned long pfn;
753 
754 	/* Map the migrated pages into the device's page tables. */
755 	mutex_lock(&dmirror->mutex);
756 
757 	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++,
758 								src++, dst++) {
759 		struct page *dpage;
760 		void *entry;
761 
762 		if (!(*src & MIGRATE_PFN_MIGRATE))
763 			continue;
764 
765 		dpage = migrate_pfn_to_page(*dst);
766 		if (!dpage)
767 			continue;
768 
769 		entry = BACKING_PAGE(dpage);
770 		if (*dst & MIGRATE_PFN_WRITE)
771 			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
772 		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
773 		if (xa_is_err(entry)) {
774 			mutex_unlock(&dmirror->mutex);
775 			return xa_err(entry);
776 		}
777 	}
778 
779 	mutex_unlock(&dmirror->mutex);
780 	return 0;
781 }
782 
783 static int dmirror_exclusive(struct dmirror *dmirror,
784 			     struct hmm_dmirror_cmd *cmd)
785 {
786 	unsigned long start, end, addr;
787 	unsigned long size = cmd->npages << PAGE_SHIFT;
788 	struct mm_struct *mm = dmirror->notifier.mm;
789 	struct page *pages[64];
790 	struct dmirror_bounce bounce;
791 	unsigned long next;
792 	int ret;
793 
794 	start = cmd->addr;
795 	end = start + size;
796 	if (end < start)
797 		return -EINVAL;
798 
799 	/* Since the mm is for the mirrored process, get a reference first. */
800 	if (!mmget_not_zero(mm))
801 		return -EINVAL;
802 
803 	mmap_read_lock(mm);
804 	for (addr = start; addr < end; addr = next) {
805 		unsigned long mapped = 0;
806 		int i;
807 
808 		if (end < addr + (ARRAY_SIZE(pages) << PAGE_SHIFT))
809 			next = end;
810 		else
811 			next = addr + (ARRAY_SIZE(pages) << PAGE_SHIFT);
812 
813 		ret = make_device_exclusive_range(mm, addr, next, pages, NULL);
814 		/*
815 		 * Do dmirror_atomic_map() iff all pages are marked for
816 		 * exclusive access to avoid accessing uninitialized
817 		 * fields of pages.
818 		 */
819 		if (ret == (next - addr) >> PAGE_SHIFT)
820 			mapped = dmirror_atomic_map(addr, next, pages, dmirror);
821 		for (i = 0; i < ret; i++) {
822 			if (pages[i]) {
823 				unlock_page(pages[i]);
824 				put_page(pages[i]);
825 			}
826 		}
827 
828 		if (addr + (mapped << PAGE_SHIFT) < next) {
829 			mmap_read_unlock(mm);
830 			mmput(mm);
831 			return -EBUSY;
832 		}
833 	}
834 	mmap_read_unlock(mm);
835 	mmput(mm);
836 
837 	/* Return the migrated data for verification. */
838 	ret = dmirror_bounce_init(&bounce, start, size);
839 	if (ret)
840 		return ret;
841 	mutex_lock(&dmirror->mutex);
842 	ret = dmirror_do_read(dmirror, start, end, &bounce);
843 	mutex_unlock(&dmirror->mutex);
844 	if (ret == 0) {
845 		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
846 				 bounce.size))
847 			ret = -EFAULT;
848 	}
849 
850 	cmd->cpages = bounce.cpages;
851 	dmirror_bounce_fini(&bounce);
852 	return ret;
853 }
854 
855 static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
856 						      struct dmirror *dmirror)
857 {
858 	const unsigned long *src = args->src;
859 	unsigned long *dst = args->dst;
860 	unsigned long start = args->start;
861 	unsigned long end = args->end;
862 	unsigned long addr;
863 
864 	for (addr = start; addr < end; addr += PAGE_SIZE,
865 				       src++, dst++) {
866 		struct page *dpage, *spage;
867 
868 		spage = migrate_pfn_to_page(*src);
869 		if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
870 			continue;
871 
872 		if (WARN_ON(!is_device_private_page(spage) &&
873 			    !is_device_coherent_page(spage)))
874 			continue;
875 		spage = BACKING_PAGE(spage);
876 		dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
877 		if (!dpage)
878 			continue;
879 		pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n",
880 			 page_to_pfn(spage), page_to_pfn(dpage));
881 
882 		lock_page(dpage);
883 		xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
884 		copy_highpage(dpage, spage);
885 		*dst = migrate_pfn(page_to_pfn(dpage));
886 		if (*src & MIGRATE_PFN_WRITE)
887 			*dst |= MIGRATE_PFN_WRITE;
888 	}
889 	return 0;
890 }
891 
892 static unsigned long
893 dmirror_successful_migrated_pages(struct migrate_vma *migrate)
894 {
895 	unsigned long cpages = 0;
896 	unsigned long i;
897 
898 	for (i = 0; i < migrate->npages; i++) {
899 		if (migrate->src[i] & MIGRATE_PFN_VALID &&
900 		    migrate->src[i] & MIGRATE_PFN_MIGRATE)
901 			cpages++;
902 	}
903 	return cpages;
904 }
905 
906 static int dmirror_migrate_to_system(struct dmirror *dmirror,
907 				     struct hmm_dmirror_cmd *cmd)
908 {
909 	unsigned long start, end, addr;
910 	unsigned long size = cmd->npages << PAGE_SHIFT;
911 	struct mm_struct *mm = dmirror->notifier.mm;
912 	struct vm_area_struct *vma;
913 	unsigned long src_pfns[64] = { 0 };
914 	unsigned long dst_pfns[64] = { 0 };
915 	struct migrate_vma args = { 0 };
916 	unsigned long next;
917 	int ret;
918 
919 	start = cmd->addr;
920 	end = start + size;
921 	if (end < start)
922 		return -EINVAL;
923 
924 	/* Since the mm is for the mirrored process, get a reference first. */
925 	if (!mmget_not_zero(mm))
926 		return -EINVAL;
927 
928 	cmd->cpages = 0;
929 	mmap_read_lock(mm);
930 	for (addr = start; addr < end; addr = next) {
931 		vma = vma_lookup(mm, addr);
932 		if (!vma || !(vma->vm_flags & VM_READ)) {
933 			ret = -EINVAL;
934 			goto out;
935 		}
936 		next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
937 		if (next > vma->vm_end)
938 			next = vma->vm_end;
939 
940 		args.vma = vma;
941 		args.src = src_pfns;
942 		args.dst = dst_pfns;
943 		args.start = addr;
944 		args.end = next;
945 		args.pgmap_owner = dmirror->mdevice;
946 		args.flags = dmirror_select_device(dmirror);
947 
948 		ret = migrate_vma_setup(&args);
949 		if (ret)
950 			goto out;
951 
952 		pr_debug("Migrating from device mem to sys mem\n");
953 		dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
954 
955 		migrate_vma_pages(&args);
956 		cmd->cpages += dmirror_successful_migrated_pages(&args);
957 		migrate_vma_finalize(&args);
958 	}
959 out:
960 	mmap_read_unlock(mm);
961 	mmput(mm);
962 
963 	return ret;
964 }
965 
966 static int dmirror_migrate_to_device(struct dmirror *dmirror,
967 				struct hmm_dmirror_cmd *cmd)
968 {
969 	unsigned long start, end, addr;
970 	unsigned long size = cmd->npages << PAGE_SHIFT;
971 	struct mm_struct *mm = dmirror->notifier.mm;
972 	struct vm_area_struct *vma;
973 	unsigned long src_pfns[64] = { 0 };
974 	unsigned long dst_pfns[64] = { 0 };
975 	struct dmirror_bounce bounce;
976 	struct migrate_vma args = { 0 };
977 	unsigned long next;
978 	int ret;
979 
980 	start = cmd->addr;
981 	end = start + size;
982 	if (end < start)
983 		return -EINVAL;
984 
985 	/* Since the mm is for the mirrored process, get a reference first. */
986 	if (!mmget_not_zero(mm))
987 		return -EINVAL;
988 
989 	mmap_read_lock(mm);
990 	for (addr = start; addr < end; addr = next) {
991 		vma = vma_lookup(mm, addr);
992 		if (!vma || !(vma->vm_flags & VM_READ)) {
993 			ret = -EINVAL;
994 			goto out;
995 		}
996 		next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
997 		if (next > vma->vm_end)
998 			next = vma->vm_end;
999 
1000 		args.vma = vma;
1001 		args.src = src_pfns;
1002 		args.dst = dst_pfns;
1003 		args.start = addr;
1004 		args.end = next;
1005 		args.pgmap_owner = dmirror->mdevice;
1006 		args.flags = MIGRATE_VMA_SELECT_SYSTEM;
1007 		ret = migrate_vma_setup(&args);
1008 		if (ret)
1009 			goto out;
1010 
1011 		pr_debug("Migrating from sys mem to device mem\n");
1012 		dmirror_migrate_alloc_and_copy(&args, dmirror);
1013 		migrate_vma_pages(&args);
1014 		dmirror_migrate_finalize_and_map(&args, dmirror);
1015 		migrate_vma_finalize(&args);
1016 	}
1017 	mmap_read_unlock(mm);
1018 	mmput(mm);
1019 
1020 	/*
1021 	 * Return the migrated data for verification.
1022 	 * Only for pages in device zone
1023 	 */
1024 	ret = dmirror_bounce_init(&bounce, start, size);
1025 	if (ret)
1026 		return ret;
1027 	mutex_lock(&dmirror->mutex);
1028 	ret = dmirror_do_read(dmirror, start, end, &bounce);
1029 	mutex_unlock(&dmirror->mutex);
1030 	if (ret == 0) {
1031 		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
1032 				 bounce.size))
1033 			ret = -EFAULT;
1034 	}
1035 	cmd->cpages = bounce.cpages;
1036 	dmirror_bounce_fini(&bounce);
1037 	return ret;
1038 
1039 out:
1040 	mmap_read_unlock(mm);
1041 	mmput(mm);
1042 	return ret;
1043 }
1044 
1045 static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
1046 			    unsigned char *perm, unsigned long entry)
1047 {
1048 	struct page *page;
1049 
1050 	if (entry & HMM_PFN_ERROR) {
1051 		*perm = HMM_DMIRROR_PROT_ERROR;
1052 		return;
1053 	}
1054 	if (!(entry & HMM_PFN_VALID)) {
1055 		*perm = HMM_DMIRROR_PROT_NONE;
1056 		return;
1057 	}
1058 
1059 	page = hmm_pfn_to_page(entry);
1060 	if (is_device_private_page(page)) {
1061 		/* Is the page migrated to this device or some other? */
1062 		if (dmirror->mdevice == dmirror_page_to_device(page))
1063 			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
1064 		else
1065 			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
1066 	} else if (is_device_coherent_page(page)) {
1067 		/* Is the page migrated to this device or some other? */
1068 		if (dmirror->mdevice == dmirror_page_to_device(page))
1069 			*perm = HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL;
1070 		else
1071 			*perm = HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE;
1072 	} else if (is_zero_pfn(page_to_pfn(page)))
1073 		*perm = HMM_DMIRROR_PROT_ZERO;
1074 	else
1075 		*perm = HMM_DMIRROR_PROT_NONE;
1076 	if (entry & HMM_PFN_WRITE)
1077 		*perm |= HMM_DMIRROR_PROT_WRITE;
1078 	else
1079 		*perm |= HMM_DMIRROR_PROT_READ;
1080 	if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PMD_SHIFT)
1081 		*perm |= HMM_DMIRROR_PROT_PMD;
1082 	else if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PUD_SHIFT)
1083 		*perm |= HMM_DMIRROR_PROT_PUD;
1084 }
1085 
1086 static bool dmirror_snapshot_invalidate(struct mmu_interval_notifier *mni,
1087 				const struct mmu_notifier_range *range,
1088 				unsigned long cur_seq)
1089 {
1090 	struct dmirror_interval *dmi =
1091 		container_of(mni, struct dmirror_interval, notifier);
1092 	struct dmirror *dmirror = dmi->dmirror;
1093 
1094 	if (mmu_notifier_range_blockable(range))
1095 		mutex_lock(&dmirror->mutex);
1096 	else if (!mutex_trylock(&dmirror->mutex))
1097 		return false;
1098 
1099 	/*
1100 	 * Snapshots only need to set the sequence number since any
1101 	 * invalidation in the interval invalidates the whole snapshot.
1102 	 */
1103 	mmu_interval_set_seq(mni, cur_seq);
1104 
1105 	mutex_unlock(&dmirror->mutex);
1106 	return true;
1107 }
1108 
1109 static const struct mmu_interval_notifier_ops dmirror_mrn_ops = {
1110 	.invalidate = dmirror_snapshot_invalidate,
1111 };
1112 
1113 static int dmirror_range_snapshot(struct dmirror *dmirror,
1114 				  struct hmm_range *range,
1115 				  unsigned char *perm)
1116 {
1117 	struct mm_struct *mm = dmirror->notifier.mm;
1118 	struct dmirror_interval notifier;
1119 	unsigned long timeout =
1120 		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
1121 	unsigned long i;
1122 	unsigned long n;
1123 	int ret = 0;
1124 
1125 	notifier.dmirror = dmirror;
1126 	range->notifier = &notifier.notifier;
1127 
1128 	ret = mmu_interval_notifier_insert(range->notifier, mm,
1129 			range->start, range->end - range->start,
1130 			&dmirror_mrn_ops);
1131 	if (ret)
1132 		return ret;
1133 
1134 	while (true) {
1135 		if (time_after(jiffies, timeout)) {
1136 			ret = -EBUSY;
1137 			goto out;
1138 		}
1139 
1140 		range->notifier_seq = mmu_interval_read_begin(range->notifier);
1141 
1142 		mmap_read_lock(mm);
1143 		ret = hmm_range_fault(range);
1144 		mmap_read_unlock(mm);
1145 		if (ret) {
1146 			if (ret == -EBUSY)
1147 				continue;
1148 			goto out;
1149 		}
1150 
1151 		mutex_lock(&dmirror->mutex);
1152 		if (mmu_interval_read_retry(range->notifier,
1153 					    range->notifier_seq)) {
1154 			mutex_unlock(&dmirror->mutex);
1155 			continue;
1156 		}
1157 		break;
1158 	}
1159 
1160 	n = (range->end - range->start) >> PAGE_SHIFT;
1161 	for (i = 0; i < n; i++)
1162 		dmirror_mkentry(dmirror, range, perm + i, range->hmm_pfns[i]);
1163 
1164 	mutex_unlock(&dmirror->mutex);
1165 out:
1166 	mmu_interval_notifier_remove(range->notifier);
1167 	return ret;
1168 }
1169 
1170 static int dmirror_snapshot(struct dmirror *dmirror,
1171 			    struct hmm_dmirror_cmd *cmd)
1172 {
1173 	struct mm_struct *mm = dmirror->notifier.mm;
1174 	unsigned long start, end;
1175 	unsigned long size = cmd->npages << PAGE_SHIFT;
1176 	unsigned long addr;
1177 	unsigned long next;
1178 	unsigned long pfns[64];
1179 	unsigned char perm[64];
1180 	char __user *uptr;
1181 	struct hmm_range range = {
1182 		.hmm_pfns = pfns,
1183 		.dev_private_owner = dmirror->mdevice,
1184 	};
1185 	int ret = 0;
1186 
1187 	start = cmd->addr;
1188 	end = start + size;
1189 	if (end < start)
1190 		return -EINVAL;
1191 
1192 	/* Since the mm is for the mirrored process, get a reference first. */
1193 	if (!mmget_not_zero(mm))
1194 		return -EINVAL;
1195 
1196 	/*
1197 	 * Register a temporary notifier to detect invalidations even if it
1198 	 * overlaps with other mmu_interval_notifiers.
1199 	 */
1200 	uptr = u64_to_user_ptr(cmd->ptr);
1201 	for (addr = start; addr < end; addr = next) {
1202 		unsigned long n;
1203 
1204 		next = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);
1205 		range.start = addr;
1206 		range.end = next;
1207 
1208 		ret = dmirror_range_snapshot(dmirror, &range, perm);
1209 		if (ret)
1210 			break;
1211 
1212 		n = (range.end - range.start) >> PAGE_SHIFT;
1213 		if (copy_to_user(uptr, perm, n)) {
1214 			ret = -EFAULT;
1215 			break;
1216 		}
1217 
1218 		cmd->cpages += n;
1219 		uptr += n;
1220 	}
1221 	mmput(mm);
1222 
1223 	return ret;
1224 }
1225 
1226 static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk)
1227 {
1228 	unsigned long start_pfn = chunk->pagemap.range.start >> PAGE_SHIFT;
1229 	unsigned long end_pfn = chunk->pagemap.range.end >> PAGE_SHIFT;
1230 	unsigned long npages = end_pfn - start_pfn + 1;
1231 	unsigned long i;
1232 	unsigned long *src_pfns;
1233 	unsigned long *dst_pfns;
1234 
1235 	src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL);
1236 	dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL);
1237 
1238 	migrate_device_range(src_pfns, start_pfn, npages);
1239 	for (i = 0; i < npages; i++) {
1240 		struct page *dpage, *spage;
1241 
1242 		spage = migrate_pfn_to_page(src_pfns[i]);
1243 		if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE))
1244 			continue;
1245 
1246 		if (WARN_ON(!is_device_private_page(spage) &&
1247 			    !is_device_coherent_page(spage)))
1248 			continue;
1249 		spage = BACKING_PAGE(spage);
1250 		dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL);
1251 		lock_page(dpage);
1252 		copy_highpage(dpage, spage);
1253 		dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
1254 		if (src_pfns[i] & MIGRATE_PFN_WRITE)
1255 			dst_pfns[i] |= MIGRATE_PFN_WRITE;
1256 	}
1257 	migrate_device_pages(src_pfns, dst_pfns, npages);
1258 	migrate_device_finalize(src_pfns, dst_pfns, npages);
1259 	kfree(src_pfns);
1260 	kfree(dst_pfns);
1261 }
1262 
1263 /* Removes free pages from the free list so they can't be re-allocated */
1264 static void dmirror_remove_free_pages(struct dmirror_chunk *devmem)
1265 {
1266 	struct dmirror_device *mdevice = devmem->mdevice;
1267 	struct page *page;
1268 
1269 	for (page = mdevice->free_pages; page; page = page->zone_device_data)
1270 		if (dmirror_page_to_chunk(page) == devmem)
1271 			mdevice->free_pages = page->zone_device_data;
1272 }
1273 
1274 static void dmirror_device_remove_chunks(struct dmirror_device *mdevice)
1275 {
1276 	unsigned int i;
1277 
1278 	mutex_lock(&mdevice->devmem_lock);
1279 	if (mdevice->devmem_chunks) {
1280 		for (i = 0; i < mdevice->devmem_count; i++) {
1281 			struct dmirror_chunk *devmem =
1282 				mdevice->devmem_chunks[i];
1283 
1284 			spin_lock(&mdevice->lock);
1285 			devmem->remove = true;
1286 			dmirror_remove_free_pages(devmem);
1287 			spin_unlock(&mdevice->lock);
1288 
1289 			dmirror_device_evict_chunk(devmem);
1290 			memunmap_pages(&devmem->pagemap);
1291 			if (devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
1292 				release_mem_region(devmem->pagemap.range.start,
1293 						   range_len(&devmem->pagemap.range));
1294 			kfree(devmem);
1295 		}
1296 		mdevice->devmem_count = 0;
1297 		mdevice->devmem_capacity = 0;
1298 		mdevice->free_pages = NULL;
1299 		kfree(mdevice->devmem_chunks);
1300 		mdevice->devmem_chunks = NULL;
1301 	}
1302 	mutex_unlock(&mdevice->devmem_lock);
1303 }
1304 
1305 static long dmirror_fops_unlocked_ioctl(struct file *filp,
1306 					unsigned int command,
1307 					unsigned long arg)
1308 {
1309 	void __user *uarg = (void __user *)arg;
1310 	struct hmm_dmirror_cmd cmd;
1311 	struct dmirror *dmirror;
1312 	int ret;
1313 
1314 	dmirror = filp->private_data;
1315 	if (!dmirror)
1316 		return -EINVAL;
1317 
1318 	if (copy_from_user(&cmd, uarg, sizeof(cmd)))
1319 		return -EFAULT;
1320 
1321 	if (cmd.addr & ~PAGE_MASK)
1322 		return -EINVAL;
1323 	if (cmd.addr >= (cmd.addr + (cmd.npages << PAGE_SHIFT)))
1324 		return -EINVAL;
1325 
1326 	cmd.cpages = 0;
1327 	cmd.faults = 0;
1328 
1329 	switch (command) {
1330 	case HMM_DMIRROR_READ:
1331 		ret = dmirror_read(dmirror, &cmd);
1332 		break;
1333 
1334 	case HMM_DMIRROR_WRITE:
1335 		ret = dmirror_write(dmirror, &cmd);
1336 		break;
1337 
1338 	case HMM_DMIRROR_MIGRATE_TO_DEV:
1339 		ret = dmirror_migrate_to_device(dmirror, &cmd);
1340 		break;
1341 
1342 	case HMM_DMIRROR_MIGRATE_TO_SYS:
1343 		ret = dmirror_migrate_to_system(dmirror, &cmd);
1344 		break;
1345 
1346 	case HMM_DMIRROR_EXCLUSIVE:
1347 		ret = dmirror_exclusive(dmirror, &cmd);
1348 		break;
1349 
1350 	case HMM_DMIRROR_CHECK_EXCLUSIVE:
1351 		ret = dmirror_check_atomic(dmirror, cmd.addr,
1352 					cmd.addr + (cmd.npages << PAGE_SHIFT));
1353 		break;
1354 
1355 	case HMM_DMIRROR_SNAPSHOT:
1356 		ret = dmirror_snapshot(dmirror, &cmd);
1357 		break;
1358 
1359 	case HMM_DMIRROR_RELEASE:
1360 		dmirror_device_remove_chunks(dmirror->mdevice);
1361 		ret = 0;
1362 		break;
1363 
1364 	default:
1365 		return -EINVAL;
1366 	}
1367 	if (ret)
1368 		return ret;
1369 
1370 	if (copy_to_user(uarg, &cmd, sizeof(cmd)))
1371 		return -EFAULT;
1372 
1373 	return 0;
1374 }
1375 
1376 static int dmirror_fops_mmap(struct file *file, struct vm_area_struct *vma)
1377 {
1378 	unsigned long addr;
1379 
1380 	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
1381 		struct page *page;
1382 		int ret;
1383 
1384 		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1385 		if (!page)
1386 			return -ENOMEM;
1387 
1388 		ret = vm_insert_page(vma, addr, page);
1389 		if (ret) {
1390 			__free_page(page);
1391 			return ret;
1392 		}
1393 		put_page(page);
1394 	}
1395 
1396 	return 0;
1397 }
1398 
1399 static const struct file_operations dmirror_fops = {
1400 	.open		= dmirror_fops_open,
1401 	.release	= dmirror_fops_release,
1402 	.mmap		= dmirror_fops_mmap,
1403 	.unlocked_ioctl = dmirror_fops_unlocked_ioctl,
1404 	.llseek		= default_llseek,
1405 	.owner		= THIS_MODULE,
1406 };
1407 
1408 static void dmirror_devmem_free(struct page *page)
1409 {
1410 	struct page *rpage = BACKING_PAGE(page);
1411 	struct dmirror_device *mdevice;
1412 
1413 	if (rpage != page)
1414 		__free_page(rpage);
1415 
1416 	mdevice = dmirror_page_to_device(page);
1417 	spin_lock(&mdevice->lock);
1418 
1419 	/* Return page to our allocator if not freeing the chunk */
1420 	if (!dmirror_page_to_chunk(page)->remove) {
1421 		mdevice->cfree++;
1422 		page->zone_device_data = mdevice->free_pages;
1423 		mdevice->free_pages = page;
1424 	}
1425 	spin_unlock(&mdevice->lock);
1426 }
1427 
1428 static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
1429 {
1430 	struct migrate_vma args = { 0 };
1431 	unsigned long src_pfns = 0;
1432 	unsigned long dst_pfns = 0;
1433 	struct page *rpage;
1434 	struct dmirror *dmirror;
1435 	vm_fault_t ret;
1436 
1437 	/*
1438 	 * Normally, a device would use the page->zone_device_data to point to
1439 	 * the mirror but here we use it to hold the page for the simulated
1440 	 * device memory and that page holds the pointer to the mirror.
1441 	 */
1442 	rpage = vmf->page->zone_device_data;
1443 	dmirror = rpage->zone_device_data;
1444 
1445 	/* FIXME demonstrate how we can adjust migrate range */
1446 	args.vma = vmf->vma;
1447 	args.start = vmf->address;
1448 	args.end = args.start + PAGE_SIZE;
1449 	args.src = &src_pfns;
1450 	args.dst = &dst_pfns;
1451 	args.pgmap_owner = dmirror->mdevice;
1452 	args.flags = dmirror_select_device(dmirror);
1453 	args.fault_page = vmf->page;
1454 
1455 	if (migrate_vma_setup(&args))
1456 		return VM_FAULT_SIGBUS;
1457 
1458 	ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
1459 	if (ret)
1460 		return ret;
1461 	migrate_vma_pages(&args);
1462 	/*
1463 	 * No device finalize step is needed since
1464 	 * dmirror_devmem_fault_alloc_and_copy() will have already
1465 	 * invalidated the device page table.
1466 	 */
1467 	migrate_vma_finalize(&args);
1468 	return 0;
1469 }
1470 
1471 static const struct dev_pagemap_ops dmirror_devmem_ops = {
1472 	.page_free	= dmirror_devmem_free,
1473 	.migrate_to_ram	= dmirror_devmem_fault,
1474 };
1475 
1476 static int dmirror_device_init(struct dmirror_device *mdevice, int id)
1477 {
1478 	dev_t dev;
1479 	int ret;
1480 
1481 	dev = MKDEV(MAJOR(dmirror_dev), id);
1482 	mutex_init(&mdevice->devmem_lock);
1483 	spin_lock_init(&mdevice->lock);
1484 
1485 	cdev_init(&mdevice->cdevice, &dmirror_fops);
1486 	mdevice->cdevice.owner = THIS_MODULE;
1487 	device_initialize(&mdevice->device);
1488 	mdevice->device.devt = dev;
1489 
1490 	ret = dev_set_name(&mdevice->device, "hmm_dmirror%u", id);
1491 	if (ret)
1492 		return ret;
1493 
1494 	ret = cdev_device_add(&mdevice->cdevice, &mdevice->device);
1495 	if (ret)
1496 		return ret;
1497 
1498 	/* Build a list of free ZONE_DEVICE struct pages */
1499 	return dmirror_allocate_chunk(mdevice, NULL);
1500 }
1501 
1502 static void dmirror_device_remove(struct dmirror_device *mdevice)
1503 {
1504 	dmirror_device_remove_chunks(mdevice);
1505 	cdev_device_del(&mdevice->cdevice, &mdevice->device);
1506 }
1507 
1508 static int __init hmm_dmirror_init(void)
1509 {
1510 	int ret;
1511 	int id = 0;
1512 	int ndevices = 0;
1513 
1514 	ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
1515 				  "HMM_DMIRROR");
1516 	if (ret)
1517 		goto err_unreg;
1518 
1519 	memset(dmirror_devices, 0, DMIRROR_NDEVICES * sizeof(dmirror_devices[0]));
1520 	dmirror_devices[ndevices++].zone_device_type =
1521 				HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
1522 	dmirror_devices[ndevices++].zone_device_type =
1523 				HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
1524 	if (spm_addr_dev0 && spm_addr_dev1) {
1525 		dmirror_devices[ndevices++].zone_device_type =
1526 					HMM_DMIRROR_MEMORY_DEVICE_COHERENT;
1527 		dmirror_devices[ndevices++].zone_device_type =
1528 					HMM_DMIRROR_MEMORY_DEVICE_COHERENT;
1529 	}
1530 	for (id = 0; id < ndevices; id++) {
1531 		ret = dmirror_device_init(dmirror_devices + id, id);
1532 		if (ret)
1533 			goto err_chrdev;
1534 	}
1535 
1536 	pr_info("HMM test module loaded. This is only for testing HMM.\n");
1537 	return 0;
1538 
1539 err_chrdev:
1540 	while (--id >= 0)
1541 		dmirror_device_remove(dmirror_devices + id);
1542 	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
1543 err_unreg:
1544 	return ret;
1545 }
1546 
1547 static void __exit hmm_dmirror_exit(void)
1548 {
1549 	int id;
1550 
1551 	for (id = 0; id < DMIRROR_NDEVICES; id++)
1552 		if (dmirror_devices[id].zone_device_type)
1553 			dmirror_device_remove(dmirror_devices + id);
1554 	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
1555 }
1556 
1557 module_init(hmm_dmirror_init);
1558 module_exit(hmm_dmirror_exit);
1559 MODULE_LICENSE("GPL");
1560