// SPDX-License-Identifier: GPL-2.0
/*
 * This is a module to test the HMM (Heterogeneous Memory Management)
 * mirror and zone device private memory migration APIs of the kernel.
 * Userspace programs can register with the driver to mirror their own address
 * space and can use the device to read/write any valid virtual address.
 */
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/memremap.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/delay.h>
#include <linux/pagemap.h>
#include <linux/hmm.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/sched/mm.h>
#include <linux/platform_device.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>

#include "test_hmm_uapi.h"

#define DMIRROR_NDEVICES		2
#define DMIRROR_RANGE_FAULT_TIMEOUT	1000
#define DEVMEM_CHUNK_SIZE		(256 * 1024 * 1024U)
#define DEVMEM_CHUNKS_RESERVE		16

static const struct dev_pagemap_ops dmirror_devmem_ops;
static const struct mmu_interval_notifier_ops dmirror_min_ops;
static dev_t dmirror_dev;

struct dmirror_device;

struct dmirror_bounce {
	void			*ptr;
	unsigned long		size;
	unsigned long		addr;
	unsigned long		cpages;
};

#define DPT_XA_TAG_ATOMIC 1UL
#define DPT_XA_TAG_WRITE 3UL

/*
 * Data structure to track address ranges and register for mmu interval
 * notifier updates.
 */
struct dmirror_interval {
	struct mmu_interval_notifier	notifier;
	struct dmirror			*dmirror;
};

/*
 * Data attached to the open device file.
 * Note that it might be shared after a fork().
 */
struct dmirror {
	struct dmirror_device		*mdevice;
	struct xarray			pt;
	struct mmu_interval_notifier	notifier;
	struct mutex			mutex;
};

/*
 * ZONE_DEVICE pages for migration and simulating device memory.
 */
struct dmirror_chunk {
	struct dev_pagemap	pagemap;
	struct dmirror_device	*mdevice;
};

/*
 * Per device data.
 */
struct dmirror_device {
	struct cdev		cdevice;
	struct hmm_devmem	*devmem;

	unsigned int		devmem_capacity;
	unsigned int		devmem_count;
	struct dmirror_chunk	**devmem_chunks;
	struct mutex		devmem_lock;	/* protects the above */

	unsigned long		calloc;
	unsigned long		cfree;
	struct page		*free_pages;
	spinlock_t		lock;		/* protects the above */
};

static struct dmirror_device dmirror_devices[DMIRROR_NDEVICES];

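/*
 * A bounce buffer is a kernel-side staging area: data is copied between
 * the pages recorded in the device's XArray "page table" and this
 * vmalloc()ed buffer, and then between the buffer and userspace.
 */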
static int dmirror_bounce_init(struct dmirror_bounce *bounce,
			       unsigned long addr,
			       unsigned long size)
{
	bounce->addr = addr;
	bounce->size = size;
	bounce->cpages = 0;
	bounce->ptr = vmalloc(size);
	if (!bounce->ptr)
		return -ENOMEM;
	return 0;
}

static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
{
	vfree(bounce->ptr);
}

static int dmirror_fops_open(struct inode *inode, struct file *filp)
{
	struct cdev *cdev = inode->i_cdev;
	struct dmirror *dmirror;
	int ret;

	/* Mirror this process address space */
	dmirror = kzalloc(sizeof(*dmirror), GFP_KERNEL);
	if (dmirror == NULL)
		return -ENOMEM;

	dmirror->mdevice = container_of(cdev, struct dmirror_device, cdevice);
	mutex_init(&dmirror->mutex);
	xa_init(&dmirror->pt);

	ret = mmu_interval_notifier_insert(&dmirror->notifier, current->mm,
				0, ULONG_MAX & PAGE_MASK, &dmirror_min_ops);
	if (ret) {
		kfree(dmirror);
		return ret;
	}

	filp->private_data = dmirror;
	return 0;
}

static int dmirror_fops_release(struct inode *inode, struct file *filp)
{
	struct dmirror *dmirror = filp->private_data;

	mmu_interval_notifier_remove(&dmirror->notifier);
	xa_destroy(&dmirror->pt);
	kfree(dmirror);
	return 0;
}

static struct dmirror_device *dmirror_page_to_device(struct page *page)
{
	return container_of(page->pgmap, struct dmirror_chunk,
			    pagemap)->mdevice;
}

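/*
 * Record the pages returned by hmm_range_fault() in the driver's XArray
 * "page table". Writable pages are tagged so that later device writes can
 * be checked without another fault.
 */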
static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range)
{
	unsigned long *pfns = range->hmm_pfns;
	unsigned long pfn;

	for (pfn = (range->start >> PAGE_SHIFT);
	     pfn < (range->end >> PAGE_SHIFT);
	     pfn++, pfns++) {
		struct page *page;
		void *entry;

		/*
		 * Since we asked for hmm_range_fault() to populate pages,
		 * it shouldn't return an error entry on success.
		 */
		WARN_ON(*pfns & HMM_PFN_ERROR);
		WARN_ON(!(*pfns & HMM_PFN_VALID));

		page = hmm_pfn_to_page(*pfns);
		WARN_ON(!page);

		entry = page;
		if (*pfns & HMM_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		else if (WARN_ON(range->default_flags & HMM_PFN_WRITE))
			return -EFAULT;
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry))
			return xa_err(entry);
	}

	return 0;
}

static void dmirror_do_update(struct dmirror *dmirror, unsigned long start,
			      unsigned long end)
{
	unsigned long pfn;
	void *entry;

	/*
	 * The XArray doesn't hold references to pages since it relies on
	 * the mmu notifier to clear page pointers when they become stale.
	 * Therefore, it is OK to just clear the entry.
	 */
	xa_for_each_range(&dmirror->pt, pfn, entry, start >> PAGE_SHIFT,
			  end >> PAGE_SHIFT)
		xa_erase(&dmirror->pt, pfn);
}

static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct dmirror *dmirror = container_of(mni, struct dmirror, notifier);

	/*
	 * Ignore invalidation callbacks for device private pages since
	 * the invalidation is handled as part of the migration process.
	 */
	if (range->event == MMU_NOTIFY_MIGRATE &&
	    range->owner == dmirror->mdevice)
		return true;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	mmu_interval_set_seq(mni, cur_seq);
	dmirror_do_update(dmirror, range->start, range->end);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_min_ops = {
	.invalidate = dmirror_interval_invalidate,
};

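/*
 * Canonical HMM fault sequence: sample the notifier sequence with
 * mmu_interval_read_begin(), call hmm_range_fault() under the mmap read
 * lock, then take the driver lock and use mmu_interval_read_retry() to
 * detect a racing invalidation. On a race, loop until the timeout expires.
 */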
static int dmirror_range_fault(struct dmirror *dmirror,
				struct hmm_range *range)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	int ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);
		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	ret = dmirror_do_fault(dmirror, range);

	mutex_unlock(&dmirror->mutex);
out:
	return ret;
}

static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
			 unsigned long end, bool write)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long addr;
	unsigned long pfns[64];
	struct hmm_range range = {
		.notifier = &dmirror->notifier,
		.hmm_pfns = pfns,
		.pfn_flags_mask = 0,
		.default_flags =
			HMM_PFN_REQ_FAULT | (write ? HMM_PFN_REQ_WRITE : 0),
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return 0;

	for (addr = start; addr < end; addr = range.end) {
		range.start = addr;
		range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);

		ret = dmirror_range_fault(dmirror, &range);
		if (ret)
			break;
	}

	mmput(mm);
	return ret;
}

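/*
 * Copy from the mirrored pages into the bounce buffer. Returns -ENOENT on
 * the first page not present in the XArray; the caller treats that as a
 * device "fault", resolves it with dmirror_fault() and retries.
 */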
static int dmirror_do_read(struct dmirror *dmirror, unsigned long start,
			   unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;
		void *tmp;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page)
			return -ENOENT;

		tmp = kmap(page);
		memcpy(ptr, tmp, PAGE_SIZE);
		kunmap(page);

		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}

static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_read(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, false);
		if (ret)
			break;
		cmd->faults++;
	}

	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

static int dmirror_do_write(struct dmirror *dmirror, unsigned long start,
			    unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;
		void *tmp;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page || xa_pointer_tag(entry) != DPT_XA_TAG_WRITE)
			return -ENOENT;

		tmp = kmap(page);
		memcpy(tmp, ptr, PAGE_SIZE);
		kunmap(page);

		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}

static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	if (copy_from_user(bounce.ptr, u64_to_user_ptr(cmd->ptr),
			   bounce.size)) {
		ret = -EFAULT;
		goto fini;
	}

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_write(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, true);
		if (ret)
			break;
		cmd->faults++;
	}

fini:
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

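/*
 * Carve out a new DEVMEM_CHUNK_SIZE chunk of simulated device memory:
 * reserve a free physical range, remap it as MEMORY_DEVICE_PRIVATE with
 * memremap_pages(), and thread every new page onto the device's free list
 * (optionally handing one page straight back to the caller).
 */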
static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
				   struct page **ppage)
{
	struct dmirror_chunk *devmem;
	struct resource *res;
	unsigned long pfn;
	unsigned long pfn_first;
	unsigned long pfn_last;
	void *ptr;

	devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
	if (!devmem)
		return false;

	res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
				      "hmm_dmirror");
	if (IS_ERR(res))
		goto err_devmem;

	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
	devmem->pagemap.range.start = res->start;
	devmem->pagemap.range.end = res->end;
	devmem->pagemap.nr_range = 1;
	devmem->pagemap.ops = &dmirror_devmem_ops;
	devmem->pagemap.owner = mdevice;

	mutex_lock(&mdevice->devmem_lock);

	if (mdevice->devmem_count == mdevice->devmem_capacity) {
		struct dmirror_chunk **new_chunks;
		unsigned int new_capacity;

		new_capacity = mdevice->devmem_capacity +
				DEVMEM_CHUNKS_RESERVE;
		new_chunks = krealloc(mdevice->devmem_chunks,
				sizeof(new_chunks[0]) * new_capacity,
				GFP_KERNEL);
		if (!new_chunks)
			goto err_release;
		mdevice->devmem_capacity = new_capacity;
		mdevice->devmem_chunks = new_chunks;
	}

	ptr = memremap_pages(&devmem->pagemap, numa_node_id());
	if (IS_ERR(ptr))
		goto err_release;

	devmem->mdevice = mdevice;
	pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT;
	pfn_last = pfn_first + (range_len(&devmem->pagemap.range) >> PAGE_SHIFT);
	mdevice->devmem_chunks[mdevice->devmem_count++] = devmem;

	mutex_unlock(&mdevice->devmem_lock);

	pr_info("added new %u MB chunk (total %u chunks, %u MB) PFNs [0x%lx 0x%lx)\n",
		DEVMEM_CHUNK_SIZE / (1024 * 1024),
		mdevice->devmem_count,
		mdevice->devmem_count * (DEVMEM_CHUNK_SIZE / (1024 * 1024)),
		pfn_first, pfn_last);

	spin_lock(&mdevice->lock);
	for (pfn = pfn_first; pfn < pfn_last; pfn++) {
		struct page *page = pfn_to_page(pfn);

		page->zone_device_data = mdevice->free_pages;
		mdevice->free_pages = page;
	}
	if (ppage) {
		*ppage = mdevice->free_pages;
		mdevice->free_pages = (*ppage)->zone_device_data;
		mdevice->calloc++;
	}
	spin_unlock(&mdevice->lock);

	return true;

err_release:
	mutex_unlock(&mdevice->devmem_lock);
	release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
err_devmem:
	kfree(devmem);

	return false;
}

static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
{
	struct page *dpage = NULL;
	struct page *rpage;

	/*
	 * This is a fake device so we alloc real system memory to store
	 * our device memory.
	 */
	rpage = alloc_page(GFP_HIGHUSER);
	if (!rpage)
		return NULL;

	spin_lock(&mdevice->lock);

	if (mdevice->free_pages) {
		dpage = mdevice->free_pages;
		mdevice->free_pages = dpage->zone_device_data;
		mdevice->calloc++;
		spin_unlock(&mdevice->lock);
	} else {
		spin_unlock(&mdevice->lock);
		if (!dmirror_allocate_chunk(mdevice, &dpage))
			goto error;
	}

	dpage->zone_device_data = rpage;
	lock_page(dpage);
	return dpage;

error:
	__free_page(rpage);
	return NULL;
}

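/*
 * The alloc-and-copy stage of a system-to-device migration: for every
 * source page that migrate_vma_setup() marked migratable, allocate a
 * device private page, copy (or zero) its contents, and fill in the
 * destination migrate PFN array.
 */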
static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
					   struct dmirror *dmirror)
{
	struct dmirror_device *mdevice = dmirror->mdevice;
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long addr;

	for (addr = args->start; addr < args->end; addr += PAGE_SIZE,
						   src++, dst++) {
		struct page *spage;
		struct page *dpage;
		struct page *rpage;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		/*
		 * Note that spage might be NULL which is OK since it is an
		 * unallocated pte_none() or read-only zero page.
		 */
		spage = migrate_pfn_to_page(*src);

		dpage = dmirror_devmem_alloc_page(mdevice);
		if (!dpage)
			continue;

		rpage = dpage->zone_device_data;
		if (spage)
			copy_highpage(rpage, spage);
		else
			clear_highpage(rpage);

		/*
		 * Normally, a device would use the page->zone_device_data to
		 * point to the mirror but here we use it to hold the page for
		 * the simulated device memory and that page holds the pointer
		 * to the mirror.
		 */
		rpage->zone_device_data = dmirror;

		*dst = migrate_pfn(page_to_pfn(dpage));
		if ((*src & MIGRATE_PFN_WRITE) ||
		    (!spage && args->vma->vm_flags & VM_WRITE))
			*dst |= MIGRATE_PFN_WRITE;
	}
}

static int dmirror_check_atomic(struct dmirror *dmirror, unsigned long start,
			     unsigned long end)
{
	unsigned long pfn;

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;

		entry = xa_load(&dmirror->pt, pfn);
		if (xa_pointer_tag(entry) == DPT_XA_TAG_ATOMIC)
			return -EPERM;
	}

	return 0;
}

static int dmirror_atomic_map(unsigned long start, unsigned long end,
			      struct page **pages, struct dmirror *dmirror)
{
	unsigned long pfn, mapped = 0;
	int i;

	/* Map the migrated pages into the device's page tables. */
	mutex_lock(&dmirror->mutex);

	for (i = 0, pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++, i++) {
		void *entry;

		if (!pages[i])
			continue;

		entry = pages[i];
		entry = xa_tag_pointer(entry, DPT_XA_TAG_ATOMIC);
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry)) {
			mutex_unlock(&dmirror->mutex);
			return xa_err(entry);
		}

		mapped++;
	}

	mutex_unlock(&dmirror->mutex);
	return mapped;
}

static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
					    struct dmirror *dmirror)
{
	unsigned long start = args->start;
	unsigned long end = args->end;
	const unsigned long *src = args->src;
	const unsigned long *dst = args->dst;
	unsigned long pfn;

	/* Map the migrated pages into the device's page tables. */
	mutex_lock(&dmirror->mutex);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++,
								src++, dst++) {
		struct page *dpage;
		void *entry;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = migrate_pfn_to_page(*dst);
		if (!dpage)
			continue;

		/*
		 * Store the page that holds the data so the page table
		 * doesn't have to deal with ZONE_DEVICE private pages.
		 */
		entry = dpage->zone_device_data;
		if (*dst & MIGRATE_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry)) {
			mutex_unlock(&dmirror->mutex);
			return xa_err(entry);
		}
	}

	mutex_unlock(&dmirror->mutex);
	return 0;
}

static int dmirror_exclusive(struct dmirror *dmirror,
			     struct hmm_dmirror_cmd *cmd)
{
	unsigned long start, end, addr;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	struct mm_struct *mm = dmirror->notifier.mm;
	struct page *pages[64];
	struct dmirror_bounce bounce;
	unsigned long next;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	mmap_read_lock(mm);
	for (addr = start; addr < end; addr = next) {
		unsigned long mapped = 0;
		int i;

		if (end < addr + (ARRAY_SIZE(pages) << PAGE_SHIFT))
			next = end;
		else
			next = addr + (ARRAY_SIZE(pages) << PAGE_SHIFT);

		ret = make_device_exclusive_range(mm, addr, next, pages, NULL);
		/*
		 * Do dmirror_atomic_map() iff all pages are marked for
		 * exclusive access to avoid accessing uninitialized
		 * fields of pages.
		 */
		if (ret == (next - addr) >> PAGE_SHIFT)
			mapped = dmirror_atomic_map(addr, next, pages, dmirror);
		for (i = 0; i < ret; i++) {
			if (pages[i]) {
				unlock_page(pages[i]);
				put_page(pages[i]);
			}
		}

		if (addr + (mapped << PAGE_SHIFT) < next) {
			mmap_read_unlock(mm);
			mmput(mm);
			return -EBUSY;
		}
	}
	mmap_read_unlock(mm);
	mmput(mm);

	/* Return the migrated data for verification. */
	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	mutex_lock(&dmirror->mutex);
	ret = dmirror_do_read(dmirror, start, end, &bounce);
	mutex_unlock(&dmirror->mutex);
	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}

	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

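/*
 * Migrate a range of the mirrored process to simulated device memory in
 * 64-page batches, one VMA at a time: migrate_vma_setup() collects and
 * isolates the source pages, dmirror_migrate_alloc_and_copy() fills the
 * destination, migrate_vma_pages() swaps the page table entries, and
 * migrate_vma_finalize() releases the old pages.
 */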
static int dmirror_migrate(struct dmirror *dmirror,
			   struct hmm_dmirror_cmd *cmd)
{
	unsigned long start, end, addr;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	struct mm_struct *mm = dmirror->notifier.mm;
	struct vm_area_struct *vma;
	unsigned long src_pfns[64];
	unsigned long dst_pfns[64];
	struct dmirror_bounce bounce;
	struct migrate_vma args;
	unsigned long next;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	mmap_read_lock(mm);
	for (addr = start; addr < end; addr = next) {
		vma = vma_lookup(mm, addr);
		if (!vma || !(vma->vm_flags & VM_READ)) {
			ret = -EINVAL;
			goto out;
		}
		next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
		if (next > vma->vm_end)
			next = vma->vm_end;

		args.vma = vma;
		args.src = src_pfns;
		args.dst = dst_pfns;
		args.start = addr;
		args.end = next;
		args.pgmap_owner = dmirror->mdevice;
		args.flags = MIGRATE_VMA_SELECT_SYSTEM;
		ret = migrate_vma_setup(&args);
		if (ret)
			goto out;

		dmirror_migrate_alloc_and_copy(&args, dmirror);
		migrate_vma_pages(&args);
		dmirror_migrate_finalize_and_map(&args, dmirror);
		migrate_vma_finalize(&args);
	}
	mmap_read_unlock(mm);
	mmput(mm);

	/* Return the migrated data for verification. */
	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	mutex_lock(&dmirror->mutex);
	ret = dmirror_do_read(dmirror, start, end, &bounce);
	mutex_unlock(&dmirror->mutex);
	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;

out:
	mmap_read_unlock(mm);
	mmput(mm);
	return ret;
}

static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
			    unsigned char *perm, unsigned long entry)
{
	struct page *page;

	if (entry & HMM_PFN_ERROR) {
		*perm = HMM_DMIRROR_PROT_ERROR;
		return;
	}
	if (!(entry & HMM_PFN_VALID)) {
		*perm = HMM_DMIRROR_PROT_NONE;
		return;
	}

	page = hmm_pfn_to_page(entry);
	if (is_device_private_page(page)) {
		/* Is the page migrated to this device or some other? */
		if (dmirror->mdevice == dmirror_page_to_device(page))
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
		else
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
	} else if (is_zero_pfn(page_to_pfn(page)))
		*perm = HMM_DMIRROR_PROT_ZERO;
	else
		*perm = HMM_DMIRROR_PROT_NONE;
	if (entry & HMM_PFN_WRITE)
		*perm |= HMM_DMIRROR_PROT_WRITE;
	else
		*perm |= HMM_DMIRROR_PROT_READ;
	if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PMD_SHIFT)
		*perm |= HMM_DMIRROR_PROT_PMD;
	else if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PUD_SHIFT)
		*perm |= HMM_DMIRROR_PROT_PUD;
}

static bool dmirror_snapshot_invalidate(struct mmu_interval_notifier *mni,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct dmirror_interval *dmi =
		container_of(mni, struct dmirror_interval, notifier);
	struct dmirror *dmirror = dmi->dmirror;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	/*
	 * Snapshots only need to set the sequence number since any
	 * invalidation in the interval invalidates the whole snapshot.
	 */
	mmu_interval_set_seq(mni, cur_seq);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_mrn_ops = {
	.invalidate = dmirror_snapshot_invalidate,
};

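/*
 * Snapshot one batch of PFNs without faulting anything in (default_flags
 * is zero, so hmm_range_fault() only reports what is already present).
 * A temporary interval notifier detects racing invalidations using the
 * same begin/fault/retry protocol as dmirror_range_fault(), and the
 * result is encoded as permission bytes for userspace.
 */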
static int dmirror_range_snapshot(struct dmirror *dmirror,
				  struct hmm_range *range,
				  unsigned char *perm)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	struct dmirror_interval notifier;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	unsigned long i;
	unsigned long n;
	int ret = 0;

	notifier.dmirror = dmirror;
	range->notifier = &notifier.notifier;

	ret = mmu_interval_notifier_insert(range->notifier, mm,
			range->start, range->end - range->start,
			&dmirror_mrn_ops);
	if (ret)
		return ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);

		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	n = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0; i < n; i++)
		dmirror_mkentry(dmirror, range, perm + i, range->hmm_pfns[i]);

	mutex_unlock(&dmirror->mutex);
out:
	mmu_interval_notifier_remove(range->notifier);
	return ret;
}

static int dmirror_snapshot(struct dmirror *dmirror,
			    struct hmm_dmirror_cmd *cmd)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	unsigned long addr;
	unsigned long next;
	unsigned long pfns[64];
	unsigned char perm[64];
	char __user *uptr;
	struct hmm_range range = {
		.hmm_pfns = pfns,
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	/*
	 * Register a temporary notifier to detect invalidations even if it
	 * overlaps with other mmu_interval_notifiers.
	 */
	uptr = u64_to_user_ptr(cmd->ptr);
	for (addr = start; addr < end; addr = next) {
		unsigned long n;

		next = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);
		range.start = addr;
		range.end = next;

		ret = dmirror_range_snapshot(dmirror, &range, perm);
		if (ret)
			break;

		n = (range.end - range.start) >> PAGE_SHIFT;
		if (copy_to_user(uptr, perm, n)) {
			ret = -EFAULT;
			break;
		}

		cmd->cpages += n;
		uptr += n;
	}
	mmput(mm);

	return ret;
}

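/*
 * ioctl dispatcher: every command takes a struct hmm_dmirror_cmd holding a
 * page-aligned address, a page count and a user buffer pointer; cpages and
 * faults report back how much work was done.
 *
 * A minimal sketch of userspace usage (the device node name comes from the
 * module's init code outside this excerpt, so treat it as an assumption):
 *
 *	struct hmm_dmirror_cmd cmd = {
 *		.addr = (uintptr_t)buf,		// page-aligned test buffer
 *		.ptr = (uintptr_t)mirror,	// where to copy the data
 *		.npages = 1,
 *	};
 *	int fd = open("/dev/hmm_dmirror0", O_RDWR);	// hypothetical node
 *	ioctl(fd, HMM_DMIRROR_READ, &cmd);	// fault and read via mirror
 */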
static long dmirror_fops_unlocked_ioctl(struct file *filp,
					unsigned int command,
					unsigned long arg)
{
	void __user *uarg = (void __user *)arg;
	struct hmm_dmirror_cmd cmd;
	struct dmirror *dmirror;
	int ret;

	dmirror = filp->private_data;
	if (!dmirror)
		return -EINVAL;

	if (copy_from_user(&cmd, uarg, sizeof(cmd)))
		return -EFAULT;

	if (cmd.addr & ~PAGE_MASK)
		return -EINVAL;
	if (cmd.addr >= (cmd.addr + (cmd.npages << PAGE_SHIFT)))
		return -EINVAL;

	cmd.cpages = 0;
	cmd.faults = 0;

	switch (command) {
	case HMM_DMIRROR_READ:
		ret = dmirror_read(dmirror, &cmd);
		break;

	case HMM_DMIRROR_WRITE:
		ret = dmirror_write(dmirror, &cmd);
		break;

	case HMM_DMIRROR_MIGRATE:
		ret = dmirror_migrate(dmirror, &cmd);
		break;

	case HMM_DMIRROR_EXCLUSIVE:
		ret = dmirror_exclusive(dmirror, &cmd);
		break;

	case HMM_DMIRROR_CHECK_EXCLUSIVE:
		ret = dmirror_check_atomic(dmirror, cmd.addr,
					cmd.addr + (cmd.npages << PAGE_SHIFT));
		break;

	case HMM_DMIRROR_SNAPSHOT:
		ret = dmirror_snapshot(dmirror, &cmd);
		break;

	default:
		return -EINVAL;
	}
	if (ret)
		return ret;

	if (copy_to_user(uarg, &cmd, sizeof(cmd)))
		return -EFAULT;

	return 0;
}

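/*
 * mmap() support: pre-populate the whole VMA with zeroed pages via
 * vm_insert_page(). The reference from alloc_page() is dropped once the
 * page is mapped, so the mapping owns the page.
 */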
109787c01d57SAlistair Popple static int dmirror_fops_mmap(struct file *file, struct vm_area_struct *vma)
109887c01d57SAlistair Popple {
109987c01d57SAlistair Popple 	unsigned long addr;
110087c01d57SAlistair Popple 
110187c01d57SAlistair Popple 	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
110287c01d57SAlistair Popple 		struct page *page;
110387c01d57SAlistair Popple 		int ret;
110487c01d57SAlistair Popple 
110587c01d57SAlistair Popple 		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
110687c01d57SAlistair Popple 		if (!page)
110787c01d57SAlistair Popple 			return -ENOMEM;
110887c01d57SAlistair Popple 
110987c01d57SAlistair Popple 		ret = vm_insert_page(vma, addr, page);
111087c01d57SAlistair Popple 		if (ret) {
111187c01d57SAlistair Popple 			__free_page(page);
111287c01d57SAlistair Popple 			return ret;
111387c01d57SAlistair Popple 		}
111487c01d57SAlistair Popple 		put_page(page);
111587c01d57SAlistair Popple 	}
111687c01d57SAlistair Popple 
111787c01d57SAlistair Popple 	return 0;
111887c01d57SAlistair Popple }
111987c01d57SAlistair Popple 
1120b2ef9f5aSRalph Campbell static const struct file_operations dmirror_fops = {
1121b2ef9f5aSRalph Campbell 	.open		= dmirror_fops_open,
1122b2ef9f5aSRalph Campbell 	.release	= dmirror_fops_release,
112387c01d57SAlistair Popple 	.mmap		= dmirror_fops_mmap,
1124b2ef9f5aSRalph Campbell 	.unlocked_ioctl = dmirror_fops_unlocked_ioctl,
1125b2ef9f5aSRalph Campbell 	.llseek		= default_llseek,
1126b2ef9f5aSRalph Campbell 	.owner		= THIS_MODULE,
1127b2ef9f5aSRalph Campbell };
1128b2ef9f5aSRalph Campbell 
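/*
 * page_free() callback for the simulated device memory: free the backing
 * system page, then thread the ZONE_DEVICE page back onto the per-device
 * free list through its zone_device_data field, under mdevice->lock.
 */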
1129b2ef9f5aSRalph Campbell static void dmirror_devmem_free(struct page *page)
1130b2ef9f5aSRalph Campbell {
1131b2ef9f5aSRalph Campbell 	struct page *rpage = page->zone_device_data;
1132b2ef9f5aSRalph Campbell 	struct dmirror_device *mdevice;
1133b2ef9f5aSRalph Campbell 
1134b2ef9f5aSRalph Campbell 	if (rpage)
1135b2ef9f5aSRalph Campbell 		__free_page(rpage);
1136b2ef9f5aSRalph Campbell 
1137b2ef9f5aSRalph Campbell 	mdevice = dmirror_page_to_device(page);
1138b2ef9f5aSRalph Campbell 
1139b2ef9f5aSRalph Campbell 	spin_lock(&mdevice->lock);
1140b2ef9f5aSRalph Campbell 	mdevice->cfree++;
1141b2ef9f5aSRalph Campbell 	page->zone_device_data = mdevice->free_pages;
1142b2ef9f5aSRalph Campbell 	mdevice->free_pages = page;
1143b2ef9f5aSRalph Campbell 	spin_unlock(&mdevice->lock);
1144b2ef9f5aSRalph Campbell }
1145b2ef9f5aSRalph Campbell 
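/*
 * Fault-time migration back to system memory: for each entry selected by
 * migrate_vma_setup(), allocate a system page, remove the now stale device
 * entry from the mirror page table, and copy the data out of the backing
 * page of the simulated device memory.
 */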
1146b2ef9f5aSRalph Campbell static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
11477d17e83aSRalph Campbell 						      struct dmirror *dmirror)
1148b2ef9f5aSRalph Campbell {
1149b2ef9f5aSRalph Campbell 	const unsigned long *src = args->src;
1150b2ef9f5aSRalph Campbell 	unsigned long *dst = args->dst;
1151b2ef9f5aSRalph Campbell 	unsigned long start = args->start;
1152b2ef9f5aSRalph Campbell 	unsigned long end = args->end;
1153b2ef9f5aSRalph Campbell 	unsigned long addr;
1154b2ef9f5aSRalph Campbell 
1155b2ef9f5aSRalph Campbell 	for (addr = start; addr < end; addr += PAGE_SIZE,
1156b2ef9f5aSRalph Campbell 				       src++, dst++) {
1157b2ef9f5aSRalph Campbell 		struct page *dpage, *spage;
1158b2ef9f5aSRalph Campbell 
1159b2ef9f5aSRalph Campbell 		spage = migrate_pfn_to_page(*src);
1160b2ef9f5aSRalph Campbell 		if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
1161b2ef9f5aSRalph Campbell 			continue;
1162b2ef9f5aSRalph Campbell 		spage = spage->zone_device_data;
1163b2ef9f5aSRalph Campbell 
1164b2ef9f5aSRalph Campbell 		dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
1165b2ef9f5aSRalph Campbell 		if (!dpage)
1166b2ef9f5aSRalph Campbell 			continue;
1167b2ef9f5aSRalph Campbell 
1168b2ef9f5aSRalph Campbell 		lock_page(dpage);
11697d17e83aSRalph Campbell 		xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
1170b2ef9f5aSRalph Campbell 		copy_highpage(dpage, spage);
1171ab09243aSAlistair Popple 		*dst = migrate_pfn(page_to_pfn(dpage));
1172b2ef9f5aSRalph Campbell 		if (*src & MIGRATE_PFN_WRITE)
1173b2ef9f5aSRalph Campbell 			*dst |= MIGRATE_PFN_WRITE;
1174b2ef9f5aSRalph Campbell 	}
1175b2ef9f5aSRalph Campbell 	return 0;
1176b2ef9f5aSRalph Campbell }
1177b2ef9f5aSRalph Campbell 
1178b2ef9f5aSRalph Campbell static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
1179b2ef9f5aSRalph Campbell {
1180b2ef9f5aSRalph Campbell 	struct migrate_vma args;
	/* Initialized to zero so an early error path never sees stale PFNs. */
1181b2ef9f5aSRalph Campbell 	unsigned long src_pfns = 0;
1182b2ef9f5aSRalph Campbell 	unsigned long dst_pfns = 0;
1183b2ef9f5aSRalph Campbell 	struct page *rpage;
1184b2ef9f5aSRalph Campbell 	struct dmirror *dmirror;
1185b2ef9f5aSRalph Campbell 	vm_fault_t ret;
1186b2ef9f5aSRalph Campbell 
1187b2ef9f5aSRalph Campbell 	/*
1188b2ef9f5aSRalph Campbell 	 * Normally, a device would use page->zone_device_data to point to
1189b2ef9f5aSRalph Campbell 	 * the mirror, but here it holds the backing system page for the
1190b2ef9f5aSRalph Campbell 	 * simulated device memory, and that backing page points to the mirror.
1191b2ef9f5aSRalph Campbell 	 */
1192b2ef9f5aSRalph Campbell 	rpage = vmf->page->zone_device_data;
1193b2ef9f5aSRalph Campbell 	dmirror = rpage->zone_device_data;
1194b2ef9f5aSRalph Campbell 
1195b2ef9f5aSRalph Campbell 	/* FIXME demonstrate how we can adjust migrate range */
1196b2ef9f5aSRalph Campbell 	args.vma = vmf->vma;
1197b2ef9f5aSRalph Campbell 	args.start = vmf->address;
1198b2ef9f5aSRalph Campbell 	args.end = args.start + PAGE_SIZE;
1199b2ef9f5aSRalph Campbell 	args.src = &src_pfns;
1200b2ef9f5aSRalph Campbell 	args.dst = &dst_pfns;
12015143192cSRalph Campbell 	args.pgmap_owner = dmirror->mdevice;
12025143192cSRalph Campbell 	args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
1203b2ef9f5aSRalph Campbell 
1204b2ef9f5aSRalph Campbell 	if (migrate_vma_setup(&args))
1205b2ef9f5aSRalph Campbell 		return VM_FAULT_SIGBUS;
1206b2ef9f5aSRalph Campbell 
12077d17e83aSRalph Campbell 	ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
1208b2ef9f5aSRalph Campbell 	if (ret)
1209b2ef9f5aSRalph Campbell 		return ret;
1210b2ef9f5aSRalph Campbell 	migrate_vma_pages(&args);
12117d17e83aSRalph Campbell 	/*
12127d17e83aSRalph Campbell 	 * No device finalize step is needed since
12137d17e83aSRalph Campbell 	 * dmirror_devmem_fault_alloc_and_copy() will have already
12147d17e83aSRalph Campbell 	 * invalidated the device page table.
12157d17e83aSRalph Campbell 	 */
1216b2ef9f5aSRalph Campbell 	migrate_vma_finalize(&args);
1217b2ef9f5aSRalph Campbell 	return 0;
1218b2ef9f5aSRalph Campbell }
1219b2ef9f5aSRalph Campbell 
1220b2ef9f5aSRalph Campbell static const struct dev_pagemap_ops dmirror_devmem_ops = {
1221b2ef9f5aSRalph Campbell 	.page_free	= dmirror_devmem_free,
1222b2ef9f5aSRalph Campbell 	.migrate_to_ram	= dmirror_devmem_fault,
1223b2ef9f5aSRalph Campbell };
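
/*
 * These ops take effect when a device memory chunk is created. As a sketch
 * (see dmirror_allocate_chunk() earlier in this file; error handling elided):
 *
 *	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
 *	devmem->pagemap.ops = &dmirror_devmem_ops;
 *	devmem->pagemap.owner = mdevice;
 *	ptr = memremap_pages(&devmem->pagemap, numa_node_id());
 *
 * The core mm then calls migrate_to_ram() when the CPU faults on a device
 * private PTE and page_free() when a device page's reference count drops.
 */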
1224b2ef9f5aSRalph Campbell 
1225b2ef9f5aSRalph Campbell static int dmirror_device_init(struct dmirror_device *mdevice, int id)
1226b2ef9f5aSRalph Campbell {
1227b2ef9f5aSRalph Campbell 	dev_t dev;
1228b2ef9f5aSRalph Campbell 	int ret;
1229b2ef9f5aSRalph Campbell 
1230b2ef9f5aSRalph Campbell 	dev = MKDEV(MAJOR(dmirror_dev), id);
1231b2ef9f5aSRalph Campbell 	mutex_init(&mdevice->devmem_lock);
1232b2ef9f5aSRalph Campbell 	spin_lock_init(&mdevice->lock);
1233b2ef9f5aSRalph Campbell 
1234b2ef9f5aSRalph Campbell 	cdev_init(&mdevice->cdevice, &dmirror_fops);
1235b2ef9f5aSRalph Campbell 	mdevice->cdevice.owner = THIS_MODULE;
1236b2ef9f5aSRalph Campbell 	ret = cdev_add(&mdevice->cdevice, dev, 1);
1237b2ef9f5aSRalph Campbell 	if (ret)
1238b2ef9f5aSRalph Campbell 		return ret;
1239b2ef9f5aSRalph Campbell 
1240b2ef9f5aSRalph Campbell 	/*
	 * Build a list of free ZONE_DEVICE private struct pages. The result
	 * must not be ignored (dmirror_allocate_chunk() returns false on
	 * failure): a device that can never hold device memory should fail
	 * init and unwind its cdev rather than stay registered.
	 */
1241b2ef9f5aSRalph Campbell 	if (!dmirror_allocate_chunk(mdevice, NULL)) {
		cdev_del(&mdevice->cdevice);
		return -ENOMEM;
	}
1242b2ef9f5aSRalph Campbell 
1243b2ef9f5aSRalph Campbell 	return 0;
1244b2ef9f5aSRalph Campbell }
1245b2ef9f5aSRalph Campbell 
1246b2ef9f5aSRalph Campbell static void dmirror_device_remove(struct dmirror_device *mdevice)
1247b2ef9f5aSRalph Campbell {
1248b2ef9f5aSRalph Campbell 	unsigned int i;
1249b2ef9f5aSRalph Campbell 
1250b2ef9f5aSRalph Campbell 	if (mdevice->devmem_chunks) {
1251b2ef9f5aSRalph Campbell 		for (i = 0; i < mdevice->devmem_count; i++) {
1252b2ef9f5aSRalph Campbell 			struct dmirror_chunk *devmem =
1253b2ef9f5aSRalph Campbell 				mdevice->devmem_chunks[i];
1254b2ef9f5aSRalph Campbell 
1255b2ef9f5aSRalph Campbell 			memunmap_pages(&devmem->pagemap);
1256a4574f63SDan Williams 			release_mem_region(devmem->pagemap.range.start,
1257a4574f63SDan Williams 					   range_len(&devmem->pagemap.range));
1258b2ef9f5aSRalph Campbell 			kfree(devmem);
1259b2ef9f5aSRalph Campbell 		}
1260b2ef9f5aSRalph Campbell 		kfree(mdevice->devmem_chunks);
1261b2ef9f5aSRalph Campbell 	}
1262b2ef9f5aSRalph Campbell 
1263b2ef9f5aSRalph Campbell 	cdev_del(&mdevice->cdevice);
1264b2ef9f5aSRalph Campbell }
1265b2ef9f5aSRalph Campbell 
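/*
 * Module init: allocate one char device region and register DMIRROR_NDEVICES
 * minor devices. The matching /dev/hmm_dmirror<minor> nodes are expected to
 * be created by the test harness (e.g. tools/testing/selftests/vm/test_hmm.sh
 * via mknod), not by this module.
 */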
1266b2ef9f5aSRalph Campbell static int __init hmm_dmirror_init(void)
1267b2ef9f5aSRalph Campbell {
1268b2ef9f5aSRalph Campbell 	int ret;
1269b2ef9f5aSRalph Campbell 	int id;
1270b2ef9f5aSRalph Campbell 
1271b2ef9f5aSRalph Campbell 	ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
1272b2ef9f5aSRalph Campbell 				  "HMM_DMIRROR");
1273b2ef9f5aSRalph Campbell 	if (ret)
1274b2ef9f5aSRalph Campbell 		goto err_unreg;
1275b2ef9f5aSRalph Campbell 
1276b2ef9f5aSRalph Campbell 	for (id = 0; id < DMIRROR_NDEVICES; id++) {
1277b2ef9f5aSRalph Campbell 		ret = dmirror_device_init(dmirror_devices + id, id);
1278b2ef9f5aSRalph Campbell 		if (ret)
1279b2ef9f5aSRalph Campbell 			goto err_chrdev;
1280b2ef9f5aSRalph Campbell 	}
1281b2ef9f5aSRalph Campbell 
1282b2ef9f5aSRalph Campbell 	pr_info("HMM test module loaded. This is only for testing HMM.\n");
1283b2ef9f5aSRalph Campbell 	return 0;
1284b2ef9f5aSRalph Campbell 
1285b2ef9f5aSRalph Campbell err_chrdev:
1286b2ef9f5aSRalph Campbell 	while (--id >= 0)
1287b2ef9f5aSRalph Campbell 		dmirror_device_remove(dmirror_devices + id);
1288b2ef9f5aSRalph Campbell 	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
1289b2ef9f5aSRalph Campbell err_unreg:
1290b2ef9f5aSRalph Campbell 	return ret;
1291b2ef9f5aSRalph Campbell }
1292b2ef9f5aSRalph Campbell 
1293b2ef9f5aSRalph Campbell static void __exit hmm_dmirror_exit(void)
1294b2ef9f5aSRalph Campbell {
1295b2ef9f5aSRalph Campbell 	int id;
1296b2ef9f5aSRalph Campbell 
1297b2ef9f5aSRalph Campbell 	for (id = 0; id < DMIRROR_NDEVICES; id++)
1298b2ef9f5aSRalph Campbell 		dmirror_device_remove(dmirror_devices + id);
1299b2ef9f5aSRalph Campbell 	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
1300b2ef9f5aSRalph Campbell }
1301b2ef9f5aSRalph Campbell 
1302b2ef9f5aSRalph Campbell module_init(hmm_dmirror_init);
1303b2ef9f5aSRalph Campbell module_exit(hmm_dmirror_exit);
1304b2ef9f5aSRalph Campbell MODULE_LICENSE("GPL");
1305