/*
 *  fs/userfaultfd.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *  Copyright (C) 2008-2009 Red Hat, Inc.
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 *  This work is licensed under the terms of the GNU GPL, version 2. See
 *  the COPYING file in the top-level directory.
 *
 *  Some part derived from fs/eventfd.c (anon inode setup) and
 *  mm/ksm.c (mm hashing).
 */

#include <linux/hashtable.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/file.h>
#include <linux/bug.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>
#include <linux/ioctl.h>
#include <linux/security.h>

enum userfaultfd_state {
	UFFD_STATE_WAIT_API,
	UFFD_STATE_RUNNING,
};

struct userfaultfd_ctx {
	/* pseudo fd refcounting */
	atomic_t refcount;
	/* waitqueue head for the userfaultfd page faults */
	wait_queue_head_t fault_wqh;
	/* waitqueue head for the pseudo fd to wakeup poll/read */
	wait_queue_head_t fd_wqh;
	/* userfaultfd syscall flags */
	unsigned int flags;
	/* state machine */
	enum userfaultfd_state state;
	/* released */
	bool released;
	/* mm with one or more vmas attached to this userfaultfd_ctx */
	struct mm_struct *mm;
};

struct userfaultfd_wait_queue {
	unsigned long address;
	wait_queue_t wq;
	bool pending;
	struct userfaultfd_ctx *ctx;
};

struct userfaultfd_wake_range {
	unsigned long start;
	unsigned long len;
};

static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
				     int wake_flags, void *key)
{
	struct userfaultfd_wake_range *range = key;
	int ret;
	struct userfaultfd_wait_queue *uwq;
	unsigned long start, len;

	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
	ret = 0;
	/* don't wake the pending ones, to avoid making reads block */
	if (uwq->pending && !ACCESS_ONCE(uwq->ctx->released))
		goto out;
	/* len == 0 means wake all */
	start = range->start;
	len = range->len;
	if (len && (start > uwq->address || start + len <= uwq->address))
		goto out;
	ret = wake_up_state(wq->private, mode);
	if (ret)
		/*
		 * Wake only once, autoremove behavior.
		 *
		 * After the effect of list_del_init is visible to the
		 * other CPUs, the waitqueue may disappear from under
		 * us, see the !list_empty_careful() in
		 * handle_userfault(). try_to_wake_up() has an
		 * implicit smp_mb__before_spinlock, and the
		 * wq->private is read before calling the extern
		 * function "wake_up_state" (which in turn calls
		 * try_to_wake_up). While a bare spin_lock;spin_unlock;
		 * sequence wouldn't be enough, that implicit
		 * smp_mb__before_spinlock is enough to avoid an
		 * explicit smp_mb() here.
		 */
		list_del_init(&wq->task_list);
out:
	return ret;
}

/**
 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to the userfaultfd context.
 *
 * The caller must already hold a reference (i.e. the refcount must be
 * non-zero), otherwise this function BUGs.
 */
static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
{
	if (!atomic_inc_not_zero(&ctx->refcount))
		BUG();
}

/**
 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to userfaultfd context.
 *
 * The userfaultfd context reference must have been previously acquired
 * with userfaultfd_ctx_get().
 */
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
	if (atomic_dec_and_test(&ctx->refcount)) {
		VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
		mmput(ctx->mm);
		kfree(ctx);
	}
}

static inline unsigned long userfault_address(unsigned long address,
					      unsigned int flags,
					      unsigned long reason)
{
	BUILD_BUG_ON(PAGE_SHIFT < UFFD_BITS);
	address &= PAGE_MASK;
	if (flags & FAULT_FLAG_WRITE)
		/*
		 * Encode "write" fault information in the LSB of the
		 * address read by userland, without depending on the
		 * FAULT_FLAG_WRITE kernel-internal value.
		 */
		address |= UFFD_BIT_WRITE;
	if (reason & VM_UFFD_WP)
		/*
		 * Encode "reason" fault information as bit number 1
		 * in the address read by userland. If bit number 1 is
		 * clear it means the reason is a VM_UFFD_MISSING
		 * fault.
		 */
		address |= UFFD_BIT_WP;
	return address;
}
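
/*
 * Illustration only, not part of the kernel build: given the encoding
 * above, a userland monitor could decode the __u64 it read() from the
 * userfaultfd roughly as follows. A minimal sketch, assuming the
 * UFFD_BIT_* definitions from the uapi header:
 *
 *	__u64 addr;
 *	read(uffd, &addr, sizeof(addr));
 *	int wr = !!(addr & UFFD_BIT_WRITE);	// write access fault?
 *	int wp = !!(addr & UFFD_BIT_WP);	// else a MISSING fault
 *	addr &= ~((__u64)sysconf(_SC_PAGESIZE) - 1);	// faulting page
 */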

/*
 * The locking rules involved in returning VM_FAULT_RETRY depending on
 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
 * recommendation in __lock_page_or_retry is not an understatement.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released
 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
 * not set.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
 * set, VM_FAULT_RETRY can still be returned if and only if a fatal
 * signal is pending, and the mmap_sem must likewise be released
 * before returning it.
 */
int handle_userfault(struct vm_area_struct *vma, unsigned long address,
		     unsigned int flags, unsigned long reason)
{
	struct mm_struct *mm = vma->vm_mm;
	struct userfaultfd_ctx *ctx;
	struct userfaultfd_wait_queue uwq;

	BUG_ON(!rwsem_is_locked(&mm->mmap_sem));

	ctx = vma->vm_userfaultfd_ctx.ctx;
	if (!ctx)
		return VM_FAULT_SIGBUS;

	BUG_ON(ctx->mm != mm);

	VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
	VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));

	/*
	 * If it's already released don't get it. This avoids looping
	 * in __get_user_pages if userfaultfd_release waits on the
	 * caller of handle_userfault to release the mmap_sem.
	 */
	if (unlikely(ACCESS_ONCE(ctx->released)))
		return VM_FAULT_SIGBUS;

	/*
	 * Check that we can return VM_FAULT_RETRY.
	 *
	 * NOTE: it should become possible to return VM_FAULT_RETRY
	 * even if FAULT_FLAG_TRIED is set without leading to gup()
	 * -EBUSY failures, if the userfaultfd is to be extended for
	 * VM_UFFD_WP tracking and we intend to arm the userfault
	 * without first stopping userland access to the memory. For
	 * VM_UFFD_MISSING userfaults this is enough for now.
	 */
	if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) {
		/*
		 * Validate the invariant that nowait must allow retry
		 * to be sure not to return SIGBUS erroneously on
		 * nowait invocations.
		 */
		BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
		if (printk_ratelimit()) {
			printk(KERN_WARNING
			       "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags);
			dump_stack();
		}
#endif
		return VM_FAULT_SIGBUS;
	}

	/*
	 * Handle nowait: not much to do other than tell the caller to
	 * retry and wait.
	 */
	if (flags & FAULT_FLAG_RETRY_NOWAIT)
		return VM_FAULT_RETRY;

	/* take the reference before dropping the mmap_sem */
	userfaultfd_ctx_get(ctx);

	/* be gentle and immediately relinquish the mmap_sem */
	up_read(&mm->mmap_sem);

	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
	uwq.wq.private = current;
	uwq.address = userfault_address(address, flags, reason);
	uwq.pending = true;
	uwq.ctx = ctx;

	spin_lock(&ctx->fault_wqh.lock);
	/*
	 * After the __add_wait_queue the uwq is visible to userland
	 * through poll/read().
	 */
	__add_wait_queue(&ctx->fault_wqh, &uwq.wq);
	for (;;) {
		set_current_state(TASK_KILLABLE);
		if (!uwq.pending || ACCESS_ONCE(ctx->released) ||
		    fatal_signal_pending(current))
			break;
		spin_unlock(&ctx->fault_wqh.lock);

		wake_up_poll(&ctx->fd_wqh, POLLIN);
		schedule();

		spin_lock(&ctx->fault_wqh.lock);
	}
	__remove_wait_queue(&ctx->fault_wqh, &uwq.wq);
	__set_current_state(TASK_RUNNING);
	spin_unlock(&ctx->fault_wqh.lock);

	/*
	 * ctx may go away after this if the userfault pseudo fd is
	 * already released.
	 */
	userfaultfd_ctx_put(ctx);

	return VM_FAULT_RETRY;
}

static int userfaultfd_release(struct inode *inode, struct file *file)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev;
	/* len == 0 means wake all */
	struct userfaultfd_wake_range range = { .len = 0, };
	unsigned long new_flags;

	ACCESS_ONCE(ctx->released) = true;

	/*
	 * Flush page faults out of all CPUs. NOTE: all page faults
	 * must be retried without returning VM_FAULT_SIGBUS if
	 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
	 * changes while handle_userfault released the mmap_sem. So
	 * it's critical that released is set to true (above), before
	 * taking the mmap_sem for writing.
	 */
	down_write(&mm->mmap_sem);
	prev = NULL;
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		cond_resched();
		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
		       !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
			prev = vma;
			continue;
		}
		new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
		prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
				 new_flags, vma->anon_vma,
				 vma->vm_file, vma->vm_pgoff,
				 vma_policy(vma),
				 NULL_VM_UFFD_CTX);
		if (prev)
			vma = prev;
		else
			prev = vma;
		vma->vm_flags = new_flags;
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
	}
	up_write(&mm->mmap_sem);

	/*
	 * Now that no new page faults can wait on this fault_wqh,
	 * flush out the last page faults that may have already been
	 * waiting on it.
	 */
	spin_lock(&ctx->fault_wqh.lock);
	__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range);
	spin_unlock(&ctx->fault_wqh.lock);

	wake_up_poll(&ctx->fd_wqh, POLLHUP);
	userfaultfd_ctx_put(ctx);
	return 0;
}

/* fault_wqh.lock must be held by the caller */
static inline unsigned int find_userfault(struct userfaultfd_ctx *ctx,
					  struct userfaultfd_wait_queue **uwq)
{
	wait_queue_t *wq;
	struct userfaultfd_wait_queue *_uwq;
	unsigned int ret = 0;

	VM_BUG_ON(!spin_is_locked(&ctx->fault_wqh.lock));

	list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
		_uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
		if (_uwq->pending) {
			ret = POLLIN;
			if (!uwq)
				/*
				 * If there's at least one pending and
				 * we don't care which one it is,
				 * break immediately and leverage the
				 * efficiency of the LIFO walk.
				 */
				break;
			/*
			 * If we need to find which one was pending we
			 * keep walking until we find the first not
			 * pending one, so we read() them in FIFO order.
			 */
			*uwq = _uwq;
		} else
			/*
			 * break the loop at the first not pending
			 * one, there cannot be pending userfaults
			 * after the first not pending one, because
			 * all new pending ones are inserted at the
			 * head and we walk the list in LIFO order.
			 */
			break;
	}

	return ret;
}

static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	unsigned int ret;

	poll_wait(file, &ctx->fd_wqh, wait);

	switch (ctx->state) {
	case UFFD_STATE_WAIT_API:
		return POLLERR;
	case UFFD_STATE_RUNNING:
		spin_lock(&ctx->fault_wqh.lock);
		ret = find_userfault(ctx, NULL);
		spin_unlock(&ctx->fault_wqh.lock);
		return ret;
	default:
		BUG();
	}
}

static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
				    __u64 *addr)
{
	ssize_t ret;
	DECLARE_WAITQUEUE(wait, current);
	struct userfaultfd_wait_queue *uwq = NULL;

	/* always take the fd_wqh lock before the fault_wqh lock */
	spin_lock(&ctx->fd_wqh.lock);
	__add_wait_queue(&ctx->fd_wqh, &wait);
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock(&ctx->fault_wqh.lock);
		if (find_userfault(ctx, &uwq)) {
			/*
			 * The fault_wqh.lock prevents the uwq from
			 * disappearing from under us.
			 */
			uwq->pending = false;
			/* careful to always initialize addr if ret == 0 */
			*addr = uwq->address;
			spin_unlock(&ctx->fault_wqh.lock);
			ret = 0;
			break;
		}
		spin_unlock(&ctx->fault_wqh.lock);
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (no_wait) {
			ret = -EAGAIN;
			break;
		}
		spin_unlock(&ctx->fd_wqh.lock);
		schedule();
		spin_lock(&ctx->fd_wqh.lock);
	}
	__remove_wait_queue(&ctx->fd_wqh, &wait);
	__set_current_state(TASK_RUNNING);
	spin_unlock(&ctx->fd_wqh.lock);

	return ret;
}

static ssize_t userfaultfd_read(struct file *file, char __user *buf,
				size_t count, loff_t *ppos)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	ssize_t _ret, ret = 0;
	/* careful to always initialize addr if ret == 0 */
	__u64 uninitialized_var(addr);
	int no_wait = file->f_flags & O_NONBLOCK;

	if (ctx->state == UFFD_STATE_WAIT_API)
		return -EINVAL;
	BUG_ON(ctx->state != UFFD_STATE_RUNNING);

	for (;;) {
		if (count < sizeof(addr))
			return ret ? ret : -EINVAL;
		_ret = userfaultfd_ctx_read(ctx, no_wait, &addr);
		if (_ret < 0)
			return ret ? ret : _ret;
		if (put_user(addr, (__u64 __user *) buf))
			return ret ? ret : -EFAULT;
		ret += sizeof(addr);
		buf += sizeof(addr);
		count -= sizeof(addr);
		/*
		 * Allow reading more than one fault at a time, but
		 * only block when waiting for the very first one.
		 */
		no_wait = O_NONBLOCK;
	}
}
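
/*
 * Illustration only, not part of the kernel build: a userland fault
 * handler thread might drain the fd with the read API above. A minimal
 * sketch, assuming uffd is a userfaultfd in blocking mode and
 * resolve_fault() is a hypothetical helper that maps something at the
 * faulting page and then issues UFFDIO_WAKE:
 *
 *	__u64 addrs[32];
 *	for (;;) {
 *		ssize_t n = read(uffd, addrs, sizeof(addrs));
 *		if (n < 0)
 *			break;			// e.g. EINTR
 *		for (size_t i = 0; i < n / sizeof(__u64); i++)
 *			resolve_fault(addrs[i]);
 *	}
 */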

static void __wake_userfault(struct userfaultfd_ctx *ctx,
			     struct userfaultfd_wake_range *range)
{
	unsigned long start, end;

	start = range->start;
	end = range->start + range->len;

	spin_lock(&ctx->fault_wqh.lock);
	/* wake all in the range and autoremove */
	__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, range);
	spin_unlock(&ctx->fault_wqh.lock);
}

static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
					   struct userfaultfd_wake_range *range)
{
	/*
	 * To be sure waitqueue_active() is not reordered by the CPU
	 * before the pagetable update, use an explicit SMP memory
	 * barrier here. PT lock release and up_read(mmap_sem) only
	 * have release semantics, which can still allow the
	 * waitqueue_active() read to be reordered before the pte
	 * update.
	 */
	smp_mb();

	/*
	 * Use waitqueue_active because it's very frequent to
	 * change the address space atomically even if there are no
	 * userfaults yet. So we take the spinlock only when we're
	 * sure we have userfaults to wake.
	 */
	if (waitqueue_active(&ctx->fault_wqh))
		__wake_userfault(ctx, range);
}

static __always_inline int validate_range(struct mm_struct *mm,
					  __u64 start, __u64 len)
{
	__u64 task_size = mm->task_size;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (len & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return -EINVAL;
	if (start < mmap_min_addr)
		return -EINVAL;
	if (start >= task_size)
		return -EINVAL;
	if (len > task_size - start)
		return -EINVAL;
	return 0;
}
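
/*
 * For example, with 4KiB pages and an x86-64 task_size,
 * { start = 0x700000000000, len = 0x2000 } passes validate_range(),
 * while the misaligned { start = 0x700000000123, len = 0x1000 }, any
 * zero-length range, ranges starting below mmap_min_addr, and ranges
 * extending past mm->task_size are all rejected with -EINVAL.
 */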

static int userfaultfd_register(struct userfaultfd_ctx *ctx,
				unsigned long arg)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev, *cur;
	int ret;
	struct uffdio_register uffdio_register;
	struct uffdio_register __user *user_uffdio_register;
	unsigned long vm_flags, new_flags;
	bool found;
	unsigned long start, end, vma_end;

	user_uffdio_register = (struct uffdio_register __user *) arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_register, user_uffdio_register,
			   sizeof(uffdio_register)-sizeof(__u64)))
		goto out;

	ret = -EINVAL;
	if (!uffdio_register.mode)
		goto out;
	if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
				     UFFDIO_REGISTER_MODE_WP))
		goto out;
	vm_flags = 0;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
		vm_flags |= VM_UFFD_MISSING;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
		vm_flags |= VM_UFFD_WP;
		/*
		 * FIXME: remove the below error constraint by
		 * implementing the wprotect tracking mode.
		 */
		ret = -EINVAL;
		goto out;
	}

	ret = validate_range(mm, uffdio_register.range.start,
			     uffdio_register.range.len);
	if (ret)
		goto out;

	start = uffdio_register.range.start;
	end = start + uffdio_register.range.len;

	down_write(&mm->mmap_sem);
	vma = find_vma_prev(mm, start, &prev);

	ret = -ENOMEM;
	if (!vma)
		goto out_unlock;

	/* check that there's at least one vma in the range */
	ret = -EINVAL;
	if (vma->vm_start >= end)
		goto out_unlock;

	/*
	 * Search for incompatible vmas.
	 *
	 * FIXME: this shall be relaxed later so that it doesn't fail
	 * on tmpfs backed vmas (in addition to the current allowance
	 * on anonymous vmas).
	 */
	found = false;
	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
		cond_resched();

		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
		       !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));

		/* check for incompatible vmas */
		ret = -EINVAL;
		if (cur->vm_ops)
			goto out_unlock;

		/*
		 * Check that this vma isn't already owned by a
		 * different userfaultfd. We can't allow more than one
		 * userfaultfd to own a single vma simultaneously or we
		 * wouldn't know which one to deliver the userfaults to.
		 */
		ret = -EBUSY;
		if (cur->vm_userfaultfd_ctx.ctx &&
		    cur->vm_userfaultfd_ctx.ctx != ctx)
			goto out_unlock;

		found = true;
	}
	BUG_ON(!found);

	if (vma->vm_start < start)
		prev = vma;

	ret = 0;
	do {
		cond_resched();

		BUG_ON(vma->vm_ops);
		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
		       vma->vm_userfaultfd_ctx.ctx != ctx);

		/*
		 * Nothing to do: this vma is already registered into this
		 * userfaultfd and with the right tracking mode too.
		 */
		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
		    (vma->vm_flags & vm_flags) == vm_flags)
			goto skip;

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
		prev = vma_merge(mm, prev, start, vma_end, new_flags,
				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
				 vma_policy(vma),
				 ((struct vm_userfaultfd_ctx){ ctx }));
		if (prev) {
			vma = prev;
			goto next;
		}
		if (vma->vm_start < start) {
			ret = split_vma(mm, vma, start, 1);
			if (ret)
				break;
		}
		if (vma->vm_end > end) {
			ret = split_vma(mm, vma, end, 0);
			if (ret)
				break;
		}
	next:
		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		vma->vm_flags = new_flags;
		vma->vm_userfaultfd_ctx.ctx = ctx;

	skip:
		prev = vma;
		start = vma->vm_end;
		vma = vma->vm_next;
	} while (vma && vma->vm_start < end);
out_unlock:
	up_write(&mm->mmap_sem);
	if (!ret) {
		/*
		 * Now that we scanned all vmas we can already tell
		 * userland which ioctl methods are guaranteed to
		 * succeed on this range.
		 */
		if (put_user(UFFD_API_RANGE_IOCTLS,
			     &user_uffdio_register->ioctls))
			ret = -EFAULT;
	}
out:
	return ret;
}
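
/*
 * Illustration only, not part of the kernel build: from userland,
 * registering a freshly mmap()ed anonymous region for missing-page
 * tracking would look roughly like this sketch (area and area_len are
 * hypothetical, error handling omitted):
 *
 *	struct uffdio_register reg = {
 *		.range = { .start = (__u64)area, .len = area_len },
 *		.mode  = UFFDIO_REGISTER_MODE_MISSING,
 *	};
 *	ioctl(uffd, UFFDIO_REGISTER, &reg);
 *	// on success, reg.ioctls reports the UFFD_API_RANGE_IOCTLS
 *	// that are guaranteed to work on this range
 */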

static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
				  unsigned long arg)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev, *cur;
	int ret;
	struct uffdio_range uffdio_unregister;
	unsigned long new_flags;
	bool found;
	unsigned long start, end, vma_end;
	const void __user *buf = (void __user *)arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
		goto out;

	ret = validate_range(mm, uffdio_unregister.start,
			     uffdio_unregister.len);
	if (ret)
		goto out;

	start = uffdio_unregister.start;
	end = start + uffdio_unregister.len;

	down_write(&mm->mmap_sem);
	vma = find_vma_prev(mm, start, &prev);

	ret = -ENOMEM;
	if (!vma)
		goto out_unlock;

	/* check that there's at least one vma in the range */
	ret = -EINVAL;
	if (vma->vm_start >= end)
		goto out_unlock;

	/*
	 * Search for incompatible vmas.
	 *
	 * FIXME: this shall be relaxed later so that it doesn't fail
	 * on tmpfs backed vmas (in addition to the current allowance
	 * on anonymous vmas).
	 */
	found = false;
	ret = -EINVAL;
	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
		cond_resched();

		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
		       !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));

		/*
		 * Check for incompatible vmas. Not strictly required
		 * here, as incompatible vmas cannot have a
		 * userfaultfd_ctx registered on them, but this
		 * provides for more strict behavior to notice
		 * unregistration errors.
		 */
		if (cur->vm_ops)
			goto out_unlock;

		found = true;
	}
	BUG_ON(!found);

	if (vma->vm_start < start)
		prev = vma;

	ret = 0;
	do {
		cond_resched();

		BUG_ON(vma->vm_ops);

		/*
		 * Nothing to do: this vma is not registered with any
		 * userfaultfd, so there is nothing to unregister.
		 */
		if (!vma->vm_userfaultfd_ctx.ctx)
			goto skip;

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
		prev = vma_merge(mm, prev, start, vma_end, new_flags,
				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
				 vma_policy(vma),
				 NULL_VM_UFFD_CTX);
		if (prev) {
			vma = prev;
			goto next;
		}
		if (vma->vm_start < start) {
			ret = split_vma(mm, vma, start, 1);
			if (ret)
				break;
		}
		if (vma->vm_end > end) {
			ret = split_vma(mm, vma, end, 0);
			if (ret)
				break;
		}
	next:
		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		vma->vm_flags = new_flags;
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;

	skip:
		prev = vma;
		start = vma->vm_end;
		vma = vma->vm_next;
	} while (vma && vma->vm_start < end);
out_unlock:
	up_write(&mm->mmap_sem);
out:
	return ret;
}

/*
 * This is mostly needed to re-wakeup those userfaults that were still
 * pending when userland woke them up the first time. We don't wake
 * the pending ones, to avoid making blocking reads block or
 * non-blocking reads return -EAGAIN after POLLIN reported the fd as
 * readable, which would leave userland doubting why POLLIN wasn't
 * reliable.
 */
static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
			    unsigned long arg)
{
	int ret;
	struct uffdio_range uffdio_wake;
	struct userfaultfd_wake_range range;
	const void __user *buf = (void __user *)arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
		goto out;

	ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
	if (ret)
		goto out;

	range.start = uffdio_wake.start;
	range.len = uffdio_wake.len;

	/*
	 * len == 0 means wake all and we don't want to wake all here,
	 * so check it again to be sure.
	 */
	VM_BUG_ON(!range.len);

	wake_userfault(ctx, &range);
	ret = 0;

out:
	return ret;
}
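
/*
 * Illustration only, not part of the kernel build: after resolving a
 * fault, userland re-wakes the blocked faulting threads with something
 * like this sketch (fault_addr and page_sz are hypothetical):
 *
 *	struct uffdio_range wake = {
 *		.start = fault_addr & ~(page_sz - 1),
 *		.len   = page_sz,
 *	};
 *	ioctl(uffd, UFFDIO_WAKE, &wake);
 */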

/*
 * Userland asks for a certain API version, and we return which bits
 * and ioctl commands are implemented in this kernel for that API
 * version, or -EINVAL if the version is unknown.
 */
static int userfaultfd_api(struct userfaultfd_ctx *ctx,
			   unsigned long arg)
{
	struct uffdio_api uffdio_api;
	void __user *buf = (void __user *)arg;
	int ret;

	ret = -EINVAL;
	if (ctx->state != UFFD_STATE_WAIT_API)
		goto out;
	ret = -EFAULT;
	if (copy_from_user(&uffdio_api, buf, sizeof(__u64)))
		goto out;
	if (uffdio_api.api != UFFD_API) {
		/* careful not to leak info, we only read the first 8 bytes */
		memset(&uffdio_api, 0, sizeof(uffdio_api));
		if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
			goto out;
		ret = -EINVAL;
		goto out;
	}
	/* careful not to leak info, we only read the first 8 bytes */
	uffdio_api.bits = UFFD_API_BITS;
	uffdio_api.ioctls = UFFD_API_IOCTLS;
	ret = -EFAULT;
	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
		goto out;
	ctx->state = UFFD_STATE_RUNNING;
	ret = 0;
out:
	return ret;
}
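
/*
 * Illustration only, not part of the kernel build: the very first
 * thing userland must do on a new userfaultfd is this handshake;
 * until it succeeds, read() fails with EINVAL and poll() reports
 * POLLERR. A minimal sketch:
 *
 *	struct uffdio_api api = { .api = UFFD_API };
 *	if (ioctl(uffd, UFFDIO_API, &api) == 0) {
 *		// api.bits and api.ioctls now describe what this
 *		// kernel implements for UFFD_API
 *	}
 */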

static long userfaultfd_ioctl(struct file *file, unsigned cmd,
			      unsigned long arg)
{
	int ret = -EINVAL;
	struct userfaultfd_ctx *ctx = file->private_data;

	switch (cmd) {
	case UFFDIO_API:
		ret = userfaultfd_api(ctx, arg);
		break;
	case UFFDIO_REGISTER:
		ret = userfaultfd_register(ctx, arg);
		break;
	case UFFDIO_UNREGISTER:
		ret = userfaultfd_unregister(ctx, arg);
		break;
	case UFFDIO_WAKE:
		ret = userfaultfd_wake(ctx, arg);
		break;
	}
	return ret;
}

#ifdef CONFIG_PROC_FS
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct userfaultfd_ctx *ctx = f->private_data;
	wait_queue_t *wq;
	struct userfaultfd_wait_queue *uwq;
	unsigned long pending = 0, total = 0;

	spin_lock(&ctx->fault_wqh.lock);
	list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
		uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
		if (uwq->pending)
			pending++;
		total++;
	}
	spin_unlock(&ctx->fault_wqh.lock);

	/*
	 * If more protocols are added in the future, they will all be
	 * shown separated by a space, like this:
	 *	protocols: aa:... bb:...
	 */
	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
		   pending, total, UFFD_API, UFFD_API_BITS,
		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif

static const struct file_operations userfaultfd_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= userfaultfd_show_fdinfo,
#endif
	.release	= userfaultfd_release,
	.poll		= userfaultfd_poll,
	.read		= userfaultfd_read,
	.unlocked_ioctl = userfaultfd_ioctl,
	.compat_ioctl	= userfaultfd_ioctl,
	.llseek		= noop_llseek,
};

/**
 * userfaultfd_file_create - Creates a userfaultfd file pointer.
 * @flags: Flags for the userfaultfd file.
 *
 * This function creates a userfaultfd file pointer, without installing
 * it into the fd table. This is useful when the userfaultfd file is
 * used during the initialization of data structures that require
 * extra setup after the userfaultfd creation. So the userfaultfd
 * creation is split into the file pointer creation phase and the
 * file descriptor installation phase. In this way races with
 * userspace closing the newly installed file descriptor can be
 * avoided. Returns a userfaultfd file pointer, or a proper error
 * pointer.
 */
static struct file *userfaultfd_file_create(int flags)
{
	struct file *file;
	struct userfaultfd_ctx *ctx;

	BUG_ON(!current->mm);

	/* Check the UFFD_* constants for consistency.  */
	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);

	file = ERR_PTR(-EINVAL);
	if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
		goto out;

	file = ERR_PTR(-ENOMEM);
	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		goto out;

	atomic_set(&ctx->refcount, 1);
	init_waitqueue_head(&ctx->fault_wqh);
	init_waitqueue_head(&ctx->fd_wqh);
	ctx->flags = flags;
	ctx->state = UFFD_STATE_WAIT_API;
	ctx->released = false;
	ctx->mm = current->mm;
	/* prevent the mm struct from being freed */
	atomic_inc(&ctx->mm->mm_users);

	file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
				  O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
	if (IS_ERR(file)) {
		/* drop the mm_users pin taken above before freeing ctx */
		mmput(ctx->mm);
		kfree(ctx);
	}
out:
	return file;
}

SYSCALL_DEFINE1(userfaultfd, int, flags)
{
	int fd, error;
	struct file *file;

	error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
	if (error < 0)
		return error;
	fd = error;

	file = userfaultfd_file_create(flags);
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto err_put_unused_fd;
	}
	fd_install(fd, file);

	return fd;

err_put_unused_fd:
	put_unused_fd(fd);

	return error;
}
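
/*
 * Illustration only, not part of the kernel build: userland obtains
 * the fd with the raw syscall (no glibc wrapper at the time this was
 * merged). A minimal sketch:
 *
 *	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 *	if (uffd < 0)
 *		perror("userfaultfd");
 *	// then: UFFDIO_API handshake, UFFDIO_REGISTER, read()/poll()
 */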