xref: /openbmc/linux/fs/eventfd.c (revision e1ad7468c77ddb94b0615d5f50fa255525fde0f0)
1*e1ad7468SDavide Libenzi /*
2*e1ad7468SDavide Libenzi  *  fs/eventfd.c
3*e1ad7468SDavide Libenzi  *
4*e1ad7468SDavide Libenzi  *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
5*e1ad7468SDavide Libenzi  *
6*e1ad7468SDavide Libenzi  */
7*e1ad7468SDavide Libenzi 
8*e1ad7468SDavide Libenzi #include <linux/file.h>
9*e1ad7468SDavide Libenzi #include <linux/poll.h>
10*e1ad7468SDavide Libenzi #include <linux/init.h>
11*e1ad7468SDavide Libenzi #include <linux/fs.h>
12*e1ad7468SDavide Libenzi #include <linux/sched.h>
13*e1ad7468SDavide Libenzi #include <linux/kernel.h>
14*e1ad7468SDavide Libenzi #include <linux/list.h>
15*e1ad7468SDavide Libenzi #include <linux/spinlock.h>
16*e1ad7468SDavide Libenzi #include <linux/anon_inodes.h>
17*e1ad7468SDavide Libenzi #include <linux/eventfd.h>
18*e1ad7468SDavide Libenzi 
/* Per-eventfd state, stored in file->private_data. */
struct eventfd_ctx {
	spinlock_t lock;		/* protects "count" and serializes wakeups */
	wait_queue_head_t wqh;		/* readers/writers blocked on the counter */
	/*
	 * Every time that a write(2) is performed on an eventfd, the
	 * value of the __u64 being written is added to "count" and a
	 * wakeup is performed on "wqh". A read(2) will return the "count"
	 * value to userspace, and will reset "count" to zero. The kernel
	 * side eventfd_signal() also adds to the "count" counter and
	 * issues a wakeup.
	 */
	__u64 count;
};
32*e1ad7468SDavide Libenzi 
33*e1ad7468SDavide Libenzi /*
34*e1ad7468SDavide Libenzi  * Adds "n" to the eventfd counter "count". Returns "n" in case of
35*e1ad7468SDavide Libenzi  * success, or a value lower then "n" in case of coutner overflow.
36*e1ad7468SDavide Libenzi  * This function is supposed to be called by the kernel in paths
37*e1ad7468SDavide Libenzi  * that do not allow sleeping. In this function we allow the counter
38*e1ad7468SDavide Libenzi  * to reach the ULLONG_MAX value, and we signal this as overflow
39*e1ad7468SDavide Libenzi  * condition by returining a POLLERR to poll(2).
40*e1ad7468SDavide Libenzi  */
41*e1ad7468SDavide Libenzi int eventfd_signal(struct file *file, int n)
42*e1ad7468SDavide Libenzi {
43*e1ad7468SDavide Libenzi 	struct eventfd_ctx *ctx = file->private_data;
44*e1ad7468SDavide Libenzi 	unsigned long flags;
45*e1ad7468SDavide Libenzi 
46*e1ad7468SDavide Libenzi 	if (n < 0)
47*e1ad7468SDavide Libenzi 		return -EINVAL;
48*e1ad7468SDavide Libenzi 	spin_lock_irqsave(&ctx->lock, flags);
49*e1ad7468SDavide Libenzi 	if (ULLONG_MAX - ctx->count < n)
50*e1ad7468SDavide Libenzi 		n = (int) (ULLONG_MAX - ctx->count);
51*e1ad7468SDavide Libenzi 	ctx->count += n;
52*e1ad7468SDavide Libenzi 	if (waitqueue_active(&ctx->wqh))
53*e1ad7468SDavide Libenzi 		wake_up_locked(&ctx->wqh);
54*e1ad7468SDavide Libenzi 	spin_unlock_irqrestore(&ctx->lock, flags);
55*e1ad7468SDavide Libenzi 
56*e1ad7468SDavide Libenzi 	return n;
57*e1ad7468SDavide Libenzi }
58*e1ad7468SDavide Libenzi 
59*e1ad7468SDavide Libenzi static int eventfd_release(struct inode *inode, struct file *file)
60*e1ad7468SDavide Libenzi {
61*e1ad7468SDavide Libenzi 	kfree(file->private_data);
62*e1ad7468SDavide Libenzi 	return 0;
63*e1ad7468SDavide Libenzi }
64*e1ad7468SDavide Libenzi 
65*e1ad7468SDavide Libenzi static unsigned int eventfd_poll(struct file *file, poll_table *wait)
66*e1ad7468SDavide Libenzi {
67*e1ad7468SDavide Libenzi 	struct eventfd_ctx *ctx = file->private_data;
68*e1ad7468SDavide Libenzi 	unsigned int events = 0;
69*e1ad7468SDavide Libenzi 	unsigned long flags;
70*e1ad7468SDavide Libenzi 
71*e1ad7468SDavide Libenzi 	poll_wait(file, &ctx->wqh, wait);
72*e1ad7468SDavide Libenzi 
73*e1ad7468SDavide Libenzi 	spin_lock_irqsave(&ctx->lock, flags);
74*e1ad7468SDavide Libenzi 	if (ctx->count > 0)
75*e1ad7468SDavide Libenzi 		events |= POLLIN;
76*e1ad7468SDavide Libenzi 	if (ctx->count == ULLONG_MAX)
77*e1ad7468SDavide Libenzi 		events |= POLLERR;
78*e1ad7468SDavide Libenzi 	if (ULLONG_MAX - 1 > ctx->count)
79*e1ad7468SDavide Libenzi 		events |= POLLOUT;
80*e1ad7468SDavide Libenzi 	spin_unlock_irqrestore(&ctx->lock, flags);
81*e1ad7468SDavide Libenzi 
82*e1ad7468SDavide Libenzi 	return events;
83*e1ad7468SDavide Libenzi }
84*e1ad7468SDavide Libenzi 
/*
 * read(2) on an eventfd: return the current 8-byte counter value and
 * reset it to zero. Blocks (unless O_NONBLOCK) while the counter is
 * zero; a successful read wakes writers that may be waiting for room.
 */
static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
			    loff_t *ppos)
{
	struct eventfd_ctx *ctx = file->private_data;
	ssize_t res;
	__u64 ucnt;
	DECLARE_WAITQUEUE(wait, current);

	/* The eventfd ABI transfers exactly one __u64 per read. */
	if (count < sizeof(ucnt))
		return -EINVAL;
	spin_lock_irq(&ctx->lock);
	res = -EAGAIN;
	ucnt = ctx->count;
	if (ucnt > 0)
		res = sizeof(ucnt);
	else if (!(file->f_flags & O_NONBLOCK)) {
		/*
		 * Open-coded interruptible wait: we already hold ctx->lock,
		 * so we enqueue with the lock-free __add_wait_queue() and
		 * drop/retake the lock around schedule().
		 */
		__add_wait_queue(&ctx->wqh, &wait);
		for (res = 0;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (ctx->count > 0) {
				/* Re-sample under the lock before returning. */
				ucnt = ctx->count;
				res = sizeof(ucnt);
				break;
			}
			if (signal_pending(current)) {
				res = -ERESTARTSYS;
				break;
			}
			spin_unlock_irq(&ctx->lock);
			schedule();
			spin_lock_irq(&ctx->lock);
		}
		__remove_wait_queue(&ctx->wqh, &wait);
		__set_current_state(TASK_RUNNING);
	}
	if (res > 0) {
		/* Consume the counter and let blocked writers proceed. */
		ctx->count = 0;
		if (waitqueue_active(&ctx->wqh))
			wake_up_locked(&ctx->wqh);
	}
	spin_unlock_irq(&ctx->lock);
	/* Copy to userspace only after dropping the irq-disabling lock. */
	if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
		return -EFAULT;

	return res;
}
131*e1ad7468SDavide Libenzi 
/*
 * write(2) on an eventfd: add the 8-byte user value to the counter.
 * Blocks (unless O_NONBLOCK) while the addition would push the counter
 * to ULLONG_MAX or beyond; a successful write wakes blocked readers.
 * Writing ULLONG_MAX itself is rejected, since that value is reserved
 * to signal overflow via POLLERR.
 */
static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
			     loff_t *ppos)
{
	struct eventfd_ctx *ctx = file->private_data;
	ssize_t res;
	__u64 ucnt;
	DECLARE_WAITQUEUE(wait, current);

	/* The eventfd ABI transfers exactly one __u64 per write. */
	if (count < sizeof(ucnt))
		return -EINVAL;
	/* Copy from userspace before taking the irq-disabling lock. */
	if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
		return -EFAULT;
	if (ucnt == ULLONG_MAX)
		return -EINVAL;
	spin_lock_irq(&ctx->lock);
	res = -EAGAIN;
	/* Proceed only if the addition leaves the counter below ULLONG_MAX. */
	if (ULLONG_MAX - ctx->count > ucnt)
		res = sizeof(ucnt);
	else if (!(file->f_flags & O_NONBLOCK)) {
		/*
		 * Open-coded interruptible wait, mirroring eventfd_read():
		 * enqueue while holding ctx->lock, drop/retake it around
		 * schedule(), re-test the room condition each wakeup.
		 */
		__add_wait_queue(&ctx->wqh, &wait);
		for (res = 0;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (ULLONG_MAX - ctx->count > ucnt) {
				res = sizeof(ucnt);
				break;
			}
			if (signal_pending(current)) {
				res = -ERESTARTSYS;
				break;
			}
			spin_unlock_irq(&ctx->lock);
			schedule();
			spin_lock_irq(&ctx->lock);
		}
		__remove_wait_queue(&ctx->wqh, &wait);
		__set_current_state(TASK_RUNNING);
	}
	if (res > 0) {
		/* Publish the new count and let blocked readers proceed. */
		ctx->count += ucnt;
		if (waitqueue_active(&ctx->wqh))
			wake_up_locked(&ctx->wqh);
	}
	spin_unlock_irq(&ctx->lock);

	return res;
}
178*e1ad7468SDavide Libenzi 
/* File operations backing every eventfd file descriptor. */
static const struct file_operations eventfd_fops = {
	.release	= eventfd_release,
	.poll		= eventfd_poll,
	.read		= eventfd_read,
	.write		= eventfd_write,
};
185*e1ad7468SDavide Libenzi 
186*e1ad7468SDavide Libenzi struct file *eventfd_fget(int fd)
187*e1ad7468SDavide Libenzi {
188*e1ad7468SDavide Libenzi 	struct file *file;
189*e1ad7468SDavide Libenzi 
190*e1ad7468SDavide Libenzi 	file = fget(fd);
191*e1ad7468SDavide Libenzi 	if (!file)
192*e1ad7468SDavide Libenzi 		return ERR_PTR(-EBADF);
193*e1ad7468SDavide Libenzi 	if (file->f_op != &eventfd_fops) {
194*e1ad7468SDavide Libenzi 		fput(file);
195*e1ad7468SDavide Libenzi 		return ERR_PTR(-EINVAL);
196*e1ad7468SDavide Libenzi 	}
197*e1ad7468SDavide Libenzi 
198*e1ad7468SDavide Libenzi 	return file;
199*e1ad7468SDavide Libenzi }
200*e1ad7468SDavide Libenzi 
201*e1ad7468SDavide Libenzi asmlinkage long sys_eventfd(unsigned int count)
202*e1ad7468SDavide Libenzi {
203*e1ad7468SDavide Libenzi 	int error, fd;
204*e1ad7468SDavide Libenzi 	struct eventfd_ctx *ctx;
205*e1ad7468SDavide Libenzi 	struct file *file;
206*e1ad7468SDavide Libenzi 	struct inode *inode;
207*e1ad7468SDavide Libenzi 
208*e1ad7468SDavide Libenzi 	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
209*e1ad7468SDavide Libenzi 	if (!ctx)
210*e1ad7468SDavide Libenzi 		return -ENOMEM;
211*e1ad7468SDavide Libenzi 
212*e1ad7468SDavide Libenzi 	init_waitqueue_head(&ctx->wqh);
213*e1ad7468SDavide Libenzi 	spin_lock_init(&ctx->lock);
214*e1ad7468SDavide Libenzi 	ctx->count = count;
215*e1ad7468SDavide Libenzi 
216*e1ad7468SDavide Libenzi 	/*
217*e1ad7468SDavide Libenzi 	 * When we call this, the initialization must be complete, since
218*e1ad7468SDavide Libenzi 	 * anon_inode_getfd() will install the fd.
219*e1ad7468SDavide Libenzi 	 */
220*e1ad7468SDavide Libenzi 	error = anon_inode_getfd(&fd, &inode, &file, "[eventfd]",
221*e1ad7468SDavide Libenzi 				 &eventfd_fops, ctx);
222*e1ad7468SDavide Libenzi 	if (!error)
223*e1ad7468SDavide Libenzi 		return fd;
224*e1ad7468SDavide Libenzi 
225*e1ad7468SDavide Libenzi 	kfree(ctx);
226*e1ad7468SDavide Libenzi 	return error;
227*e1ad7468SDavide Libenzi }
228*e1ad7468SDavide Libenzi 
229