/*
 *  fs/eventfd.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include <linux/file.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/kref.h>
#include <linux/eventfd.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

struct eventfd_ctx {
	struct kref kref;
	wait_queue_head_t wqh;
	/*
	 * Every time that a write(2) is performed on an eventfd, the
	 * value of the __u64 being written is added to "count" and a
	 * wakeup is performed on "wqh". A read(2) will return the "count"
	 * value to userspace, and will reset "count" to zero. The kernel
	 * side eventfd_signal() also adds to the "count" counter and
	 * issues a wakeup.
	 */
	__u64 count;
	unsigned int flags;
};

/**
 * eventfd_signal - Adds @n to the eventfd counter.
 * @ctx: [in] Pointer to the eventfd context.
 * @n: [in] Value of the counter to be added to the eventfd internal counter.
 *          The value cannot be negative.
 *
 * This function is supposed to be called by the kernel in paths that do not
 * allow sleeping. In this function we allow the counter to reach the
 * ULLONG_MAX value, and we signal this as an overflow condition by returning
 * an EPOLLERR to poll(2).
 *
 * Returns the amount by which the counter was incremented. This will be less
 * than @n if the counter has overflowed.
 */
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
{
	unsigned long flags;

	spin_lock_irqsave(&ctx->wqh.lock, flags);
	if (ULLONG_MAX - ctx->count < n)
		n = ULLONG_MAX - ctx->count;
	ctx->count += n;
	if (waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, EPOLLIN);
	spin_unlock_irqrestore(&ctx->wqh.lock, flags);

	return n;
}
EXPORT_SYMBOL_GPL(eventfd_signal);
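/*
 * Illustrative usage sketch (not part of this file): a minimal, hypothetical
 * in-kernel consumer of eventfd_signal() and the context-reference API below,
 * assuming a driver that resolves a userspace-supplied eventfd once in process
 * context and then signals it from atomic context. The "my_*" names are
 * examples only; the block is kept out of the build.
 */
#if 0
struct my_notifier {
	struct eventfd_ctx *trigger;
};

/* Process context: take a reference on the eventfd behind @fd. */
static int my_notifier_setup(struct my_notifier *n, int fd)
{
	struct eventfd_ctx *ctx = eventfd_ctx_fdget(fd);

	if (IS_ERR(ctx))
		return PTR_ERR(ctx);
	n->trigger = ctx;
	return 0;
}

/* Atomic context (e.g. an irq handler): eventfd_signal() never sleeps. */
static void my_notifier_kick(struct my_notifier *n)
{
	eventfd_signal(n->trigger, 1);
}

/* Teardown: drop the context reference taken in my_notifier_setup(). */
static void my_notifier_teardown(struct my_notifier *n)
{
	eventfd_ctx_put(n->trigger);
}
#endif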
static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
	kfree(ctx);
}

static void eventfd_free(struct kref *kref)
{
	struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);

	eventfd_free_ctx(ctx);
}

/**
 * eventfd_ctx_put - Releases a reference to the internal eventfd context.
 * @ctx: [in] Pointer to eventfd context.
 *
 * The eventfd context reference must have been previously acquired either
 * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
 */
void eventfd_ctx_put(struct eventfd_ctx *ctx)
{
	kref_put(&ctx->kref, eventfd_free);
}
EXPORT_SYMBOL_GPL(eventfd_ctx_put);

static int eventfd_release(struct inode *inode, struct file *file)
{
	struct eventfd_ctx *ctx = file->private_data;

	wake_up_poll(&ctx->wqh, EPOLLHUP);
	eventfd_ctx_put(ctx);
	return 0;
}

static __poll_t eventfd_poll(struct file *file, poll_table *wait)
{
	struct eventfd_ctx *ctx = file->private_data;
	__poll_t events = 0;
	u64 count;

	poll_wait(file, &ctx->wqh, wait);

	/*
	 * All writes to ctx->count occur within ctx->wqh.lock. This read
	 * can be done outside ctx->wqh.lock because we know that poll_wait
	 * takes that lock (through add_wait_queue) if our caller will sleep.
	 *
	 * The read _can_ therefore seep into add_wait_queue's critical
	 * section, but cannot move above it! add_wait_queue's spin_lock acts
	 * as an acquire barrier and ensures that the read be ordered properly
	 * against the writes. The following CAN happen and is safe:
	 *
	 *     poll                               write
	 *     -----------------                  ------------
	 *     lock ctx->wqh.lock (in poll_wait)
	 *     count = ctx->count
	 *     __add_wait_queue
	 *     unlock ctx->wqh.lock
	 *                                        lock ctx->wqh.lock
	 *                                        ctx->count += n
	 *                                        if (waitqueue_active)
	 *                                          wake_up_locked_poll
	 *                                        unlock ctx->wqh.lock
	 *     eventfd_poll returns 0
	 *
	 * but the following, which would miss a wakeup, cannot happen:
	 *
	 *     poll                               write
	 *     -----------------                  ------------
	 *     count = ctx->count (INVALID!)
	 *                                        lock ctx->wqh.lock
	 *                                        ctx->count += n
	 *                                        **waitqueue_active is false**
	 *                                        **no wake_up_locked_poll!**
	 *                                        unlock ctx->wqh.lock
	 *     lock ctx->wqh.lock (in poll_wait)
	 *     __add_wait_queue
	 *     unlock ctx->wqh.lock
	 *     eventfd_poll returns 0
	 */
	count = READ_ONCE(ctx->count);

	if (count > 0)
		events |= EPOLLIN;
	if (count == ULLONG_MAX)
		events |= EPOLLERR;
	if (ULLONG_MAX - 1 > count)
		events |= EPOLLOUT;

	return events;
}

static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{
	*cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
	ctx->count -= *cnt;
}

/**
 * eventfd_ctx_remove_wait_queue - Read the current counter and remove the wait queue.
 * @ctx: [in] Pointer to eventfd context.
 * @wait: [in] Wait queue to be removed.
 * @cnt: [out] Pointer to the 64-bit counter value.
 *
 * Returns %0 if successful, or the following error codes:
 *
 * -EAGAIN      : The operation would have blocked.
 *
 * This is used to atomically remove a wait queue entry from the eventfd wait
 * queue head, and read/reset the counter value.
 */
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
				  __u64 *cnt)
{
	unsigned long flags;

	spin_lock_irqsave(&ctx->wqh.lock, flags);
	eventfd_ctx_do_read(ctx, cnt);
	__remove_wait_queue(&ctx->wqh, wait);
	if (*cnt != 0 && waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
	spin_unlock_irqrestore(&ctx->wqh.lock, flags);

	return *cnt != 0 ? 0 : -EAGAIN;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);

static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
			    loff_t *ppos)
{
	struct eventfd_ctx *ctx = file->private_data;
	ssize_t res;
	__u64 ucnt = 0;
	DECLARE_WAITQUEUE(wait, current);

	if (count < sizeof(ucnt))
		return -EINVAL;

	spin_lock_irq(&ctx->wqh.lock);
	res = -EAGAIN;
	if (ctx->count > 0)
		res = sizeof(ucnt);
	else if (!(file->f_flags & O_NONBLOCK)) {
		__add_wait_queue(&ctx->wqh, &wait);
		for (;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (ctx->count > 0) {
				res = sizeof(ucnt);
				break;
			}
			if (signal_pending(current)) {
				res = -ERESTARTSYS;
				break;
			}
			spin_unlock_irq(&ctx->wqh.lock);
			schedule();
			spin_lock_irq(&ctx->wqh.lock);
		}
		__remove_wait_queue(&ctx->wqh, &wait);
		__set_current_state(TASK_RUNNING);
	}
	if (likely(res > 0)) {
		eventfd_ctx_do_read(ctx, &ucnt);
		if (waitqueue_active(&ctx->wqh))
			wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
	}
	spin_unlock_irq(&ctx->wqh.lock);

	if (res > 0 && put_user(ucnt, (__u64 __user *)buf))
		return -EFAULT;

	return res;
}

static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
			     loff_t *ppos)
{
	struct eventfd_ctx *ctx = file->private_data;
	ssize_t res;
	__u64 ucnt;
	DECLARE_WAITQUEUE(wait, current);

	if (count < sizeof(ucnt))
		return -EINVAL;
	if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
		return -EFAULT;
	if (ucnt == ULLONG_MAX)
		return -EINVAL;
	spin_lock_irq(&ctx->wqh.lock);
	res = -EAGAIN;
	if (ULLONG_MAX - ctx->count > ucnt)
		res = sizeof(ucnt);
	else if (!(file->f_flags & O_NONBLOCK)) {
		__add_wait_queue(&ctx->wqh, &wait);
		for (res = 0;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (ULLONG_MAX - ctx->count > ucnt) {
				res = sizeof(ucnt);
				break;
			}
			if (signal_pending(current)) {
				res = -ERESTARTSYS;
				break;
			}
			spin_unlock_irq(&ctx->wqh.lock);
			schedule();
			spin_lock_irq(&ctx->wqh.lock);
		}
		__remove_wait_queue(&ctx->wqh, &wait);
		__set_current_state(TASK_RUNNING);
	}
	if (likely(res > 0)) {
		ctx->count += ucnt;
		if (waitqueue_active(&ctx->wqh))
			wake_up_locked_poll(&ctx->wqh, EPOLLIN);
	}
	spin_unlock_irq(&ctx->wqh.lock);

	return res;
}

#ifdef CONFIG_PROC_FS
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventfd_ctx *ctx = f->private_data;

	spin_lock_irq(&ctx->wqh.lock);
	seq_printf(m, "eventfd-count: %16llx\n",
		   (unsigned long long)ctx->count);
	spin_unlock_irq(&ctx->wqh.lock);
}
#endif

static const struct file_operations eventfd_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= eventfd_show_fdinfo,
#endif
	.release	= eventfd_release,
	.poll		= eventfd_poll,
	.read		= eventfd_read,
	.write		= eventfd_write,
	.llseek		= noop_llseek,
};

/**
 * eventfd_fget - Acquire a reference of an eventfd file descriptor.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the eventfd file structure in case of success, or one
 * of the following error pointers:
 *
 * -EBADF    : Invalid @fd file descriptor.
 * -EINVAL   : The @fd file descriptor is not an eventfd file.
 */
struct file *eventfd_fget(int fd)
{
	struct file *file;

	file = fget(fd);
	if (!file)
		return ERR_PTR(-EBADF);
	if (file->f_op != &eventfd_fops) {
		fput(file);
		return ERR_PTR(-EINVAL);
	}

	return file;
}
EXPORT_SYMBOL_GPL(eventfd_fget);

/**
 * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the internal eventfd context, otherwise one of the
 * following error pointers:
 *
 * -EBADF    : Invalid @fd file descriptor.
 * -EINVAL   : The @fd file descriptor is not an eventfd file.
 */
struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
	struct eventfd_ctx *ctx;
	struct fd f = fdget(fd);
	if (!f.file)
		return ERR_PTR(-EBADF);
	ctx = eventfd_ctx_fileget(f.file);
	fdput(f);
	return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);

/**
 * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
 * @file: [in] Eventfd file pointer.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointer:
 *
 * -EINVAL   : The @file pointer is not an eventfd file.
 */
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
{
	struct eventfd_ctx *ctx;

	if (file->f_op != &eventfd_fops)
		return ERR_PTR(-EINVAL);

	ctx = file->private_data;
	kref_get(&ctx->kref);
	return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);

SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
{
	struct eventfd_ctx *ctx;
	int fd;

	/* Check the EFD_* constants for consistency. */
	BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);

	if (flags & ~EFD_FLAGS_SET)
		return -EINVAL;

	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	kref_init(&ctx->kref);
	init_waitqueue_head(&ctx->wqh);
	ctx->count = count;
	ctx->flags = flags;

	fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
			      O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
	if (fd < 0)
		eventfd_free_ctx(ctx);

	return fd;
}

SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
	return sys_eventfd2(count, 0);
}
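/*
 * Illustrative usage sketch (not part of the kernel build): the counter
 * semantics implemented above, as observed from userspace through the glibc
 * eventfd(2) wrapper. A hypothetical, minimal example kept out of compilation.
 */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	uint64_t val = 3;
	int efd = eventfd(0, EFD_NONBLOCK);	/* reaches sys_eventfd2(0, EFD_NONBLOCK) */

	if (efd < 0)
		return 1;

	/* eventfd_write() adds each written value to ctx->count. */
	write(efd, &val, sizeof(val));
	write(efd, &val, sizeof(val));

	/* eventfd_read() returns the accumulated count (6) and resets it to 0. */
	read(efd, &val, sizeof(val));
	printf("count read: %llu\n", (unsigned long long)val);

	/* The counter is now 0; with EFD_NONBLOCK a further read fails with EAGAIN. */
	if (read(efd, &val, sizeof(val)) < 0)
		perror("read");

	close(efd);
	return 0;
}
#endif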