/*
 * fs/eventfd.c
 *
 * Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include <linux/file.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/kref.h>
#include <linux/eventfd.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/idr.h>

static DEFINE_IDA(eventfd_ida);

struct eventfd_ctx {
	struct kref kref;
	wait_queue_head_t wqh;
	/*
	 * Every time that a write(2) is performed on an eventfd, the
	 * value of the __u64 being written is added to "count" and a
	 * wakeup is performed on "wqh". A read(2) will return the "count"
	 * value to userspace, and will reset "count" to zero. The kernel
	 * side eventfd_signal() also adds to the "count" counter and
	 * issues a wakeup.
	 */
	__u64 count;
	unsigned int flags;
	int id;
};

/**
 * eventfd_signal - Adds @n to the eventfd counter.
 * @ctx: [in] Pointer to the eventfd context.
 * @n: [in] Value of the counter to be added to the eventfd internal counter.
 *          The value cannot be negative.
 *
 * This function is supposed to be called by the kernel in paths that do not
 * allow sleeping. In this function we allow the counter to reach the
 * ULLONG_MAX value, and we signal this as an overflow condition by returning
 * an EPOLLERR to poll(2).
 *
 * Returns the amount by which the counter was incremented. This will be less
 * than @n if the counter has overflowed.
 */
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
{
	unsigned long flags;

	spin_lock_irqsave(&ctx->wqh.lock, flags);
	if (ULLONG_MAX - ctx->count < n)
		n = ULLONG_MAX - ctx->count;
	ctx->count += n;
	if (waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, EPOLLIN);
	spin_unlock_irqrestore(&ctx->wqh.lock, flags);

	return n;
}
EXPORT_SYMBOL_GPL(eventfd_signal);
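
/*
 * A minimal usage sketch (not part of this file): a hypothetical driver
 * signalling an eventfd from a context that cannot sleep, e.g. an interrupt
 * handler. The "foo_dev" structure, its "trigger" context pointer and the
 * handler name are illustrative only.
 *
 *	static irqreturn_t foo_irq(int irq, void *data)
 *	{
 *		struct foo_dev *dev = data;
 *
 *		if (dev->trigger)
 *			eventfd_signal(dev->trigger, 1);
 *		return IRQ_HANDLED;
 *	}
 *
 * The return value of eventfd_signal() may be less than the requested
 * increment if the counter would otherwise exceed ULLONG_MAX.
 */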

static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
	if (ctx->id >= 0)
		ida_simple_remove(&eventfd_ida, ctx->id);
	kfree(ctx);
}

static void eventfd_free(struct kref *kref)
{
	struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);

	eventfd_free_ctx(ctx);
}

/**
 * eventfd_ctx_put - Releases a reference to the internal eventfd context.
 * @ctx: [in] Pointer to eventfd context.
 *
 * The eventfd context reference must have been previously acquired either
 * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
 */
void eventfd_ctx_put(struct eventfd_ctx *ctx)
{
	kref_put(&ctx->kref, eventfd_free);
}
EXPORT_SYMBOL_GPL(eventfd_ctx_put);

static int eventfd_release(struct inode *inode, struct file *file)
{
	struct eventfd_ctx *ctx = file->private_data;

	wake_up_poll(&ctx->wqh, EPOLLHUP);
	eventfd_ctx_put(ctx);
	return 0;
}

static __poll_t eventfd_poll(struct file *file, poll_table *wait)
{
	struct eventfd_ctx *ctx = file->private_data;
	__poll_t events = 0;
	u64 count;

	poll_wait(file, &ctx->wqh, wait);

	/*
	 * All writes to ctx->count occur within ctx->wqh.lock. This read
	 * can be done outside ctx->wqh.lock because we know that poll_wait
	 * takes that lock (through add_wait_queue) if our caller will sleep.
	 *
	 * The read _can_ therefore seep into add_wait_queue's critical
	 * section, but cannot move above it!  add_wait_queue's spin_lock acts
	 * as an acquire barrier and ensures that the read be ordered properly
	 * against the writes.  The following CAN happen and is safe:
	 *
	 *     poll                               write
	 *     -----------------                  ------------
	 *     lock ctx->wqh.lock (in poll_wait)
	 *     count = ctx->count
	 *     __add_wait_queue
	 *     unlock ctx->wqh.lock
	 *                                        lock ctx->wqh.lock
	 *                                        ctx->count += n
	 *                                        if (waitqueue_active)
	 *                                          wake_up_locked_poll
	 *                                        unlock ctx->wqh.lock
	 *     eventfd_poll returns 0
	 *
	 * but the following, which would miss a wakeup, cannot happen:
	 *
	 *     poll                               write
	 *     -----------------                  ------------
	 *     count = ctx->count (INVALID!)
	 *                                        lock ctx->wqh.lock
	 *                                        ctx->count += n
	 *                                        **waitqueue_active is false**
	 *                                        **no wake_up_locked_poll!**
	 *                                        unlock ctx->wqh.lock
	 *     lock ctx->wqh.lock (in poll_wait)
	 *     __add_wait_queue
	 *     unlock ctx->wqh.lock
	 *     eventfd_poll returns 0
	 */
	count = READ_ONCE(ctx->count);

	if (count > 0)
		events |= EPOLLIN;
	if (count == ULLONG_MAX)
		events |= EPOLLERR;
	if (ULLONG_MAX - 1 > count)
		events |= EPOLLOUT;

	return events;
}
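
/*
 * Consume the counter with ctx->wqh.lock held: in EFD_SEMAPHORE mode a read
 * returns 1 and decrements the counter by one, otherwise it returns the
 * whole counter value and resets it to zero.
 */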
static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{
	*cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
	ctx->count -= *cnt;
}

/**
 * eventfd_ctx_remove_wait_queue - Reads the current counter and removes the
 *                                 wait queue entry.
 * @ctx: [in] Pointer to eventfd context.
 * @wait: [in] Wait queue to be removed.
 * @cnt: [out] Pointer to the 64-bit counter value.
 *
 * Returns %0 if successful, or the following error code:
 *
 * -EAGAIN      : The operation would have blocked.
 *
 * This is used to atomically remove a wait queue entry from the eventfd wait
 * queue head, and read/reset the counter value.
 */
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
				  __u64 *cnt)
{
	unsigned long flags;

	spin_lock_irqsave(&ctx->wqh.lock, flags);
	eventfd_ctx_do_read(ctx, cnt);
	__remove_wait_queue(&ctx->wqh, wait);
	if (*cnt != 0 && waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
	spin_unlock_irqrestore(&ctx->wqh.lock, flags);

	return *cnt != 0 ? 0 : -EAGAIN;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);

static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
			    loff_t *ppos)
{
	struct eventfd_ctx *ctx = file->private_data;
	ssize_t res;
	__u64 ucnt = 0;
	DECLARE_WAITQUEUE(wait, current);

	if (count < sizeof(ucnt))
		return -EINVAL;

	spin_lock_irq(&ctx->wqh.lock);
	res = -EAGAIN;
	if (ctx->count > 0)
		res = sizeof(ucnt);
	else if (!(file->f_flags & O_NONBLOCK)) {
		__add_wait_queue(&ctx->wqh, &wait);
		for (;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (ctx->count > 0) {
				res = sizeof(ucnt);
				break;
			}
			if (signal_pending(current)) {
				res = -ERESTARTSYS;
				break;
			}
			spin_unlock_irq(&ctx->wqh.lock);
			schedule();
			spin_lock_irq(&ctx->wqh.lock);
		}
		__remove_wait_queue(&ctx->wqh, &wait);
		__set_current_state(TASK_RUNNING);
	}
	if (likely(res > 0)) {
		eventfd_ctx_do_read(ctx, &ucnt);
		if (waitqueue_active(&ctx->wqh))
			wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
	}
	spin_unlock_irq(&ctx->wqh.lock);

	if (res > 0 && put_user(ucnt, (__u64 __user *)buf))
		return -EFAULT;

	return res;
}

static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
			     loff_t *ppos)
{
	struct eventfd_ctx *ctx = file->private_data;
	ssize_t res;
	__u64 ucnt;
	DECLARE_WAITQUEUE(wait, current);

	if (count < sizeof(ucnt))
		return -EINVAL;
	if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
		return -EFAULT;
	if (ucnt == ULLONG_MAX)
		return -EINVAL;
	spin_lock_irq(&ctx->wqh.lock);
	res = -EAGAIN;
	if (ULLONG_MAX - ctx->count > ucnt)
		res = sizeof(ucnt);
	else if (!(file->f_flags & O_NONBLOCK)) {
		__add_wait_queue(&ctx->wqh, &wait);
		for (res = 0;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (ULLONG_MAX - ctx->count > ucnt) {
				res = sizeof(ucnt);
				break;
			}
			if (signal_pending(current)) {
				res = -ERESTARTSYS;
				break;
			}
			spin_unlock_irq(&ctx->wqh.lock);
			schedule();
			spin_lock_irq(&ctx->wqh.lock);
		}
		__remove_wait_queue(&ctx->wqh, &wait);
		__set_current_state(TASK_RUNNING);
	}
	if (likely(res > 0)) {
		ctx->count += ucnt;
		if (waitqueue_active(&ctx->wqh))
			wake_up_locked_poll(&ctx->wqh, EPOLLIN);
	}
	spin_unlock_irq(&ctx->wqh.lock);

	return res;
}

#ifdef CONFIG_PROC_FS
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventfd_ctx *ctx = f->private_data;

	spin_lock_irq(&ctx->wqh.lock);
	seq_printf(m, "eventfd-count: %16llx\n",
		   (unsigned long long)ctx->count);
	spin_unlock_irq(&ctx->wqh.lock);
	seq_printf(m, "eventfd-id: %d\n", ctx->id);
}
#endif

static const struct file_operations eventfd_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= eventfd_show_fdinfo,
#endif
	.release	= eventfd_release,
	.poll		= eventfd_poll,
	.read		= eventfd_read,
	.write		= eventfd_write,
	.llseek		= noop_llseek,
};
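
/*
 * The helpers below let other kernel subsystems resolve a user-supplied
 * eventfd file descriptor into a long-lived context. A purely illustrative
 * pattern (names hypothetical) would be:
 *
 *	ctx = eventfd_ctx_fdget(fd);
 *	if (IS_ERR(ctx))
 *		return PTR_ERR(ctx);
 *	...
 *	eventfd_signal(ctx, 1);
 *	...
 *	eventfd_ctx_put(ctx);
 *
 * The fdget/fileget helpers take a reference on the context, so it remains
 * valid even after userspace closes the descriptor.
 */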
/**
 * eventfd_fget - Acquire a reference to an eventfd file descriptor.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the eventfd file structure in case of success, or the
 * following error pointer:
 *
 * -EBADF	: Invalid @fd file descriptor.
 * -EINVAL	: The @fd file descriptor is not an eventfd file.
 */
struct file *eventfd_fget(int fd)
{
	struct file *file;

	file = fget(fd);
	if (!file)
		return ERR_PTR(-EBADF);
	if (file->f_op != &eventfd_fops) {
		fput(file);
		return ERR_PTR(-EINVAL);
	}

	return file;
}
EXPORT_SYMBOL_GPL(eventfd_fget);

/**
 * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointers returned by the following functions:
 *
 * eventfd_fget
 */
struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
	struct eventfd_ctx *ctx;
	struct fd f = fdget(fd);

	if (!f.file)
		return ERR_PTR(-EBADF);
	ctx = eventfd_ctx_fileget(f.file);
	fdput(f);
	return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);

/**
 * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
 * @file: [in] Eventfd file pointer.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointer:
 *
 * -EINVAL	: The @file pointer is not an eventfd file.
 */
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
{
	struct eventfd_ctx *ctx;

	if (file->f_op != &eventfd_fops)
		return ERR_PTR(-EINVAL);

	ctx = file->private_data;
	kref_get(&ctx->kref);
	return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);

static int do_eventfd(unsigned int count, int flags)
{
	struct eventfd_ctx *ctx;
	int fd;

	/* Check the EFD_* constants for consistency.  */
	BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);

	if (flags & ~EFD_FLAGS_SET)
		return -EINVAL;

	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	kref_init(&ctx->kref);
	init_waitqueue_head(&ctx->wqh);
	ctx->count = count;
	ctx->flags = flags;
	ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);

	fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
			      O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
	if (fd < 0)
		eventfd_free_ctx(ctx);

	return fd;
}

SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
{
	return do_eventfd(count, flags);
}

SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
	return do_eventfd(count, 0);
}
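
/*
 * Userspace view (illustrative only, not part of the kernel build): a
 * minimal sketch of the syscall interface implemented above, assuming the
 * glibc eventfd(2) wrapper from <sys/eventfd.h>.
 *
 *	#include <sys/eventfd.h>
 *	#include <unistd.h>
 *	#include <stdint.h>
 *
 *	int main(void)
 *	{
 *		uint64_t val = 3;
 *		int efd = eventfd(0, EFD_CLOEXEC);
 *
 *		if (efd < 0)
 *			return 1;
 *		write(efd, &val, sizeof(val));
 *		read(efd, &val, sizeof(val));
 *		close(efd);
 *		return 0;
 *	}
 *
 * The write adds 3 to the counter; the read returns 3 and resets the counter
 * to zero. With EFD_SEMAPHORE the same read would have returned 1 and left
 * the counter at 2.
 */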