/*
 * fs/eventfd.c
 *
 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include <linux/file.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/kref.h>
#include <linux/eventfd.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

struct eventfd_ctx {
        struct kref kref;
        wait_queue_head_t wqh;
        /*
         * Every time that a write(2) is performed on an eventfd, the
         * value of the __u64 being written is added to "count" and a
         * wakeup is performed on "wqh". A read(2) will return the "count"
         * value to userspace, and will reset "count" to zero. The
         * kernel-side eventfd_signal() also adds to the "count" counter
         * and issues a wakeup.
         */
        __u64 count;
        unsigned int flags;
};

/**
 * eventfd_signal - Adds @n to the eventfd counter.
 * @ctx: [in] Pointer to the eventfd context.
 * @n: [in] Value to be added to the eventfd internal counter. The value
 *          cannot be negative.
 *
 * This function is supposed to be called by the kernel in paths that do not
 * allow sleeping. In this function we allow the counter to reach the
 * ULLONG_MAX value, and we signal this as an overflow condition by returning
 * EPOLLERR to poll(2).
 *
 * Returns the amount by which the counter was incremented. This will be less
 * than @n if the counter has overflowed.
 */
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
{
        unsigned long flags;

        spin_lock_irqsave(&ctx->wqh.lock, flags);
        if (ULLONG_MAX - ctx->count < n)
                n = ULLONG_MAX - ctx->count;
        ctx->count += n;
        if (waitqueue_active(&ctx->wqh))
                wake_up_locked_poll(&ctx->wqh, EPOLLIN);
        spin_unlock_irqrestore(&ctx->wqh.lock, flags);

        return n;
}
EXPORT_SYMBOL_GPL(eventfd_signal);
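
/*
 * Illustrative sketch, not part of this file: a typical kernel-side caller
 * holds an eventfd_ctx reference (obtained earlier via eventfd_ctx_fdget())
 * and uses eventfd_signal() from a non-sleeping path to notify userspace.
 * The my_dev structure and my_dev_notify() below are hypothetical.
 *
 *      static void my_dev_notify(struct my_dev *dev)
 *      {
 *              // Safe in atomic context: adds 1 to the counter and wakes
 *              // any read(2)/poll(2) waiters sleeping on the eventfd.
 *              eventfd_signal(dev->trigger, 1);
 *      }
 *
 *      // On teardown, the reference taken with eventfd_ctx_fdget() is
 *      // dropped with eventfd_ctx_put(dev->trigger).
 */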

static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
        kfree(ctx);
}

static void eventfd_free(struct kref *kref)
{
        struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);

        eventfd_free_ctx(ctx);
}

/**
 * eventfd_ctx_put - Releases a reference to the internal eventfd context.
 * @ctx: [in] Pointer to eventfd context.
 *
 * The eventfd context reference must have been previously acquired either
 * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
 */
void eventfd_ctx_put(struct eventfd_ctx *ctx)
{
        kref_put(&ctx->kref, eventfd_free);
}
EXPORT_SYMBOL_GPL(eventfd_ctx_put);

static int eventfd_release(struct inode *inode, struct file *file)
{
        struct eventfd_ctx *ctx = file->private_data;

        wake_up_poll(&ctx->wqh, EPOLLHUP);
        eventfd_ctx_put(ctx);
        return 0;
}

static struct wait_queue_head *
eventfd_get_poll_head(struct file *file, __poll_t events)
{
        struct eventfd_ctx *ctx = file->private_data;

        return &ctx->wqh;
}

static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask)
{
        struct eventfd_ctx *ctx = file->private_data;
        __poll_t events = 0;
        u64 count;

        /*
         * All writes to ctx->count occur within ctx->wqh.lock. This read
         * can be done outside ctx->wqh.lock because we know that poll_wait
         * takes that lock (through add_wait_queue) if our caller will sleep.
         *
         * The read _can_ therefore seep into add_wait_queue's critical
         * section, but cannot move above it! add_wait_queue's spin_lock acts
         * as an acquire barrier and ensures that the read is ordered properly
         * against the writes. The following CAN happen and is safe:
         *
         *     poll                               write
         *     -----------------                  ------------
         *     lock ctx->wqh.lock (in poll_wait)
         *     count = ctx->count
         *     __add_wait_queue
         *     unlock ctx->wqh.lock
         *                                        lock ctx->wqh.lock
         *                                        ctx->count += n
         *                                        if (waitqueue_active)
         *                                          wake_up_locked_poll
         *                                        unlock ctx->wqh.lock
         *     eventfd_poll_mask returns 0
         *
         * but the following, which would miss a wakeup, cannot happen:
         *
         *     poll                               write
         *     -----------------                  ------------
         *     count = ctx->count (INVALID!)
         *                                        lock ctx->wqh.lock
         *                                        ctx->count += n
         *                                        **waitqueue_active is false**
         *                                        **no wake_up_locked_poll!**
         *                                        unlock ctx->wqh.lock
         *     lock ctx->wqh.lock (in poll_wait)
         *     __add_wait_queue
         *     unlock ctx->wqh.lock
         *     eventfd_poll_mask returns 0
         */
        count = READ_ONCE(ctx->count);

        if (count > 0)
                events |= (EPOLLIN & eventmask);
        if (count == ULLONG_MAX)
                events |= EPOLLERR;
        if (ULLONG_MAX - 1 > count)
                events |= (EPOLLOUT & eventmask);

        return events;
}

static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{
        *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
        ctx->count -= *cnt;
}
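
/*
 * Illustrative sketch, not part of this file: eventfd_ctx_do_read()
 * implements the two read flavours. Without EFD_SEMAPHORE a read returns
 * the whole counter and resets it to zero; with EFD_SEMAPHORE each read
 * returns 1 and decrements the counter by 1. Hypothetical userspace
 * snippet, error handling omitted:
 *
 *      uint64_t v = 3;
 *      int efd = eventfd(0, 0);                // counter == 0
 *      write(efd, &v, sizeof(v));              // counter == 3
 *      read(efd, &v, sizeof(v));               // v == 3, counter reset to 0
 *
 *      int sfd = eventfd(3, EFD_SEMAPHORE);    // counter == 3
 *      read(sfd, &v, sizeof(v));               // v == 1, counter == 2
 */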

/**
 * eventfd_ctx_remove_wait_queue - Reads the current counter and removes the wait queue entry.
 * @ctx: [in] Pointer to eventfd context.
 * @wait: [in] Wait queue to be removed.
 * @cnt: [out] Pointer to the 64-bit counter value.
 *
 * Returns %0 if successful, or the following error codes:
 *
 * -EAGAIN      : The operation would have blocked.
 *
 * This is used to atomically remove a wait queue entry from the eventfd wait
 * queue head, and read/reset the counter value.
 */
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
                                  __u64 *cnt)
{
        unsigned long flags;

        spin_lock_irqsave(&ctx->wqh.lock, flags);
        eventfd_ctx_do_read(ctx, cnt);
        __remove_wait_queue(&ctx->wqh, wait);
        if (*cnt != 0 && waitqueue_active(&ctx->wqh))
                wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
        spin_unlock_irqrestore(&ctx->wqh.lock, flags);

        return *cnt != 0 ? 0 : -EAGAIN;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);

static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
                            loff_t *ppos)
{
        struct eventfd_ctx *ctx = file->private_data;
        ssize_t res;
        __u64 ucnt = 0;
        DECLARE_WAITQUEUE(wait, current);

        if (count < sizeof(ucnt))
                return -EINVAL;

        spin_lock_irq(&ctx->wqh.lock);
        res = -EAGAIN;
        if (ctx->count > 0)
                res = sizeof(ucnt);
        else if (!(file->f_flags & O_NONBLOCK)) {
                __add_wait_queue(&ctx->wqh, &wait);
                for (;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ctx->count > 0) {
                                res = sizeof(ucnt);
                                break;
                        }
                        if (signal_pending(current)) {
                                res = -ERESTARTSYS;
                                break;
                        }
                        spin_unlock_irq(&ctx->wqh.lock);
                        schedule();
                        spin_lock_irq(&ctx->wqh.lock);
                }
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
        if (likely(res > 0)) {
                eventfd_ctx_do_read(ctx, &ucnt);
                if (waitqueue_active(&ctx->wqh))
                        wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
        }
        spin_unlock_irq(&ctx->wqh.lock);

        if (res > 0 && put_user(ucnt, (__u64 __user *)buf))
                return -EFAULT;

        return res;
}

static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
                             loff_t *ppos)
{
        struct eventfd_ctx *ctx = file->private_data;
        ssize_t res;
        __u64 ucnt;
        DECLARE_WAITQUEUE(wait, current);

        if (count < sizeof(ucnt))
                return -EINVAL;
        if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
                return -EFAULT;
        if (ucnt == ULLONG_MAX)
                return -EINVAL;
        spin_lock_irq(&ctx->wqh.lock);
        res = -EAGAIN;
        if (ULLONG_MAX - ctx->count > ucnt)
                res = sizeof(ucnt);
        else if (!(file->f_flags & O_NONBLOCK)) {
                __add_wait_queue(&ctx->wqh, &wait);
                for (res = 0;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ULLONG_MAX - ctx->count > ucnt) {
                                res = sizeof(ucnt);
                                break;
                        }
                        if (signal_pending(current)) {
                                res = -ERESTARTSYS;
                                break;
                        }
                        spin_unlock_irq(&ctx->wqh.lock);
                        schedule();
                        spin_lock_irq(&ctx->wqh.lock);
                }
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
        if (likely(res > 0)) {
                ctx->count += ucnt;
                if (waitqueue_active(&ctx->wqh))
                        wake_up_locked_poll(&ctx->wqh, EPOLLIN);
        }
        spin_unlock_irq(&ctx->wqh.lock);

        return res;
}

#ifdef CONFIG_PROC_FS
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct eventfd_ctx *ctx = f->private_data;

        spin_lock_irq(&ctx->wqh.lock);
        seq_printf(m, "eventfd-count: %16llx\n",
                   (unsigned long long)ctx->count);
        spin_unlock_irq(&ctx->wqh.lock);
}
#endif

static const struct file_operations eventfd_fops = {
#ifdef CONFIG_PROC_FS
        .show_fdinfo    = eventfd_show_fdinfo,
#endif
        .release        = eventfd_release,
        .get_poll_head  = eventfd_get_poll_head,
        .poll_mask      = eventfd_poll_mask,
        .read           = eventfd_read,
        .write          = eventfd_write,
        .llseek         = noop_llseek,
};
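
/*
 * Illustrative sketch, not part of this file: the readiness rules
 * implemented by eventfd_poll_mask() as seen through poll(2). An eventfd
 * is readable (EPOLLIN) when the counter is non-zero, writable (EPOLLOUT)
 * when a value of at least 1 can be added without blocking, and reports
 * EPOLLERR once the counter has overflowed to ULLONG_MAX. Hypothetical
 * userspace snippet:
 *
 *      struct pollfd pfd = { .fd = efd, .events = POLLIN | POLLOUT };
 *      poll(&pfd, 1, -1);
 *      // counter == 0                  -> POLLOUT only
 *      // 0 < counter < ULLONG_MAX - 1  -> POLLIN | POLLOUT
 *      // counter == ULLONG_MAX         -> POLLIN | POLLERR
 */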

/**
 * eventfd_fget - Acquire a reference to an eventfd file descriptor.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the eventfd file structure in case of success, or the
 * following error pointers:
 *
 * -EBADF    : Invalid @fd file descriptor.
 * -EINVAL   : The @fd file descriptor is not an eventfd file.
 */
struct file *eventfd_fget(int fd)
{
        struct file *file;

        file = fget(fd);
        if (!file)
                return ERR_PTR(-EBADF);
        if (file->f_op != &eventfd_fops) {
                fput(file);
                return ERR_PTR(-EINVAL);
        }

        return file;
}
EXPORT_SYMBOL_GPL(eventfd_fget);

/**
 * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the internal eventfd context, otherwise the following
 * error pointers:
 *
 * -EBADF    : Invalid @fd file descriptor.
 * -EINVAL   : The @fd file descriptor is not an eventfd file.
 */
struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
        struct eventfd_ctx *ctx;
        struct fd f = fdget(fd);
        if (!f.file)
                return ERR_PTR(-EBADF);
        ctx = eventfd_ctx_fileget(f.file);
        fdput(f);
        return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);

/**
 * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
 * @file: [in] Eventfd file pointer.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointer:
 *
 * -EINVAL   : The @file is not an eventfd file.
 */
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
{
        struct eventfd_ctx *ctx;

        if (file->f_op != &eventfd_fops)
                return ERR_PTR(-EINVAL);

        ctx = file->private_data;
        kref_get(&ctx->kref);
        return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);

static int do_eventfd(unsigned int count, int flags)
{
        struct eventfd_ctx *ctx;
        int fd;

        /* Check the EFD_* constants for consistency. */
        BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);

        if (flags & ~EFD_FLAGS_SET)
                return -EINVAL;

        ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        kref_init(&ctx->kref);
        init_waitqueue_head(&ctx->wqh);
        ctx->count = count;
        ctx->flags = flags;

        fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
                              O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
        if (fd < 0)
                eventfd_free_ctx(ctx);

        return fd;
}

SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
{
        return do_eventfd(count, flags);
}

SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
        return do_eventfd(count, 0);
}
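
/*
 * Illustrative sketch, not part of this file: do_eventfd() seeds the counter
 * with the syscall's @count argument and rejects flags outside EFD_FLAGS_SET,
 * so from userspace (hypothetical snippet, error handling omitted):
 *
 *      int efd;
 *
 *      efd = eventfd(5, EFD_CLOEXEC | EFD_NONBLOCK);   // counter starts at 5
 *      efd = eventfd(0, 0xffff0000);                   // fails, errno == EINVAL
 */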