1 /* 2 * An async IO implementation for Linux 3 * Written by Benjamin LaHaise <bcrl@kvack.org> 4 * 5 * Implements an efficient asynchronous io interface. 6 * 7 * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. 8 * 9 * See ../COPYING for licensing terms. 10 */ 11 #define pr_fmt(fmt) "%s: " fmt, __func__ 12 13 #include <linux/kernel.h> 14 #include <linux/init.h> 15 #include <linux/errno.h> 16 #include <linux/time.h> 17 #include <linux/aio_abi.h> 18 #include <linux/export.h> 19 #include <linux/syscalls.h> 20 #include <linux/backing-dev.h> 21 #include <linux/uio.h> 22 23 #include <linux/sched/signal.h> 24 #include <linux/fs.h> 25 #include <linux/file.h> 26 #include <linux/mm.h> 27 #include <linux/mman.h> 28 #include <linux/mmu_context.h> 29 #include <linux/percpu.h> 30 #include <linux/slab.h> 31 #include <linux/timer.h> 32 #include <linux/aio.h> 33 #include <linux/highmem.h> 34 #include <linux/workqueue.h> 35 #include <linux/security.h> 36 #include <linux/eventfd.h> 37 #include <linux/blkdev.h> 38 #include <linux/compat.h> 39 #include <linux/migrate.h> 40 #include <linux/ramfs.h> 41 #include <linux/percpu-refcount.h> 42 #include <linux/mount.h> 43 44 #include <asm/kmap_types.h> 45 #include <linux/uaccess.h> 46 47 #include "internal.h" 48 49 #define KIOCB_KEY 0 50 51 #define AIO_RING_MAGIC 0xa10a10a1 52 #define AIO_RING_COMPAT_FEATURES 1 53 #define AIO_RING_INCOMPAT_FEATURES 0 54 struct aio_ring { 55 unsigned id; /* kernel internal index number */ 56 unsigned nr; /* number of io_events */ 57 unsigned head; /* Written to by userland or under ring_lock 58 * mutex by aio_read_events_ring(). */ 59 unsigned tail; 60 61 unsigned magic; 62 unsigned compat_features; 63 unsigned incompat_features; 64 unsigned header_length; /* size of aio_ring */ 65 66 67 struct io_event io_events[0]; 68 }; /* 128 bytes + ring size */ 69 70 #define AIO_RING_PAGES 8 71 72 struct kioctx_table { 73 struct rcu_head rcu; 74 unsigned nr; 75 struct kioctx __rcu *table[]; 76 }; 77 78 struct kioctx_cpu { 79 unsigned reqs_available; 80 }; 81 82 struct ctx_rq_wait { 83 struct completion comp; 84 atomic_t count; 85 }; 86 87 struct kioctx { 88 struct percpu_ref users; 89 atomic_t dead; 90 91 struct percpu_ref reqs; 92 93 unsigned long user_id; 94 95 struct __percpu kioctx_cpu *cpu; 96 97 /* 98 * For percpu reqs_available, number of slots we move to/from global 99 * counter at a time: 100 */ 101 unsigned req_batch; 102 /* 103 * This is what userspace passed to io_setup(), it's not used for 104 * anything but counting against the global max_reqs quota. 105 * 106 * The real limit is nr_events - 1, which will be larger (see 107 * aio_setup_ring()) 108 */ 109 unsigned max_reqs; 110 111 /* Size of ringbuffer, in units of struct io_event */ 112 unsigned nr_events; 113 114 unsigned long mmap_base; 115 unsigned long mmap_size; 116 117 struct page **ring_pages; 118 long nr_pages; 119 120 struct rcu_work free_rwork; /* see free_ioctx() */ 121 122 /* 123 * signals when all in-flight requests are done 124 */ 125 struct ctx_rq_wait *rq_wait; 126 127 struct { 128 /* 129 * This counts the number of available slots in the ringbuffer, 130 * so we avoid overflowing it: it's decremented (if positive) 131 * when allocating a kiocb and incremented when the resulting 132 * io_event is pulled off the ringbuffer. 133 * 134 * We batch accesses to it with a percpu version. 135 */ 136 atomic_t reqs_available; 137 } ____cacheline_aligned_in_smp; 138 139 struct { 140 spinlock_t ctx_lock; 141 struct list_head active_reqs; /* used for cancellation */ 142 } ____cacheline_aligned_in_smp; 143 144 struct { 145 struct mutex ring_lock; 146 wait_queue_head_t wait; 147 } ____cacheline_aligned_in_smp; 148 149 struct { 150 unsigned tail; 151 unsigned completed_events; 152 spinlock_t completion_lock; 153 } ____cacheline_aligned_in_smp; 154 155 struct page *internal_pages[AIO_RING_PAGES]; 156 struct file *aio_ring_file; 157 158 unsigned id; 159 }; 160 161 struct fsync_iocb { 162 struct work_struct work; 163 struct file *file; 164 bool datasync; 165 }; 166 167 struct aio_kiocb { 168 union { 169 struct kiocb rw; 170 struct fsync_iocb fsync; 171 }; 172 173 struct kioctx *ki_ctx; 174 kiocb_cancel_fn *ki_cancel; 175 176 struct iocb __user *ki_user_iocb; /* user's aiocb */ 177 __u64 ki_user_data; /* user's data for completion */ 178 179 struct list_head ki_list; /* the aio core uses this 180 * for cancellation */ 181 182 /* 183 * If the aio_resfd field of the userspace iocb is not zero, 184 * this is the underlying eventfd context to deliver events to. 185 */ 186 struct eventfd_ctx *ki_eventfd; 187 }; 188 189 /*------ sysctl variables----*/ 190 static DEFINE_SPINLOCK(aio_nr_lock); 191 unsigned long aio_nr; /* current system wide number of aio requests */ 192 unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ 193 /*----end sysctl variables---*/ 194 195 static struct kmem_cache *kiocb_cachep; 196 static struct kmem_cache *kioctx_cachep; 197 198 static struct vfsmount *aio_mnt; 199 200 static const struct file_operations aio_ring_fops; 201 static const struct address_space_operations aio_ctx_aops; 202 203 static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) 204 { 205 struct qstr this = QSTR_INIT("[aio]", 5); 206 struct file *file; 207 struct path path; 208 struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb); 209 if (IS_ERR(inode)) 210 return ERR_CAST(inode); 211 212 inode->i_mapping->a_ops = &aio_ctx_aops; 213 inode->i_mapping->private_data = ctx; 214 inode->i_size = PAGE_SIZE * nr_pages; 215 216 path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this); 217 if (!path.dentry) { 218 iput(inode); 219 return ERR_PTR(-ENOMEM); 220 } 221 path.mnt = mntget(aio_mnt); 222 223 d_instantiate(path.dentry, inode); 224 file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &aio_ring_fops); 225 if (IS_ERR(file)) { 226 path_put(&path); 227 return file; 228 } 229 230 file->f_flags = O_RDWR; 231 return file; 232 } 233 234 static struct dentry *aio_mount(struct file_system_type *fs_type, 235 int flags, const char *dev_name, void *data) 236 { 237 static const struct dentry_operations ops = { 238 .d_dname = simple_dname, 239 }; 240 struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, &ops, 241 AIO_RING_MAGIC); 242 243 if (!IS_ERR(root)) 244 root->d_sb->s_iflags |= SB_I_NOEXEC; 245 return root; 246 } 247 248 /* aio_setup 249 * Creates the slab caches used by the aio routines, panic on 250 * failure as this is done early during the boot sequence. 251 */ 252 static int __init aio_setup(void) 253 { 254 static struct file_system_type aio_fs = { 255 .name = "aio", 256 .mount = aio_mount, 257 .kill_sb = kill_anon_super, 258 }; 259 aio_mnt = kern_mount(&aio_fs); 260 if (IS_ERR(aio_mnt)) 261 panic("Failed to create aio fs mount."); 262 263 kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); 264 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); 265 return 0; 266 } 267 __initcall(aio_setup); 268 269 static void put_aio_ring_file(struct kioctx *ctx) 270 { 271 struct file *aio_ring_file = ctx->aio_ring_file; 272 struct address_space *i_mapping; 273 274 if (aio_ring_file) { 275 truncate_setsize(file_inode(aio_ring_file), 0); 276 277 /* Prevent further access to the kioctx from migratepages */ 278 i_mapping = aio_ring_file->f_mapping; 279 spin_lock(&i_mapping->private_lock); 280 i_mapping->private_data = NULL; 281 ctx->aio_ring_file = NULL; 282 spin_unlock(&i_mapping->private_lock); 283 284 fput(aio_ring_file); 285 } 286 } 287 288 static void aio_free_ring(struct kioctx *ctx) 289 { 290 int i; 291 292 /* Disconnect the kiotx from the ring file. This prevents future 293 * accesses to the kioctx from page migration. 294 */ 295 put_aio_ring_file(ctx); 296 297 for (i = 0; i < ctx->nr_pages; i++) { 298 struct page *page; 299 pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, 300 page_count(ctx->ring_pages[i])); 301 page = ctx->ring_pages[i]; 302 if (!page) 303 continue; 304 ctx->ring_pages[i] = NULL; 305 put_page(page); 306 } 307 308 if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) { 309 kfree(ctx->ring_pages); 310 ctx->ring_pages = NULL; 311 } 312 } 313 314 static int aio_ring_mremap(struct vm_area_struct *vma) 315 { 316 struct file *file = vma->vm_file; 317 struct mm_struct *mm = vma->vm_mm; 318 struct kioctx_table *table; 319 int i, res = -EINVAL; 320 321 spin_lock(&mm->ioctx_lock); 322 rcu_read_lock(); 323 table = rcu_dereference(mm->ioctx_table); 324 for (i = 0; i < table->nr; i++) { 325 struct kioctx *ctx; 326 327 ctx = rcu_dereference(table->table[i]); 328 if (ctx && ctx->aio_ring_file == file) { 329 if (!atomic_read(&ctx->dead)) { 330 ctx->user_id = ctx->mmap_base = vma->vm_start; 331 res = 0; 332 } 333 break; 334 } 335 } 336 337 rcu_read_unlock(); 338 spin_unlock(&mm->ioctx_lock); 339 return res; 340 } 341 342 static const struct vm_operations_struct aio_ring_vm_ops = { 343 .mremap = aio_ring_mremap, 344 #if IS_ENABLED(CONFIG_MMU) 345 .fault = filemap_fault, 346 .map_pages = filemap_map_pages, 347 .page_mkwrite = filemap_page_mkwrite, 348 #endif 349 }; 350 351 static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) 352 { 353 vma->vm_flags |= VM_DONTEXPAND; 354 vma->vm_ops = &aio_ring_vm_ops; 355 return 0; 356 } 357 358 static const struct file_operations aio_ring_fops = { 359 .mmap = aio_ring_mmap, 360 }; 361 362 #if IS_ENABLED(CONFIG_MIGRATION) 363 static int aio_migratepage(struct address_space *mapping, struct page *new, 364 struct page *old, enum migrate_mode mode) 365 { 366 struct kioctx *ctx; 367 unsigned long flags; 368 pgoff_t idx; 369 int rc; 370 371 /* 372 * We cannot support the _NO_COPY case here, because copy needs to 373 * happen under the ctx->completion_lock. That does not work with the 374 * migration workflow of MIGRATE_SYNC_NO_COPY. 375 */ 376 if (mode == MIGRATE_SYNC_NO_COPY) 377 return -EINVAL; 378 379 rc = 0; 380 381 /* mapping->private_lock here protects against the kioctx teardown. */ 382 spin_lock(&mapping->private_lock); 383 ctx = mapping->private_data; 384 if (!ctx) { 385 rc = -EINVAL; 386 goto out; 387 } 388 389 /* The ring_lock mutex. The prevents aio_read_events() from writing 390 * to the ring's head, and prevents page migration from mucking in 391 * a partially initialized kiotx. 392 */ 393 if (!mutex_trylock(&ctx->ring_lock)) { 394 rc = -EAGAIN; 395 goto out; 396 } 397 398 idx = old->index; 399 if (idx < (pgoff_t)ctx->nr_pages) { 400 /* Make sure the old page hasn't already been changed */ 401 if (ctx->ring_pages[idx] != old) 402 rc = -EAGAIN; 403 } else 404 rc = -EINVAL; 405 406 if (rc != 0) 407 goto out_unlock; 408 409 /* Writeback must be complete */ 410 BUG_ON(PageWriteback(old)); 411 get_page(new); 412 413 rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1); 414 if (rc != MIGRATEPAGE_SUCCESS) { 415 put_page(new); 416 goto out_unlock; 417 } 418 419 /* Take completion_lock to prevent other writes to the ring buffer 420 * while the old page is copied to the new. This prevents new 421 * events from being lost. 422 */ 423 spin_lock_irqsave(&ctx->completion_lock, flags); 424 migrate_page_copy(new, old); 425 BUG_ON(ctx->ring_pages[idx] != old); 426 ctx->ring_pages[idx] = new; 427 spin_unlock_irqrestore(&ctx->completion_lock, flags); 428 429 /* The old page is no longer accessible. */ 430 put_page(old); 431 432 out_unlock: 433 mutex_unlock(&ctx->ring_lock); 434 out: 435 spin_unlock(&mapping->private_lock); 436 return rc; 437 } 438 #endif 439 440 static const struct address_space_operations aio_ctx_aops = { 441 .set_page_dirty = __set_page_dirty_no_writeback, 442 #if IS_ENABLED(CONFIG_MIGRATION) 443 .migratepage = aio_migratepage, 444 #endif 445 }; 446 447 static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) 448 { 449 struct aio_ring *ring; 450 struct mm_struct *mm = current->mm; 451 unsigned long size, unused; 452 int nr_pages; 453 int i; 454 struct file *file; 455 456 /* Compensate for the ring buffer's head/tail overlap entry */ 457 nr_events += 2; /* 1 is required, 2 for good luck */ 458 459 size = sizeof(struct aio_ring); 460 size += sizeof(struct io_event) * nr_events; 461 462 nr_pages = PFN_UP(size); 463 if (nr_pages < 0) 464 return -EINVAL; 465 466 file = aio_private_file(ctx, nr_pages); 467 if (IS_ERR(file)) { 468 ctx->aio_ring_file = NULL; 469 return -ENOMEM; 470 } 471 472 ctx->aio_ring_file = file; 473 nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) 474 / sizeof(struct io_event); 475 476 ctx->ring_pages = ctx->internal_pages; 477 if (nr_pages > AIO_RING_PAGES) { 478 ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), 479 GFP_KERNEL); 480 if (!ctx->ring_pages) { 481 put_aio_ring_file(ctx); 482 return -ENOMEM; 483 } 484 } 485 486 for (i = 0; i < nr_pages; i++) { 487 struct page *page; 488 page = find_or_create_page(file->f_mapping, 489 i, GFP_HIGHUSER | __GFP_ZERO); 490 if (!page) 491 break; 492 pr_debug("pid(%d) page[%d]->count=%d\n", 493 current->pid, i, page_count(page)); 494 SetPageUptodate(page); 495 unlock_page(page); 496 497 ctx->ring_pages[i] = page; 498 } 499 ctx->nr_pages = i; 500 501 if (unlikely(i != nr_pages)) { 502 aio_free_ring(ctx); 503 return -ENOMEM; 504 } 505 506 ctx->mmap_size = nr_pages * PAGE_SIZE; 507 pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); 508 509 if (down_write_killable(&mm->mmap_sem)) { 510 ctx->mmap_size = 0; 511 aio_free_ring(ctx); 512 return -EINTR; 513 } 514 515 ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, 516 PROT_READ | PROT_WRITE, 517 MAP_SHARED, 0, &unused, NULL); 518 up_write(&mm->mmap_sem); 519 if (IS_ERR((void *)ctx->mmap_base)) { 520 ctx->mmap_size = 0; 521 aio_free_ring(ctx); 522 return -ENOMEM; 523 } 524 525 pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); 526 527 ctx->user_id = ctx->mmap_base; 528 ctx->nr_events = nr_events; /* trusted copy */ 529 530 ring = kmap_atomic(ctx->ring_pages[0]); 531 ring->nr = nr_events; /* user copy */ 532 ring->id = ~0U; 533 ring->head = ring->tail = 0; 534 ring->magic = AIO_RING_MAGIC; 535 ring->compat_features = AIO_RING_COMPAT_FEATURES; 536 ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; 537 ring->header_length = sizeof(struct aio_ring); 538 kunmap_atomic(ring); 539 flush_dcache_page(ctx->ring_pages[0]); 540 541 return 0; 542 } 543 544 #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) 545 #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) 546 #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) 547 548 void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel) 549 { 550 struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw); 551 struct kioctx *ctx = req->ki_ctx; 552 unsigned long flags; 553 554 if (WARN_ON_ONCE(!list_empty(&req->ki_list))) 555 return; 556 557 spin_lock_irqsave(&ctx->ctx_lock, flags); 558 list_add_tail(&req->ki_list, &ctx->active_reqs); 559 req->ki_cancel = cancel; 560 spin_unlock_irqrestore(&ctx->ctx_lock, flags); 561 } 562 EXPORT_SYMBOL(kiocb_set_cancel_fn); 563 564 /* 565 * free_ioctx() should be RCU delayed to synchronize against the RCU 566 * protected lookup_ioctx() and also needs process context to call 567 * aio_free_ring(). Use rcu_work. 568 */ 569 static void free_ioctx(struct work_struct *work) 570 { 571 struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx, 572 free_rwork); 573 pr_debug("freeing %p\n", ctx); 574 575 aio_free_ring(ctx); 576 free_percpu(ctx->cpu); 577 percpu_ref_exit(&ctx->reqs); 578 percpu_ref_exit(&ctx->users); 579 kmem_cache_free(kioctx_cachep, ctx); 580 } 581 582 static void free_ioctx_reqs(struct percpu_ref *ref) 583 { 584 struct kioctx *ctx = container_of(ref, struct kioctx, reqs); 585 586 /* At this point we know that there are no any in-flight requests */ 587 if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count)) 588 complete(&ctx->rq_wait->comp); 589 590 /* Synchronize against RCU protected table->table[] dereferences */ 591 INIT_RCU_WORK(&ctx->free_rwork, free_ioctx); 592 queue_rcu_work(system_wq, &ctx->free_rwork); 593 } 594 595 /* 596 * When this function runs, the kioctx has been removed from the "hash table" 597 * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - 598 * now it's safe to cancel any that need to be. 599 */ 600 static void free_ioctx_users(struct percpu_ref *ref) 601 { 602 struct kioctx *ctx = container_of(ref, struct kioctx, users); 603 struct aio_kiocb *req; 604 605 spin_lock_irq(&ctx->ctx_lock); 606 607 while (!list_empty(&ctx->active_reqs)) { 608 req = list_first_entry(&ctx->active_reqs, 609 struct aio_kiocb, ki_list); 610 req->ki_cancel(&req->rw); 611 list_del_init(&req->ki_list); 612 } 613 614 spin_unlock_irq(&ctx->ctx_lock); 615 616 percpu_ref_kill(&ctx->reqs); 617 percpu_ref_put(&ctx->reqs); 618 } 619 620 static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) 621 { 622 unsigned i, new_nr; 623 struct kioctx_table *table, *old; 624 struct aio_ring *ring; 625 626 spin_lock(&mm->ioctx_lock); 627 table = rcu_dereference_raw(mm->ioctx_table); 628 629 while (1) { 630 if (table) 631 for (i = 0; i < table->nr; i++) 632 if (!rcu_access_pointer(table->table[i])) { 633 ctx->id = i; 634 rcu_assign_pointer(table->table[i], ctx); 635 spin_unlock(&mm->ioctx_lock); 636 637 /* While kioctx setup is in progress, 638 * we are protected from page migration 639 * changes ring_pages by ->ring_lock. 640 */ 641 ring = kmap_atomic(ctx->ring_pages[0]); 642 ring->id = ctx->id; 643 kunmap_atomic(ring); 644 return 0; 645 } 646 647 new_nr = (table ? table->nr : 1) * 4; 648 spin_unlock(&mm->ioctx_lock); 649 650 table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * 651 new_nr, GFP_KERNEL); 652 if (!table) 653 return -ENOMEM; 654 655 table->nr = new_nr; 656 657 spin_lock(&mm->ioctx_lock); 658 old = rcu_dereference_raw(mm->ioctx_table); 659 660 if (!old) { 661 rcu_assign_pointer(mm->ioctx_table, table); 662 } else if (table->nr > old->nr) { 663 memcpy(table->table, old->table, 664 old->nr * sizeof(struct kioctx *)); 665 666 rcu_assign_pointer(mm->ioctx_table, table); 667 kfree_rcu(old, rcu); 668 } else { 669 kfree(table); 670 table = old; 671 } 672 } 673 } 674 675 static void aio_nr_sub(unsigned nr) 676 { 677 spin_lock(&aio_nr_lock); 678 if (WARN_ON(aio_nr - nr > aio_nr)) 679 aio_nr = 0; 680 else 681 aio_nr -= nr; 682 spin_unlock(&aio_nr_lock); 683 } 684 685 /* ioctx_alloc 686 * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. 687 */ 688 static struct kioctx *ioctx_alloc(unsigned nr_events) 689 { 690 struct mm_struct *mm = current->mm; 691 struct kioctx *ctx; 692 int err = -ENOMEM; 693 694 /* 695 * Store the original nr_events -- what userspace passed to io_setup(), 696 * for counting against the global limit -- before it changes. 697 */ 698 unsigned int max_reqs = nr_events; 699 700 /* 701 * We keep track of the number of available ringbuffer slots, to prevent 702 * overflow (reqs_available), and we also use percpu counters for this. 703 * 704 * So since up to half the slots might be on other cpu's percpu counters 705 * and unavailable, double nr_events so userspace sees what they 706 * expected: additionally, we move req_batch slots to/from percpu 707 * counters at a time, so make sure that isn't 0: 708 */ 709 nr_events = max(nr_events, num_possible_cpus() * 4); 710 nr_events *= 2; 711 712 /* Prevent overflows */ 713 if (nr_events > (0x10000000U / sizeof(struct io_event))) { 714 pr_debug("ENOMEM: nr_events too high\n"); 715 return ERR_PTR(-EINVAL); 716 } 717 718 if (!nr_events || (unsigned long)max_reqs > aio_max_nr) 719 return ERR_PTR(-EAGAIN); 720 721 ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); 722 if (!ctx) 723 return ERR_PTR(-ENOMEM); 724 725 ctx->max_reqs = max_reqs; 726 727 spin_lock_init(&ctx->ctx_lock); 728 spin_lock_init(&ctx->completion_lock); 729 mutex_init(&ctx->ring_lock); 730 /* Protect against page migration throughout kiotx setup by keeping 731 * the ring_lock mutex held until setup is complete. */ 732 mutex_lock(&ctx->ring_lock); 733 init_waitqueue_head(&ctx->wait); 734 735 INIT_LIST_HEAD(&ctx->active_reqs); 736 737 if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL)) 738 goto err; 739 740 if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL)) 741 goto err; 742 743 ctx->cpu = alloc_percpu(struct kioctx_cpu); 744 if (!ctx->cpu) 745 goto err; 746 747 err = aio_setup_ring(ctx, nr_events); 748 if (err < 0) 749 goto err; 750 751 atomic_set(&ctx->reqs_available, ctx->nr_events - 1); 752 ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); 753 if (ctx->req_batch < 1) 754 ctx->req_batch = 1; 755 756 /* limit the number of system wide aios */ 757 spin_lock(&aio_nr_lock); 758 if (aio_nr + ctx->max_reqs > aio_max_nr || 759 aio_nr + ctx->max_reqs < aio_nr) { 760 spin_unlock(&aio_nr_lock); 761 err = -EAGAIN; 762 goto err_ctx; 763 } 764 aio_nr += ctx->max_reqs; 765 spin_unlock(&aio_nr_lock); 766 767 percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ 768 percpu_ref_get(&ctx->reqs); /* free_ioctx_users() will drop this */ 769 770 err = ioctx_add_table(ctx, mm); 771 if (err) 772 goto err_cleanup; 773 774 /* Release the ring_lock mutex now that all setup is complete. */ 775 mutex_unlock(&ctx->ring_lock); 776 777 pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", 778 ctx, ctx->user_id, mm, ctx->nr_events); 779 return ctx; 780 781 err_cleanup: 782 aio_nr_sub(ctx->max_reqs); 783 err_ctx: 784 atomic_set(&ctx->dead, 1); 785 if (ctx->mmap_size) 786 vm_munmap(ctx->mmap_base, ctx->mmap_size); 787 aio_free_ring(ctx); 788 err: 789 mutex_unlock(&ctx->ring_lock); 790 free_percpu(ctx->cpu); 791 percpu_ref_exit(&ctx->reqs); 792 percpu_ref_exit(&ctx->users); 793 kmem_cache_free(kioctx_cachep, ctx); 794 pr_debug("error allocating ioctx %d\n", err); 795 return ERR_PTR(err); 796 } 797 798 /* kill_ioctx 799 * Cancels all outstanding aio requests on an aio context. Used 800 * when the processes owning a context have all exited to encourage 801 * the rapid destruction of the kioctx. 802 */ 803 static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, 804 struct ctx_rq_wait *wait) 805 { 806 struct kioctx_table *table; 807 808 spin_lock(&mm->ioctx_lock); 809 if (atomic_xchg(&ctx->dead, 1)) { 810 spin_unlock(&mm->ioctx_lock); 811 return -EINVAL; 812 } 813 814 table = rcu_dereference_raw(mm->ioctx_table); 815 WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id])); 816 RCU_INIT_POINTER(table->table[ctx->id], NULL); 817 spin_unlock(&mm->ioctx_lock); 818 819 /* free_ioctx_reqs() will do the necessary RCU synchronization */ 820 wake_up_all(&ctx->wait); 821 822 /* 823 * It'd be more correct to do this in free_ioctx(), after all 824 * the outstanding kiocbs have finished - but by then io_destroy 825 * has already returned, so io_setup() could potentially return 826 * -EAGAIN with no ioctxs actually in use (as far as userspace 827 * could tell). 828 */ 829 aio_nr_sub(ctx->max_reqs); 830 831 if (ctx->mmap_size) 832 vm_munmap(ctx->mmap_base, ctx->mmap_size); 833 834 ctx->rq_wait = wait; 835 percpu_ref_kill(&ctx->users); 836 return 0; 837 } 838 839 /* 840 * exit_aio: called when the last user of mm goes away. At this point, there is 841 * no way for any new requests to be submited or any of the io_* syscalls to be 842 * called on the context. 843 * 844 * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on 845 * them. 846 */ 847 void exit_aio(struct mm_struct *mm) 848 { 849 struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table); 850 struct ctx_rq_wait wait; 851 int i, skipped; 852 853 if (!table) 854 return; 855 856 atomic_set(&wait.count, table->nr); 857 init_completion(&wait.comp); 858 859 skipped = 0; 860 for (i = 0; i < table->nr; ++i) { 861 struct kioctx *ctx = 862 rcu_dereference_protected(table->table[i], true); 863 864 if (!ctx) { 865 skipped++; 866 continue; 867 } 868 869 /* 870 * We don't need to bother with munmap() here - exit_mmap(mm) 871 * is coming and it'll unmap everything. And we simply can't, 872 * this is not necessarily our ->mm. 873 * Since kill_ioctx() uses non-zero ->mmap_size as indicator 874 * that it needs to unmap the area, just set it to 0. 875 */ 876 ctx->mmap_size = 0; 877 kill_ioctx(mm, ctx, &wait); 878 } 879 880 if (!atomic_sub_and_test(skipped, &wait.count)) { 881 /* Wait until all IO for the context are done. */ 882 wait_for_completion(&wait.comp); 883 } 884 885 RCU_INIT_POINTER(mm->ioctx_table, NULL); 886 kfree(table); 887 } 888 889 static void put_reqs_available(struct kioctx *ctx, unsigned nr) 890 { 891 struct kioctx_cpu *kcpu; 892 unsigned long flags; 893 894 local_irq_save(flags); 895 kcpu = this_cpu_ptr(ctx->cpu); 896 kcpu->reqs_available += nr; 897 898 while (kcpu->reqs_available >= ctx->req_batch * 2) { 899 kcpu->reqs_available -= ctx->req_batch; 900 atomic_add(ctx->req_batch, &ctx->reqs_available); 901 } 902 903 local_irq_restore(flags); 904 } 905 906 static bool get_reqs_available(struct kioctx *ctx) 907 { 908 struct kioctx_cpu *kcpu; 909 bool ret = false; 910 unsigned long flags; 911 912 local_irq_save(flags); 913 kcpu = this_cpu_ptr(ctx->cpu); 914 if (!kcpu->reqs_available) { 915 int old, avail = atomic_read(&ctx->reqs_available); 916 917 do { 918 if (avail < ctx->req_batch) 919 goto out; 920 921 old = avail; 922 avail = atomic_cmpxchg(&ctx->reqs_available, 923 avail, avail - ctx->req_batch); 924 } while (avail != old); 925 926 kcpu->reqs_available += ctx->req_batch; 927 } 928 929 ret = true; 930 kcpu->reqs_available--; 931 out: 932 local_irq_restore(flags); 933 return ret; 934 } 935 936 /* refill_reqs_available 937 * Updates the reqs_available reference counts used for tracking the 938 * number of free slots in the completion ring. This can be called 939 * from aio_complete() (to optimistically update reqs_available) or 940 * from aio_get_req() (the we're out of events case). It must be 941 * called holding ctx->completion_lock. 942 */ 943 static void refill_reqs_available(struct kioctx *ctx, unsigned head, 944 unsigned tail) 945 { 946 unsigned events_in_ring, completed; 947 948 /* Clamp head since userland can write to it. */ 949 head %= ctx->nr_events; 950 if (head <= tail) 951 events_in_ring = tail - head; 952 else 953 events_in_ring = ctx->nr_events - (head - tail); 954 955 completed = ctx->completed_events; 956 if (events_in_ring < completed) 957 completed -= events_in_ring; 958 else 959 completed = 0; 960 961 if (!completed) 962 return; 963 964 ctx->completed_events -= completed; 965 put_reqs_available(ctx, completed); 966 } 967 968 /* user_refill_reqs_available 969 * Called to refill reqs_available when aio_get_req() encounters an 970 * out of space in the completion ring. 971 */ 972 static void user_refill_reqs_available(struct kioctx *ctx) 973 { 974 spin_lock_irq(&ctx->completion_lock); 975 if (ctx->completed_events) { 976 struct aio_ring *ring; 977 unsigned head; 978 979 /* Access of ring->head may race with aio_read_events_ring() 980 * here, but that's okay since whether we read the old version 981 * or the new version, and either will be valid. The important 982 * part is that head cannot pass tail since we prevent 983 * aio_complete() from updating tail by holding 984 * ctx->completion_lock. Even if head is invalid, the check 985 * against ctx->completed_events below will make sure we do the 986 * safe/right thing. 987 */ 988 ring = kmap_atomic(ctx->ring_pages[0]); 989 head = ring->head; 990 kunmap_atomic(ring); 991 992 refill_reqs_available(ctx, head, ctx->tail); 993 } 994 995 spin_unlock_irq(&ctx->completion_lock); 996 } 997 998 /* aio_get_req 999 * Allocate a slot for an aio request. 1000 * Returns NULL if no requests are free. 1001 */ 1002 static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) 1003 { 1004 struct aio_kiocb *req; 1005 1006 if (!get_reqs_available(ctx)) { 1007 user_refill_reqs_available(ctx); 1008 if (!get_reqs_available(ctx)) 1009 return NULL; 1010 } 1011 1012 req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); 1013 if (unlikely(!req)) 1014 goto out_put; 1015 1016 percpu_ref_get(&ctx->reqs); 1017 INIT_LIST_HEAD(&req->ki_list); 1018 req->ki_ctx = ctx; 1019 return req; 1020 out_put: 1021 put_reqs_available(ctx, 1); 1022 return NULL; 1023 } 1024 1025 static struct kioctx *lookup_ioctx(unsigned long ctx_id) 1026 { 1027 struct aio_ring __user *ring = (void __user *)ctx_id; 1028 struct mm_struct *mm = current->mm; 1029 struct kioctx *ctx, *ret = NULL; 1030 struct kioctx_table *table; 1031 unsigned id; 1032 1033 if (get_user(id, &ring->id)) 1034 return NULL; 1035 1036 rcu_read_lock(); 1037 table = rcu_dereference(mm->ioctx_table); 1038 1039 if (!table || id >= table->nr) 1040 goto out; 1041 1042 ctx = rcu_dereference(table->table[id]); 1043 if (ctx && ctx->user_id == ctx_id) { 1044 if (percpu_ref_tryget_live(&ctx->users)) 1045 ret = ctx; 1046 } 1047 out: 1048 rcu_read_unlock(); 1049 return ret; 1050 } 1051 1052 /* aio_complete 1053 * Called when the io request on the given iocb is complete. 1054 */ 1055 static void aio_complete(struct aio_kiocb *iocb, long res, long res2) 1056 { 1057 struct kioctx *ctx = iocb->ki_ctx; 1058 struct aio_ring *ring; 1059 struct io_event *ev_page, *event; 1060 unsigned tail, pos, head; 1061 unsigned long flags; 1062 1063 /* 1064 * Add a completion event to the ring buffer. Must be done holding 1065 * ctx->completion_lock to prevent other code from messing with the tail 1066 * pointer since we might be called from irq context. 1067 */ 1068 spin_lock_irqsave(&ctx->completion_lock, flags); 1069 1070 tail = ctx->tail; 1071 pos = tail + AIO_EVENTS_OFFSET; 1072 1073 if (++tail >= ctx->nr_events) 1074 tail = 0; 1075 1076 ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); 1077 event = ev_page + pos % AIO_EVENTS_PER_PAGE; 1078 1079 event->obj = (u64)(unsigned long)iocb->ki_user_iocb; 1080 event->data = iocb->ki_user_data; 1081 event->res = res; 1082 event->res2 = res2; 1083 1084 kunmap_atomic(ev_page); 1085 flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); 1086 1087 pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", 1088 ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data, 1089 res, res2); 1090 1091 /* after flagging the request as done, we 1092 * must never even look at it again 1093 */ 1094 smp_wmb(); /* make event visible before updating tail */ 1095 1096 ctx->tail = tail; 1097 1098 ring = kmap_atomic(ctx->ring_pages[0]); 1099 head = ring->head; 1100 ring->tail = tail; 1101 kunmap_atomic(ring); 1102 flush_dcache_page(ctx->ring_pages[0]); 1103 1104 ctx->completed_events++; 1105 if (ctx->completed_events > 1) 1106 refill_reqs_available(ctx, head, tail); 1107 spin_unlock_irqrestore(&ctx->completion_lock, flags); 1108 1109 pr_debug("added to ring %p at [%u]\n", iocb, tail); 1110 1111 /* 1112 * Check if the user asked us to deliver the result through an 1113 * eventfd. The eventfd_signal() function is safe to be called 1114 * from IRQ context. 1115 */ 1116 if (iocb->ki_eventfd) { 1117 eventfd_signal(iocb->ki_eventfd, 1); 1118 eventfd_ctx_put(iocb->ki_eventfd); 1119 } 1120 1121 kmem_cache_free(kiocb_cachep, iocb); 1122 1123 /* 1124 * We have to order our ring_info tail store above and test 1125 * of the wait list below outside the wait lock. This is 1126 * like in wake_up_bit() where clearing a bit has to be 1127 * ordered with the unlocked test. 1128 */ 1129 smp_mb(); 1130 1131 if (waitqueue_active(&ctx->wait)) 1132 wake_up(&ctx->wait); 1133 1134 percpu_ref_put(&ctx->reqs); 1135 } 1136 1137 /* aio_read_events_ring 1138 * Pull an event off of the ioctx's event ring. Returns the number of 1139 * events fetched 1140 */ 1141 static long aio_read_events_ring(struct kioctx *ctx, 1142 struct io_event __user *event, long nr) 1143 { 1144 struct aio_ring *ring; 1145 unsigned head, tail, pos; 1146 long ret = 0; 1147 int copy_ret; 1148 1149 /* 1150 * The mutex can block and wake us up and that will cause 1151 * wait_event_interruptible_hrtimeout() to schedule without sleeping 1152 * and repeat. This should be rare enough that it doesn't cause 1153 * peformance issues. See the comment in read_events() for more detail. 1154 */ 1155 sched_annotate_sleep(); 1156 mutex_lock(&ctx->ring_lock); 1157 1158 /* Access to ->ring_pages here is protected by ctx->ring_lock. */ 1159 ring = kmap_atomic(ctx->ring_pages[0]); 1160 head = ring->head; 1161 tail = ring->tail; 1162 kunmap_atomic(ring); 1163 1164 /* 1165 * Ensure that once we've read the current tail pointer, that 1166 * we also see the events that were stored up to the tail. 1167 */ 1168 smp_rmb(); 1169 1170 pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events); 1171 1172 if (head == tail) 1173 goto out; 1174 1175 head %= ctx->nr_events; 1176 tail %= ctx->nr_events; 1177 1178 while (ret < nr) { 1179 long avail; 1180 struct io_event *ev; 1181 struct page *page; 1182 1183 avail = (head <= tail ? tail : ctx->nr_events) - head; 1184 if (head == tail) 1185 break; 1186 1187 pos = head + AIO_EVENTS_OFFSET; 1188 page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]; 1189 pos %= AIO_EVENTS_PER_PAGE; 1190 1191 avail = min(avail, nr - ret); 1192 avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos); 1193 1194 ev = kmap(page); 1195 copy_ret = copy_to_user(event + ret, ev + pos, 1196 sizeof(*ev) * avail); 1197 kunmap(page); 1198 1199 if (unlikely(copy_ret)) { 1200 ret = -EFAULT; 1201 goto out; 1202 } 1203 1204 ret += avail; 1205 head += avail; 1206 head %= ctx->nr_events; 1207 } 1208 1209 ring = kmap_atomic(ctx->ring_pages[0]); 1210 ring->head = head; 1211 kunmap_atomic(ring); 1212 flush_dcache_page(ctx->ring_pages[0]); 1213 1214 pr_debug("%li h%u t%u\n", ret, head, tail); 1215 out: 1216 mutex_unlock(&ctx->ring_lock); 1217 1218 return ret; 1219 } 1220 1221 static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, 1222 struct io_event __user *event, long *i) 1223 { 1224 long ret = aio_read_events_ring(ctx, event + *i, nr - *i); 1225 1226 if (ret > 0) 1227 *i += ret; 1228 1229 if (unlikely(atomic_read(&ctx->dead))) 1230 ret = -EINVAL; 1231 1232 if (!*i) 1233 *i = ret; 1234 1235 return ret < 0 || *i >= min_nr; 1236 } 1237 1238 static long read_events(struct kioctx *ctx, long min_nr, long nr, 1239 struct io_event __user *event, 1240 ktime_t until) 1241 { 1242 long ret = 0; 1243 1244 /* 1245 * Note that aio_read_events() is being called as the conditional - i.e. 1246 * we're calling it after prepare_to_wait() has set task state to 1247 * TASK_INTERRUPTIBLE. 1248 * 1249 * But aio_read_events() can block, and if it blocks it's going to flip 1250 * the task state back to TASK_RUNNING. 1251 * 1252 * This should be ok, provided it doesn't flip the state back to 1253 * TASK_RUNNING and return 0 too much - that causes us to spin. That 1254 * will only happen if the mutex_lock() call blocks, and we then find 1255 * the ringbuffer empty. So in practice we should be ok, but it's 1256 * something to be aware of when touching this code. 1257 */ 1258 if (until == 0) 1259 aio_read_events(ctx, min_nr, nr, event, &ret); 1260 else 1261 wait_event_interruptible_hrtimeout(ctx->wait, 1262 aio_read_events(ctx, min_nr, nr, event, &ret), 1263 until); 1264 return ret; 1265 } 1266 1267 /* sys_io_setup: 1268 * Create an aio_context capable of receiving at least nr_events. 1269 * ctxp must not point to an aio_context that already exists, and 1270 * must be initialized to 0 prior to the call. On successful 1271 * creation of the aio_context, *ctxp is filled in with the resulting 1272 * handle. May fail with -EINVAL if *ctxp is not initialized, 1273 * if the specified nr_events exceeds internal limits. May fail 1274 * with -EAGAIN if the specified nr_events exceeds the user's limit 1275 * of available events. May fail with -ENOMEM if insufficient kernel 1276 * resources are available. May fail with -EFAULT if an invalid 1277 * pointer is passed for ctxp. Will fail with -ENOSYS if not 1278 * implemented. 1279 */ 1280 SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) 1281 { 1282 struct kioctx *ioctx = NULL; 1283 unsigned long ctx; 1284 long ret; 1285 1286 ret = get_user(ctx, ctxp); 1287 if (unlikely(ret)) 1288 goto out; 1289 1290 ret = -EINVAL; 1291 if (unlikely(ctx || nr_events == 0)) { 1292 pr_debug("EINVAL: ctx %lu nr_events %u\n", 1293 ctx, nr_events); 1294 goto out; 1295 } 1296 1297 ioctx = ioctx_alloc(nr_events); 1298 ret = PTR_ERR(ioctx); 1299 if (!IS_ERR(ioctx)) { 1300 ret = put_user(ioctx->user_id, ctxp); 1301 if (ret) 1302 kill_ioctx(current->mm, ioctx, NULL); 1303 percpu_ref_put(&ioctx->users); 1304 } 1305 1306 out: 1307 return ret; 1308 } 1309 1310 #ifdef CONFIG_COMPAT 1311 COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p) 1312 { 1313 struct kioctx *ioctx = NULL; 1314 unsigned long ctx; 1315 long ret; 1316 1317 ret = get_user(ctx, ctx32p); 1318 if (unlikely(ret)) 1319 goto out; 1320 1321 ret = -EINVAL; 1322 if (unlikely(ctx || nr_events == 0)) { 1323 pr_debug("EINVAL: ctx %lu nr_events %u\n", 1324 ctx, nr_events); 1325 goto out; 1326 } 1327 1328 ioctx = ioctx_alloc(nr_events); 1329 ret = PTR_ERR(ioctx); 1330 if (!IS_ERR(ioctx)) { 1331 /* truncating is ok because it's a user address */ 1332 ret = put_user((u32)ioctx->user_id, ctx32p); 1333 if (ret) 1334 kill_ioctx(current->mm, ioctx, NULL); 1335 percpu_ref_put(&ioctx->users); 1336 } 1337 1338 out: 1339 return ret; 1340 } 1341 #endif 1342 1343 /* sys_io_destroy: 1344 * Destroy the aio_context specified. May cancel any outstanding 1345 * AIOs and block on completion. Will fail with -ENOSYS if not 1346 * implemented. May fail with -EINVAL if the context pointed to 1347 * is invalid. 1348 */ 1349 SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) 1350 { 1351 struct kioctx *ioctx = lookup_ioctx(ctx); 1352 if (likely(NULL != ioctx)) { 1353 struct ctx_rq_wait wait; 1354 int ret; 1355 1356 init_completion(&wait.comp); 1357 atomic_set(&wait.count, 1); 1358 1359 /* Pass requests_done to kill_ioctx() where it can be set 1360 * in a thread-safe way. If we try to set it here then we have 1361 * a race condition if two io_destroy() called simultaneously. 1362 */ 1363 ret = kill_ioctx(current->mm, ioctx, &wait); 1364 percpu_ref_put(&ioctx->users); 1365 1366 /* Wait until all IO for the context are done. Otherwise kernel 1367 * keep using user-space buffers even if user thinks the context 1368 * is destroyed. 1369 */ 1370 if (!ret) 1371 wait_for_completion(&wait.comp); 1372 1373 return ret; 1374 } 1375 pr_debug("EINVAL: invalid context id\n"); 1376 return -EINVAL; 1377 } 1378 1379 static void aio_remove_iocb(struct aio_kiocb *iocb) 1380 { 1381 struct kioctx *ctx = iocb->ki_ctx; 1382 unsigned long flags; 1383 1384 spin_lock_irqsave(&ctx->ctx_lock, flags); 1385 list_del(&iocb->ki_list); 1386 spin_unlock_irqrestore(&ctx->ctx_lock, flags); 1387 } 1388 1389 static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) 1390 { 1391 struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw); 1392 1393 if (!list_empty_careful(&iocb->ki_list)) 1394 aio_remove_iocb(iocb); 1395 1396 if (kiocb->ki_flags & IOCB_WRITE) { 1397 struct inode *inode = file_inode(kiocb->ki_filp); 1398 1399 /* 1400 * Tell lockdep we inherited freeze protection from submission 1401 * thread. 1402 */ 1403 if (S_ISREG(inode->i_mode)) 1404 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); 1405 file_end_write(kiocb->ki_filp); 1406 } 1407 1408 fput(kiocb->ki_filp); 1409 aio_complete(iocb, res, res2); 1410 } 1411 1412 static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) 1413 { 1414 int ret; 1415 1416 req->ki_filp = fget(iocb->aio_fildes); 1417 if (unlikely(!req->ki_filp)) 1418 return -EBADF; 1419 req->ki_complete = aio_complete_rw; 1420 req->ki_pos = iocb->aio_offset; 1421 req->ki_flags = iocb_flags(req->ki_filp); 1422 if (iocb->aio_flags & IOCB_FLAG_RESFD) 1423 req->ki_flags |= IOCB_EVENTFD; 1424 req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp)); 1425 if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { 1426 /* 1427 * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then 1428 * aio_reqprio is interpreted as an I/O scheduling 1429 * class and priority. 1430 */ 1431 ret = ioprio_check_cap(iocb->aio_reqprio); 1432 if (ret) { 1433 pr_debug("aio ioprio check cap error: %d\n", ret); 1434 return ret; 1435 } 1436 1437 req->ki_ioprio = iocb->aio_reqprio; 1438 } else 1439 req->ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); 1440 1441 ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); 1442 if (unlikely(ret)) 1443 fput(req->ki_filp); 1444 return ret; 1445 } 1446 1447 static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec, 1448 bool vectored, bool compat, struct iov_iter *iter) 1449 { 1450 void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf; 1451 size_t len = iocb->aio_nbytes; 1452 1453 if (!vectored) { 1454 ssize_t ret = import_single_range(rw, buf, len, *iovec, iter); 1455 *iovec = NULL; 1456 return ret; 1457 } 1458 #ifdef CONFIG_COMPAT 1459 if (compat) 1460 return compat_import_iovec(rw, buf, len, UIO_FASTIOV, iovec, 1461 iter); 1462 #endif 1463 return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter); 1464 } 1465 1466 static inline void aio_rw_done(struct kiocb *req, ssize_t ret) 1467 { 1468 switch (ret) { 1469 case -EIOCBQUEUED: 1470 break; 1471 case -ERESTARTSYS: 1472 case -ERESTARTNOINTR: 1473 case -ERESTARTNOHAND: 1474 case -ERESTART_RESTARTBLOCK: 1475 /* 1476 * There's no easy way to restart the syscall since other AIO's 1477 * may be already running. Just fail this IO with EINTR. 1478 */ 1479 ret = -EINTR; 1480 /*FALLTHRU*/ 1481 default: 1482 aio_complete_rw(req, ret, 0); 1483 } 1484 } 1485 1486 static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored, 1487 bool compat) 1488 { 1489 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 1490 struct iov_iter iter; 1491 struct file *file; 1492 ssize_t ret; 1493 1494 ret = aio_prep_rw(req, iocb); 1495 if (ret) 1496 return ret; 1497 file = req->ki_filp; 1498 1499 ret = -EBADF; 1500 if (unlikely(!(file->f_mode & FMODE_READ))) 1501 goto out_fput; 1502 ret = -EINVAL; 1503 if (unlikely(!file->f_op->read_iter)) 1504 goto out_fput; 1505 1506 ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter); 1507 if (ret) 1508 goto out_fput; 1509 ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); 1510 if (!ret) 1511 aio_rw_done(req, call_read_iter(file, req, &iter)); 1512 kfree(iovec); 1513 out_fput: 1514 if (unlikely(ret)) 1515 fput(file); 1516 return ret; 1517 } 1518 1519 static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, 1520 bool compat) 1521 { 1522 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 1523 struct iov_iter iter; 1524 struct file *file; 1525 ssize_t ret; 1526 1527 ret = aio_prep_rw(req, iocb); 1528 if (ret) 1529 return ret; 1530 file = req->ki_filp; 1531 1532 ret = -EBADF; 1533 if (unlikely(!(file->f_mode & FMODE_WRITE))) 1534 goto out_fput; 1535 ret = -EINVAL; 1536 if (unlikely(!file->f_op->write_iter)) 1537 goto out_fput; 1538 1539 ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter); 1540 if (ret) 1541 goto out_fput; 1542 ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); 1543 if (!ret) { 1544 /* 1545 * Open-code file_start_write here to grab freeze protection, 1546 * which will be released by another thread in 1547 * aio_complete_rw(). Fool lockdep by telling it the lock got 1548 * released so that it doesn't complain about the held lock when 1549 * we return to userspace. 1550 */ 1551 if (S_ISREG(file_inode(file)->i_mode)) { 1552 __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true); 1553 __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); 1554 } 1555 req->ki_flags |= IOCB_WRITE; 1556 aio_rw_done(req, call_write_iter(file, req, &iter)); 1557 } 1558 kfree(iovec); 1559 out_fput: 1560 if (unlikely(ret)) 1561 fput(file); 1562 return ret; 1563 } 1564 1565 static void aio_fsync_work(struct work_struct *work) 1566 { 1567 struct fsync_iocb *req = container_of(work, struct fsync_iocb, work); 1568 int ret; 1569 1570 ret = vfs_fsync(req->file, req->datasync); 1571 fput(req->file); 1572 aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0); 1573 } 1574 1575 static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync) 1576 { 1577 if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes || 1578 iocb->aio_rw_flags)) 1579 return -EINVAL; 1580 1581 req->file = fget(iocb->aio_fildes); 1582 if (unlikely(!req->file)) 1583 return -EBADF; 1584 if (unlikely(!req->file->f_op->fsync)) { 1585 fput(req->file); 1586 return -EINVAL; 1587 } 1588 1589 req->datasync = datasync; 1590 INIT_WORK(&req->work, aio_fsync_work); 1591 schedule_work(&req->work); 1592 return 0; 1593 } 1594 1595 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1596 bool compat) 1597 { 1598 struct aio_kiocb *req; 1599 struct iocb iocb; 1600 ssize_t ret; 1601 1602 if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) 1603 return -EFAULT; 1604 1605 /* enforce forwards compatibility on users */ 1606 if (unlikely(iocb.aio_reserved2)) { 1607 pr_debug("EINVAL: reserve field set\n"); 1608 return -EINVAL; 1609 } 1610 1611 /* prevent overflows */ 1612 if (unlikely( 1613 (iocb.aio_buf != (unsigned long)iocb.aio_buf) || 1614 (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) || 1615 ((ssize_t)iocb.aio_nbytes < 0) 1616 )) { 1617 pr_debug("EINVAL: overflow check\n"); 1618 return -EINVAL; 1619 } 1620 1621 req = aio_get_req(ctx); 1622 if (unlikely(!req)) 1623 return -EAGAIN; 1624 1625 if (iocb.aio_flags & IOCB_FLAG_RESFD) { 1626 /* 1627 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an 1628 * instance of the file* now. The file descriptor must be 1629 * an eventfd() fd, and will be signaled for each completed 1630 * event using the eventfd_signal() function. 1631 */ 1632 req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd); 1633 if (IS_ERR(req->ki_eventfd)) { 1634 ret = PTR_ERR(req->ki_eventfd); 1635 req->ki_eventfd = NULL; 1636 goto out_put_req; 1637 } 1638 } 1639 1640 ret = put_user(KIOCB_KEY, &user_iocb->aio_key); 1641 if (unlikely(ret)) { 1642 pr_debug("EFAULT: aio_key\n"); 1643 goto out_put_req; 1644 } 1645 1646 req->ki_user_iocb = user_iocb; 1647 req->ki_user_data = iocb.aio_data; 1648 1649 switch (iocb.aio_lio_opcode) { 1650 case IOCB_CMD_PREAD: 1651 ret = aio_read(&req->rw, &iocb, false, compat); 1652 break; 1653 case IOCB_CMD_PWRITE: 1654 ret = aio_write(&req->rw, &iocb, false, compat); 1655 break; 1656 case IOCB_CMD_PREADV: 1657 ret = aio_read(&req->rw, &iocb, true, compat); 1658 break; 1659 case IOCB_CMD_PWRITEV: 1660 ret = aio_write(&req->rw, &iocb, true, compat); 1661 break; 1662 case IOCB_CMD_FSYNC: 1663 ret = aio_fsync(&req->fsync, &iocb, false); 1664 break; 1665 case IOCB_CMD_FDSYNC: 1666 ret = aio_fsync(&req->fsync, &iocb, true); 1667 break; 1668 default: 1669 pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode); 1670 ret = -EINVAL; 1671 break; 1672 } 1673 1674 /* 1675 * If ret is 0, we'd either done aio_complete() ourselves or have 1676 * arranged for that to be done asynchronously. Anything non-zero 1677 * means that we need to destroy req ourselves. 1678 */ 1679 if (ret) 1680 goto out_put_req; 1681 return 0; 1682 out_put_req: 1683 put_reqs_available(ctx, 1); 1684 percpu_ref_put(&ctx->reqs); 1685 if (req->ki_eventfd) 1686 eventfd_ctx_put(req->ki_eventfd); 1687 kmem_cache_free(kiocb_cachep, req); 1688 return ret; 1689 } 1690 1691 /* sys_io_submit: 1692 * Queue the nr iocbs pointed to by iocbpp for processing. Returns 1693 * the number of iocbs queued. May return -EINVAL if the aio_context 1694 * specified by ctx_id is invalid, if nr is < 0, if the iocb at 1695 * *iocbpp[0] is not properly initialized, if the operation specified 1696 * is invalid for the file descriptor in the iocb. May fail with 1697 * -EFAULT if any of the data structures point to invalid data. May 1698 * fail with -EBADF if the file descriptor specified in the first 1699 * iocb is invalid. May fail with -EAGAIN if insufficient resources 1700 * are available to queue any iocbs. Will return 0 if nr is 0. Will 1701 * fail with -ENOSYS if not implemented. 1702 */ 1703 SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, 1704 struct iocb __user * __user *, iocbpp) 1705 { 1706 struct kioctx *ctx; 1707 long ret = 0; 1708 int i = 0; 1709 struct blk_plug plug; 1710 1711 if (unlikely(nr < 0)) 1712 return -EINVAL; 1713 1714 ctx = lookup_ioctx(ctx_id); 1715 if (unlikely(!ctx)) { 1716 pr_debug("EINVAL: invalid context id\n"); 1717 return -EINVAL; 1718 } 1719 1720 if (nr > ctx->nr_events) 1721 nr = ctx->nr_events; 1722 1723 blk_start_plug(&plug); 1724 for (i = 0; i < nr; i++) { 1725 struct iocb __user *user_iocb; 1726 1727 if (unlikely(get_user(user_iocb, iocbpp + i))) { 1728 ret = -EFAULT; 1729 break; 1730 } 1731 1732 ret = io_submit_one(ctx, user_iocb, false); 1733 if (ret) 1734 break; 1735 } 1736 blk_finish_plug(&plug); 1737 1738 percpu_ref_put(&ctx->users); 1739 return i ? i : ret; 1740 } 1741 1742 #ifdef CONFIG_COMPAT 1743 COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, 1744 int, nr, compat_uptr_t __user *, iocbpp) 1745 { 1746 struct kioctx *ctx; 1747 long ret = 0; 1748 int i = 0; 1749 struct blk_plug plug; 1750 1751 if (unlikely(nr < 0)) 1752 return -EINVAL; 1753 1754 ctx = lookup_ioctx(ctx_id); 1755 if (unlikely(!ctx)) { 1756 pr_debug("EINVAL: invalid context id\n"); 1757 return -EINVAL; 1758 } 1759 1760 if (nr > ctx->nr_events) 1761 nr = ctx->nr_events; 1762 1763 blk_start_plug(&plug); 1764 for (i = 0; i < nr; i++) { 1765 compat_uptr_t user_iocb; 1766 1767 if (unlikely(get_user(user_iocb, iocbpp + i))) { 1768 ret = -EFAULT; 1769 break; 1770 } 1771 1772 ret = io_submit_one(ctx, compat_ptr(user_iocb), true); 1773 if (ret) 1774 break; 1775 } 1776 blk_finish_plug(&plug); 1777 1778 percpu_ref_put(&ctx->users); 1779 return i ? i : ret; 1780 } 1781 #endif 1782 1783 /* lookup_kiocb 1784 * Finds a given iocb for cancellation. 1785 */ 1786 static struct aio_kiocb * 1787 lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb) 1788 { 1789 struct aio_kiocb *kiocb; 1790 1791 assert_spin_locked(&ctx->ctx_lock); 1792 1793 /* TODO: use a hash or array, this sucks. */ 1794 list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { 1795 if (kiocb->ki_user_iocb == iocb) 1796 return kiocb; 1797 } 1798 return NULL; 1799 } 1800 1801 /* sys_io_cancel: 1802 * Attempts to cancel an iocb previously passed to io_submit. If 1803 * the operation is successfully cancelled, the resulting event is 1804 * copied into the memory pointed to by result without being placed 1805 * into the completion queue and 0 is returned. May fail with 1806 * -EFAULT if any of the data structures pointed to are invalid. 1807 * May fail with -EINVAL if aio_context specified by ctx_id is 1808 * invalid. May fail with -EAGAIN if the iocb specified was not 1809 * cancelled. Will fail with -ENOSYS if not implemented. 1810 */ 1811 SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, 1812 struct io_event __user *, result) 1813 { 1814 struct kioctx *ctx; 1815 struct aio_kiocb *kiocb; 1816 int ret = -EINVAL; 1817 u32 key; 1818 1819 if (unlikely(get_user(key, &iocb->aio_key))) 1820 return -EFAULT; 1821 if (unlikely(key != KIOCB_KEY)) 1822 return -EINVAL; 1823 1824 ctx = lookup_ioctx(ctx_id); 1825 if (unlikely(!ctx)) 1826 return -EINVAL; 1827 1828 spin_lock_irq(&ctx->ctx_lock); 1829 kiocb = lookup_kiocb(ctx, iocb); 1830 if (kiocb) { 1831 ret = kiocb->ki_cancel(&kiocb->rw); 1832 list_del_init(&kiocb->ki_list); 1833 } 1834 spin_unlock_irq(&ctx->ctx_lock); 1835 1836 if (!ret) { 1837 /* 1838 * The result argument is no longer used - the io_event is 1839 * always delivered via the ring buffer. -EINPROGRESS indicates 1840 * cancellation is progress: 1841 */ 1842 ret = -EINPROGRESS; 1843 } 1844 1845 percpu_ref_put(&ctx->users); 1846 1847 return ret; 1848 } 1849 1850 static long do_io_getevents(aio_context_t ctx_id, 1851 long min_nr, 1852 long nr, 1853 struct io_event __user *events, 1854 struct timespec64 *ts) 1855 { 1856 ktime_t until = ts ? timespec64_to_ktime(*ts) : KTIME_MAX; 1857 struct kioctx *ioctx = lookup_ioctx(ctx_id); 1858 long ret = -EINVAL; 1859 1860 if (likely(ioctx)) { 1861 if (likely(min_nr <= nr && min_nr >= 0)) 1862 ret = read_events(ioctx, min_nr, nr, events, until); 1863 percpu_ref_put(&ioctx->users); 1864 } 1865 1866 return ret; 1867 } 1868 1869 /* io_getevents: 1870 * Attempts to read at least min_nr events and up to nr events from 1871 * the completion queue for the aio_context specified by ctx_id. If 1872 * it succeeds, the number of read events is returned. May fail with 1873 * -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is 1874 * out of range, if timeout is out of range. May fail with -EFAULT 1875 * if any of the memory specified is invalid. May return 0 or 1876 * < min_nr if the timeout specified by timeout has elapsed 1877 * before sufficient events are available, where timeout == NULL 1878 * specifies an infinite timeout. Note that the timeout pointed to by 1879 * timeout is relative. Will fail with -ENOSYS if not implemented. 1880 */ 1881 SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, 1882 long, min_nr, 1883 long, nr, 1884 struct io_event __user *, events, 1885 struct timespec __user *, timeout) 1886 { 1887 struct timespec64 ts; 1888 int ret; 1889 1890 if (timeout && unlikely(get_timespec64(&ts, timeout))) 1891 return -EFAULT; 1892 1893 ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); 1894 if (!ret && signal_pending(current)) 1895 ret = -EINTR; 1896 return ret; 1897 } 1898 1899 struct __aio_sigset { 1900 const sigset_t __user *sigmask; 1901 size_t sigsetsize; 1902 }; 1903 1904 SYSCALL_DEFINE6(io_pgetevents, 1905 aio_context_t, ctx_id, 1906 long, min_nr, 1907 long, nr, 1908 struct io_event __user *, events, 1909 struct timespec __user *, timeout, 1910 const struct __aio_sigset __user *, usig) 1911 { 1912 struct __aio_sigset ksig = { NULL, }; 1913 sigset_t ksigmask, sigsaved; 1914 struct timespec64 ts; 1915 int ret; 1916 1917 if (timeout && unlikely(get_timespec64(&ts, timeout))) 1918 return -EFAULT; 1919 1920 if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) 1921 return -EFAULT; 1922 1923 if (ksig.sigmask) { 1924 if (ksig.sigsetsize != sizeof(sigset_t)) 1925 return -EINVAL; 1926 if (copy_from_user(&ksigmask, ksig.sigmask, sizeof(ksigmask))) 1927 return -EFAULT; 1928 sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); 1929 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); 1930 } 1931 1932 ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); 1933 if (signal_pending(current)) { 1934 if (ksig.sigmask) { 1935 current->saved_sigmask = sigsaved; 1936 set_restore_sigmask(); 1937 } 1938 1939 if (!ret) 1940 ret = -ERESTARTNOHAND; 1941 } else { 1942 if (ksig.sigmask) 1943 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 1944 } 1945 1946 return ret; 1947 } 1948 1949 #ifdef CONFIG_COMPAT 1950 COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, 1951 compat_long_t, min_nr, 1952 compat_long_t, nr, 1953 struct io_event __user *, events, 1954 struct compat_timespec __user *, timeout) 1955 { 1956 struct timespec64 t; 1957 int ret; 1958 1959 if (timeout && compat_get_timespec64(&t, timeout)) 1960 return -EFAULT; 1961 1962 ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); 1963 if (!ret && signal_pending(current)) 1964 ret = -EINTR; 1965 return ret; 1966 } 1967 1968 1969 struct __compat_aio_sigset { 1970 compat_sigset_t __user *sigmask; 1971 compat_size_t sigsetsize; 1972 }; 1973 1974 COMPAT_SYSCALL_DEFINE6(io_pgetevents, 1975 compat_aio_context_t, ctx_id, 1976 compat_long_t, min_nr, 1977 compat_long_t, nr, 1978 struct io_event __user *, events, 1979 struct compat_timespec __user *, timeout, 1980 const struct __compat_aio_sigset __user *, usig) 1981 { 1982 struct __compat_aio_sigset ksig = { NULL, }; 1983 sigset_t ksigmask, sigsaved; 1984 struct timespec64 t; 1985 int ret; 1986 1987 if (timeout && compat_get_timespec64(&t, timeout)) 1988 return -EFAULT; 1989 1990 if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) 1991 return -EFAULT; 1992 1993 if (ksig.sigmask) { 1994 if (ksig.sigsetsize != sizeof(compat_sigset_t)) 1995 return -EINVAL; 1996 if (get_compat_sigset(&ksigmask, ksig.sigmask)) 1997 return -EFAULT; 1998 sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); 1999 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); 2000 } 2001 2002 ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); 2003 if (signal_pending(current)) { 2004 if (ksig.sigmask) { 2005 current->saved_sigmask = sigsaved; 2006 set_restore_sigmask(); 2007 } 2008 if (!ret) 2009 ret = -ERESTARTNOHAND; 2010 } else { 2011 if (ksig.sigmask) 2012 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 2013 } 2014 2015 return ret; 2016 } 2017 #endif 2018