aio.c: 4cd81c3dfc4a34e4a0b6fa577860077c8e5b13af (old) -> db446a08c23d5475e6b08c87acca79ebb20f283c (new)
(unified view: lines removed by the new revision are prefixed with "-", lines it adds with "+"; unchanged context is unprefixed)
/*
 * An async IO implementation for Linux
 * Written by Benjamin LaHaise <bcrl@kvack.org>
 *
 * Implements an efficient asynchronous io interface.
 *
 * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved.
 *
--- 52 unchanged lines hidden ---
        unsigned        header_length;  /* size of aio_ring */


        struct io_event         io_events[0];
}; /* 128 bytes + ring size */

#define AIO_RING_PAGES  8

+struct kioctx_table {
+        struct rcu_head rcu;
+        unsigned        nr;
+        struct kioctx   *table[];
+};
+
struct kioctx_cpu {
        unsigned                reqs_available;
};

struct kioctx {
        struct percpu_ref       users;
        atomic_t                dead;

-        /* This needs improving */
        unsigned long           user_id;
-        struct hlist_node       list;

        struct __percpu kioctx_cpu *cpu;

        /*
         * For percpu reqs_available, number of slots we move to/from global
         * counter at a time:
         */
        unsigned                req_batch;

--- 42 unchanged lines hidden ---

        struct {
                unsigned        tail;
                spinlock_t      completion_lock;
        } ____cacheline_aligned_in_smp;

        struct page             *internal_pages[AIO_RING_PAGES];
        struct file             *aio_ring_file;
+
+        unsigned                id;
};

/*------ sysctl variables----*/
static DEFINE_SPINLOCK(aio_nr_lock);
unsigned long aio_nr;           /* current system wide number of aio requests */
unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
/*----end sysctl variables---*/

--- 175 unchanged lines hidden ---

                return -EAGAIN;
        }

        ctx->user_id = ctx->mmap_base;
        ctx->nr_events = nr_events; /* trusted copy */

        ring = kmap_atomic(ctx->ring_pages[0]);
        ring->nr = nr_events;   /* user copy */
-        ring->id = ctx->user_id;
+        ring->id = ~0U;
        ring->head = ring->tail = 0;
        ring->magic = AIO_RING_MAGIC;
        ring->compat_features = AIO_RING_COMPAT_FEATURES;
        ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
        ring->header_length = sizeof(struct aio_ring);
        kunmap_atomic(ring);
        flush_dcache_page(ctx->ring_pages[0]);

--- 119 unchanged lines hidden ---

static void free_ioctx_ref(struct percpu_ref *ref)
{
        struct kioctx *ctx = container_of(ref, struct kioctx, users);

        INIT_WORK(&ctx->free_work, free_ioctx);
        schedule_work(&ctx->free_work);
}

+static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
+{
+        unsigned i, new_nr;
+        struct kioctx_table *table, *old;
+        struct aio_ring *ring;
+
+        spin_lock(&mm->ioctx_lock);
+        table = rcu_dereference(mm->ioctx_table);
+
+        while (1) {
+                if (table)
+                        for (i = 0; i < table->nr; i++)
+                                if (!table->table[i]) {
+                                        ctx->id = i;
+                                        table->table[i] = ctx;
+                                        spin_unlock(&mm->ioctx_lock);
+
+                                        ring = kmap_atomic(ctx->ring_pages[0]);
+                                        ring->id = ctx->id;
+                                        kunmap_atomic(ring);
+                                        return 0;
+                                }
+
+                new_nr = (table ? table->nr : 1) * 4;
+
+                spin_unlock(&mm->ioctx_lock);
+
+                table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
+                                new_nr, GFP_KERNEL);
+                if (!table)
+                        return -ENOMEM;
+
+                table->nr = new_nr;
+
+                spin_lock(&mm->ioctx_lock);
+                old = rcu_dereference(mm->ioctx_table);
+
+                if (!old) {
+                        rcu_assign_pointer(mm->ioctx_table, table);
+                } else if (table->nr > old->nr) {
+                        memcpy(table->table, old->table,
+                               old->nr * sizeof(struct kioctx *));
+
+                        rcu_assign_pointer(mm->ioctx_table, table);
+                        kfree_rcu(old, rcu);
+                } else {
+                        kfree(table);
+                        table = old;
+                }
+        }
+}
+
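ioctx_add_table() above implements a simple grow-and-retry scheme: it scans mm->ioctx_table for a free slot under mm->ioctx_lock, and when the table is full it drops the lock, allocates a table four times larger, re-takes the lock and either publishes the new table (copying the old entries and deferring the old table's free with kfree_rcu(), since lookup_ioctx() only reads it under rcu_read_lock()) or throws the new table away if another thread grew it first, then rescans. The sketch below is a minimal user-space model of that pattern, with a pthread mutex standing in for ioctx_lock and an immediate free() in place of RCU deferral; struct table and add_entry() are invented names for illustration, not kernel code.

---- illustrative sketch (not part of aio.c) ----
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct table {
        unsigned nr;
        void *slot[];
};

static struct table *cur;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Install @p in the first free slot and return its index via @id,
 * growing the table 4x whenever it is full. */
static int add_entry(void *p, unsigned *id)
{
        struct table *t, *old;
        unsigned i, new_nr;

        pthread_mutex_lock(&lock);
        t = cur;

        while (1) {
                if (t)
                        for (i = 0; i < t->nr; i++)
                                if (!t->slot[i]) {
                                        t->slot[i] = p;
                                        *id = i;
                                        pthread_mutex_unlock(&lock);
                                        return 0;
                                }

                new_nr = (t ? t->nr : 1) * 4;
                pthread_mutex_unlock(&lock);

                /* Allocate the larger table without holding the lock. */
                t = calloc(1, sizeof(*t) + new_nr * sizeof(void *));
                if (!t)
                        return -1;
                t->nr = new_nr;

                pthread_mutex_lock(&lock);
                old = cur;

                if (!old) {
                        cur = t;
                } else if (t->nr > old->nr) {
                        memcpy(t->slot, old->slot, old->nr * sizeof(void *));
                        cur = t;
                        free(old);      /* the kernel must defer this via kfree_rcu() */
                } else {
                        /* Lost the race to a bigger table; retry with that one. */
                        free(t);
                        t = old;
                }
        }
}
---- end sketch ----

In the kernel version the readers (lookup_ioctx(), exit_aio()) never take ioctx_lock, which is why the new table is published with rcu_assign_pointer() and the old one is only freed after a grace period.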
/* ioctx_alloc
 * Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
 */
static struct kioctx *ioctx_alloc(unsigned nr_events)
{
        struct mm_struct *mm = current->mm;
        struct kioctx *ctx;
        int err = -ENOMEM;

--- 42 unchanged lines hidden ---

        if (aio_setup_ring(ctx) < 0)
                goto out_freepcpu;

        atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
        ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
        BUG_ON(!ctx->req_batch);

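        /*
         * [worked example, not in the source] With nr_events = 128 and
         * num_possible_cpus() = 4, req_batch = (128 - 1) / (4 * 4) = 7,
         * i.e. each CPU moves request slots to/from the global
         * reqs_available counter seven at a time.
         */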
+        err = ioctx_add_table(ctx, mm);
+        if (err)
+                goto out_cleanup_noerr;
+
        /* limit the number of system wide aios */
        spin_lock(&aio_nr_lock);
        if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
            aio_nr + nr_events < aio_nr) {
                spin_unlock(&aio_nr_lock);
                goto out_cleanup;
        }
        aio_nr += ctx->max_reqs;
        spin_unlock(&aio_nr_lock);

        percpu_ref_get(&ctx->users);    /* io_setup() will drop this ref */

-        /* now link into global list. */
-        spin_lock(&mm->ioctx_lock);
-        hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
-        spin_unlock(&mm->ioctx_lock);
-
        pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
                 ctx, ctx->user_id, mm, ctx->nr_events);
        return ctx;

out_cleanup:
        err = -EAGAIN;
+out_cleanup_noerr:
        aio_free_ring(ctx);
out_freepcpu:
        free_percpu(ctx->cpu);
out_freeref:
        free_percpu(ctx->users.pcpu_count);
out_freectx:
        if (ctx->aio_ring_file)
                fput(ctx->aio_ring_file);
        kmem_cache_free(kioctx_cachep, ctx);
        pr_debug("error allocating ioctx %d\n", err);
        return ERR_PTR(err);
}

/* kill_ioctx
 * Cancels all outstanding aio requests on an aio context. Used
 * when the processes owning a context have all exited to encourage
 * the rapid destruction of the kioctx.
 */
-static void kill_ioctx(struct kioctx *ctx)
+static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
{
        if (!atomic_xchg(&ctx->dead, 1)) {
-                hlist_del_rcu(&ctx->list);
+                struct kioctx_table *table;
+
+                spin_lock(&mm->ioctx_lock);
+                table = rcu_dereference(mm->ioctx_table);
+
+                WARN_ON(ctx != table->table[ctx->id]);
+                table->table[ctx->id] = NULL;
+                spin_unlock(&mm->ioctx_lock);
+
                /* percpu_ref_kill() will do the necessary call_rcu() */
                wake_up_all(&ctx->wait);

                /*
                 * It'd be more correct to do this in free_ioctx(), after all
                 * the outstanding kiocbs have finished - but by then io_destroy
                 * has already returned, so io_setup() could potentially return
                 * -EAGAIN with no ioctxs actually in use (as far as userspace

--- 32 unchanged lines hidden ---

 * no way for any new requests to be submited or any of the io_* syscalls to be
 * called on the context.
 *
 * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on
 * them.
 */
void exit_aio(struct mm_struct *mm)
{
+        struct kioctx_table *table;
        struct kioctx *ctx;
-        struct hlist_node *n;
+        unsigned i = 0;

-        hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) {
+        while (1) {
+                rcu_read_lock();
+                table = rcu_dereference(mm->ioctx_table);
+
+                do {
+                        if (!table || i >= table->nr) {
+                                rcu_read_unlock();
+                                rcu_assign_pointer(mm->ioctx_table, NULL);
+                                if (table)
+                                        kfree(table);
+                                return;
+                        }
+
+                        ctx = table->table[i++];
+                } while (!ctx);
+
+                rcu_read_unlock();
+
                /*
                 * We don't need to bother with munmap() here -
                 * exit_mmap(mm) is coming and it'll unmap everything.
                 * Since aio_free_ring() uses non-zero ->mmap_size
                 * as indicator that it needs to unmap the area,
                 * just set it to 0; aio_free_ring() is the only
                 * place that uses ->mmap_size, so it's safe.
                 */
                ctx->mmap_size = 0;

-                kill_ioctx(ctx);
+                kill_ioctx(mm, ctx);
        }
}

static void put_reqs_available(struct kioctx *ctx, unsigned nr)
{
        struct kioctx_cpu *kcpu;

        preempt_disable();

--- 66 unchanged lines hidden ---

        fput(req->ki_filp);
        if (req->ki_eventfd != NULL)
                eventfd_ctx_put(req->ki_eventfd);
        kmem_cache_free(kiocb_cachep, req);
}

static struct kioctx *lookup_ioctx(unsigned long ctx_id)
{
+        struct aio_ring __user *ring = (void __user *)ctx_id;
        struct mm_struct *mm = current->mm;
        struct kioctx *ctx, *ret = NULL;
+        struct kioctx_table *table;
+        unsigned id;

+        if (get_user(id, &ring->id))
+                return NULL;
+
        rcu_read_lock();
+        table = rcu_dereference(mm->ioctx_table);

-        hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
-                if (ctx->user_id == ctx_id) {
-                        percpu_ref_get(&ctx->users);
-                        ret = ctx;
-                        break;
-                }
-        }
+        if (!table || id >= table->nr)
+                goto out;

+        ctx = table->table[id];
+        if (ctx->user_id == ctx_id) {
+                percpu_ref_get(&ctx->users);
+                ret = ctx;
+        }
+out:
        rcu_read_unlock();
        return ret;
}

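The rewritten lookup_ioctx() above relies on the fact that the handle returned by io_setup() is the user-space address of the mmap'ed ring (ctx->user_id = ctx->mmap_base above), so the kernel can get_user() the id field straight out of the ring header and use it as an index into mm->ioctx_table, keeping the user_id comparison only as a validity check. The sketch below is a minimal user-space probe of that header; it assumes a Linux box, raw io_setup/io_destroy syscalls, and the struct aio_ring field order used by this file (only partly visible in this excerpt), and struct aio_ring_hdr is an invented name for illustration - the ring header is not a stable ABI.

---- illustrative sketch (not part of aio.c) ----
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>      /* aio_context_t */

struct aio_ring_hdr {           /* mirrors the head of the kernel's struct aio_ring */
        unsigned id;            /* after this change: index into mm->ioctx_table */
        unsigned nr;            /* number of io_events */
        unsigned head;
        unsigned tail;
        unsigned magic;
        unsigned compat_features;
        unsigned incompat_features;
        unsigned header_length; /* size of aio_ring */
};

int main(void)
{
        aio_context_t ctx = 0;

        if (syscall(SYS_io_setup, 128, &ctx)) {
                perror("io_setup");
                return 1;
        }

        /* The handle io_setup() hands back is the address of the mmap'ed ring. */
        struct aio_ring_hdr *ring = (struct aio_ring_hdr *)(unsigned long)ctx;

        printf("magic=%#x id=%u nr=%u header_length=%u\n",
               ring->magic, ring->id, ring->nr, ring->header_length);

        syscall(SYS_io_destroy, ctx);
        return 0;
}
---- end sketch ----

Before this change the id field held ctx->user_id (the handle itself); afterwards it starts out as ~0U and ioctx_add_table() overwrites it with the small table index that lookup_ioctx() reads back.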
/* aio_complete
 * Called when the io request on the given iocb is complete.
 */
void aio_complete(struct kiocb *iocb, long res, long res2)

--- 259 unchanged lines hidden ---

                goto out;
        }

        ioctx = ioctx_alloc(nr_events);
        ret = PTR_ERR(ioctx);
        if (!IS_ERR(ioctx)) {
                ret = put_user(ioctx->user_id, ctxp);
                if (ret)
-                        kill_ioctx(ioctx);
+                        kill_ioctx(current->mm, ioctx);
                percpu_ref_put(&ioctx->users);
        }

out:
        return ret;
}

/* sys_io_destroy:
 * Destroy the aio_context specified.  May cancel any outstanding
 * AIOs and block on completion.  Will fail with -ENOSYS if not
 * implemented.  May fail with -EINVAL if the context pointed to
 * is invalid.
 */
SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
{
        struct kioctx *ioctx = lookup_ioctx(ctx);
        if (likely(NULL != ioctx)) {
-                kill_ioctx(ioctx);
+                kill_ioctx(current->mm, ioctx);
                percpu_ref_put(&ioctx->users);
                return 0;
        }
        pr_debug("EINVAL: io_destroy: invalid context id\n");
        return -EINVAL;
}

typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,

--- 394 unchanged lines hidden ---