/*
 * An async IO implementation for Linux
 * Written by Benjamin LaHaise <bcrl@kvack.org>
 *
 * Implements an efficient asynchronous io interface.
 *
 * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved.
 * Copyright 2018 Christoph Hellwig.
 *
 * See ../COPYING for licensing terms.
 */
#define pr_fmt(fmt) "%s: " fmt, __func__

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/aio_abi.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/backing-dev.h>
#include <linux/refcount.h>
#include <linux/uio.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/aio.h>
#include <linux/highmem.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/eventfd.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
#include <linux/migrate.h>
#include <linux/ramfs.h>
#include <linux/percpu-refcount.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>

#include <linux/uaccess.h>
#include <linux/nospec.h>

#include "internal.h"

#define KIOCB_KEY 0

#define AIO_RING_MAGIC 0xa10a10a1
#define AIO_RING_COMPAT_FEATURES 1
#define AIO_RING_INCOMPAT_FEATURES 0
struct aio_ring {
	unsigned id;	/* kernel internal index number */
	unsigned nr;	/* number of io_events */
	unsigned head;	/* Written to by userland or under ring_lock
			 * mutex by aio_read_events_ring(). */
	unsigned tail;

	unsigned magic;
	unsigned compat_features;
	unsigned incompat_features;
	unsigned header_length;	/* size of aio_ring */


	struct io_event io_events[];
}; /* 128 bytes + ring size */
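
/*
 * A rough sketch (not taken from this file) of how a userspace consumer can
 * read completions straight out of the mmap'ed ring above: the kernel
 * publishes events at 'tail' in aio_complete() and the reader advances
 * 'head', which is why the struct comment says head is written by userland.
 * Ignoring memory barriers and the compat/incompat feature checks that a
 * real implementation (e.g. libaio) performs, the loop looks roughly like:
 *
 *	struct aio_ring *ring = (struct aio_ring *)ctx_id;
 *	while (ring->head != ring->tail) {
 *		struct io_event ev = ring->io_events[ring->head];
 *		ring->head = (ring->head + 1) % ring->nr;
 *		// consume ev.data / ev.res here
 *	}
 */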

/*
 * Plugging is meant to work with larger batches of IOs. If we don't
 * have more than the below, then don't bother setting up a plug.
 */
#define AIO_PLUG_THRESHOLD 2

#define AIO_RING_PAGES 8

struct kioctx_table {
	struct rcu_head rcu;
	unsigned nr;
	struct kioctx __rcu *table[] __counted_by(nr);
};

struct kioctx_cpu {
	unsigned reqs_available;
};

struct ctx_rq_wait {
	struct completion comp;
	atomic_t count;
};

struct kioctx {
	struct percpu_ref users;
	atomic_t dead;

	struct percpu_ref reqs;

	unsigned long user_id;

	struct __percpu kioctx_cpu *cpu;

	/*
	 * For percpu reqs_available, number of slots we move to/from global
	 * counter at a time:
	 */
	unsigned req_batch;
	/*
	 * This is what userspace passed to io_setup(), it's not used for
	 * anything but counting against the global max_reqs quota.
	 *
	 * The real limit is nr_events - 1, which will be larger (see
	 * aio_setup_ring())
	 */
	unsigned max_reqs;

	/* Size of ringbuffer, in units of struct io_event */
	unsigned nr_events;

	unsigned long mmap_base;
	unsigned long mmap_size;

	struct page **ring_pages;
	long nr_pages;

	struct rcu_work free_rwork;	/* see free_ioctx() */

	/*
	 * signals when all in-flight requests are done
	 */
	struct ctx_rq_wait *rq_wait;

	struct {
		/*
		 * This counts the number of available slots in the ringbuffer,
		 * so we avoid overflowing it: it's decremented (if positive)
		 * when allocating a kiocb and incremented when the resulting
		 * io_event is pulled off the ringbuffer.
		 *
		 * We batch accesses to it with a percpu version.
		 */
		atomic_t reqs_available;
	} ____cacheline_aligned_in_smp;

	struct {
		spinlock_t ctx_lock;
		struct list_head active_reqs;	/* used for cancellation */
	} ____cacheline_aligned_in_smp;

	struct {
		struct mutex ring_lock;
		wait_queue_head_t wait;
	} ____cacheline_aligned_in_smp;

	struct {
		unsigned tail;
		unsigned completed_events;
		spinlock_t completion_lock;
	} ____cacheline_aligned_in_smp;

	struct page *internal_pages[AIO_RING_PAGES];
	struct file *aio_ring_file;

	unsigned id;
};

/*
 * First field must be the file pointer in all the
 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 */
struct fsync_iocb {
	struct file *file;
	struct work_struct work;
	bool datasync;
	struct cred *creds;
};

struct poll_iocb {
	struct file *file;
	struct wait_queue_head *head;
	__poll_t events;
	bool cancelled;
	bool work_scheduled;
	bool work_need_resched;
	struct wait_queue_entry wait;
	struct work_struct work;
};

/*
 * NOTE! Each of the iocb union members has the file pointer
 * as the first entry in their struct definition. So you can
 * access the file pointer through any of the sub-structs,
 * or directly as just 'ki_filp' in this struct.
 */
struct aio_kiocb {
	union {
		struct file *ki_filp;
		struct kiocb rw;
		struct fsync_iocb fsync;
		struct poll_iocb poll;
	};

	struct kioctx *ki_ctx;
	kiocb_cancel_fn *ki_cancel;

	struct io_event ki_res;

	struct list_head ki_list;	/* the aio core uses this
					 * for cancellation */
	refcount_t ki_refcnt;

	/*
	 * If the aio_resfd field of the userspace iocb is not zero,
	 * this is the underlying eventfd context to deliver events to.
	 */
	struct eventfd_ctx *ki_eventfd;
};

/*------ sysctl variables----*/
static DEFINE_SPINLOCK(aio_nr_lock);
static unsigned long aio_nr;	/* current system wide number of aio requests */
static unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
/*----end sysctl variables---*/
#ifdef CONFIG_SYSCTL
static struct ctl_table aio_sysctls[] = {
	{
		.procname = "aio-nr",
		.data = &aio_nr,
		.maxlen = sizeof(aio_nr),
		.mode = 0444,
		.proc_handler = proc_doulongvec_minmax,
	},
	{
		.procname = "aio-max-nr",
		.data = &aio_max_nr,
		.maxlen = sizeof(aio_max_nr),
		.mode = 0644,
		.proc_handler = proc_doulongvec_minmax,
	},
	{}
};

static void __init aio_sysctl_init(void)
{
	register_sysctl_init("fs", aio_sysctls);
}
#else
#define aio_sysctl_init() do { } while (0)
#endif
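
/*
 * These knobs are exposed as /proc/sys/fs/aio-nr (read-only: the sum of
 * max_reqs over all live ioctxs, see ioctx_alloc()) and
 * /proc/sys/fs/aio-max-nr (writable).  A purely illustrative way to inspect
 * or raise the limit from userspace:
 *
 *	sysctl fs.aio-nr fs.aio-max-nr
 *	sysctl -w fs.aio-max-nr=1048576
 *
 * Raising aio-max-nr preallocates nothing; it only loosens the aio_nr check
 * in ioctx_alloc().
 */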

static struct kmem_cache *kiocb_cachep;
static struct kmem_cache *kioctx_cachep;

static struct vfsmount *aio_mnt;

static const struct file_operations aio_ring_fops;
static const struct address_space_operations aio_ctx_aops;

static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
{
	struct file *file;
	struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb);
	if (IS_ERR(inode))
		return ERR_CAST(inode);

	inode->i_mapping->a_ops = &aio_ctx_aops;
	inode->i_mapping->private_data = ctx;
	inode->i_size = PAGE_SIZE * nr_pages;

	file = alloc_file_pseudo(inode, aio_mnt, "[aio]",
				 O_RDWR, &aio_ring_fops);
	if (IS_ERR(file))
		iput(inode);
	return file;
}

static int aio_init_fs_context(struct fs_context *fc)
{
	if (!init_pseudo(fc, AIO_RING_MAGIC))
		return -ENOMEM;
	fc->s_iflags |= SB_I_NOEXEC;
	return 0;
}

/* aio_setup
 * Creates the slab caches used by the aio routines, panic on
 * failure as this is done early during the boot sequence.
 */
static int __init aio_setup(void)
{
	static struct file_system_type aio_fs = {
		.name = "aio",
		.init_fs_context = aio_init_fs_context,
		.kill_sb = kill_anon_super,
	};
	aio_mnt = kern_mount(&aio_fs);
	if (IS_ERR(aio_mnt))
		panic("Failed to create aio fs mount.");

	kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
	aio_sysctl_init();
	return 0;
}
__initcall(aio_setup);

static void put_aio_ring_file(struct kioctx *ctx)
{
	struct file *aio_ring_file = ctx->aio_ring_file;
	struct address_space *i_mapping;

	if (aio_ring_file) {
		truncate_setsize(file_inode(aio_ring_file), 0);

		/* Prevent further access to the kioctx from migratepages */
		i_mapping = aio_ring_file->f_mapping;
		spin_lock(&i_mapping->private_lock);
		i_mapping->private_data = NULL;
		ctx->aio_ring_file = NULL;
		spin_unlock(&i_mapping->private_lock);

		fput(aio_ring_file);
	}
}

static void aio_free_ring(struct kioctx *ctx)
{
	int i;

	/* Disconnect the kioctx from the ring file. This prevents future
	 * accesses to the kioctx from page migration.
	 */
	put_aio_ring_file(ctx);

	for (i = 0; i < ctx->nr_pages; i++) {
		struct page *page;
		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
			 page_count(ctx->ring_pages[i]));
		page = ctx->ring_pages[i];
		if (!page)
			continue;
		ctx->ring_pages[i] = NULL;
		put_page(page);
	}

	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
		kfree(ctx->ring_pages);
		ctx->ring_pages = NULL;
	}
}

static int aio_ring_mremap(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct mm_struct *mm = vma->vm_mm;
	struct kioctx_table *table;
	int i, res = -EINVAL;

	spin_lock(&mm->ioctx_lock);
	rcu_read_lock();
	table = rcu_dereference(mm->ioctx_table);
	if (!table)
		goto out_unlock;

	for (i = 0; i < table->nr; i++) {
		struct kioctx *ctx;

		ctx = rcu_dereference(table->table[i]);
		if (ctx && ctx->aio_ring_file == file) {
			if (!atomic_read(&ctx->dead)) {
				ctx->user_id = ctx->mmap_base = vma->vm_start;
				res = 0;
			}
			break;
		}
	}

out_unlock:
	rcu_read_unlock();
	spin_unlock(&mm->ioctx_lock);
	return res;
}

static const struct vm_operations_struct aio_ring_vm_ops = {
	.mremap = aio_ring_mremap,
#if IS_ENABLED(CONFIG_MMU)
	.fault = filemap_fault,
	.map_pages = filemap_map_pages,
	.page_mkwrite = filemap_page_mkwrite,
#endif
};

static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
{
	vm_flags_set(vma, VM_DONTEXPAND);
	vma->vm_ops = &aio_ring_vm_ops;
	return 0;
}

static const struct file_operations aio_ring_fops = {
	.mmap = aio_ring_mmap,
};

#if IS_ENABLED(CONFIG_MIGRATION)
static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
			struct folio *src, enum migrate_mode mode)
{
	struct kioctx *ctx;
	unsigned long flags;
	pgoff_t idx;
	int rc;

	/*
	 * We cannot support the _NO_COPY case here, because copy needs to
	 * happen under the ctx->completion_lock. That does not work with the
	 * migration workflow of MIGRATE_SYNC_NO_COPY.
	 */
	if (mode == MIGRATE_SYNC_NO_COPY)
		return -EINVAL;

	rc = 0;

	/* mapping->private_lock here protects against the kioctx teardown. */
	spin_lock(&mapping->private_lock);
	ctx = mapping->private_data;
	if (!ctx) {
		rc = -EINVAL;
		goto out;
	}

	/* The ring_lock mutex. This prevents aio_read_events() from writing
	 * to the ring's head, and prevents page migration from mucking in
	 * a partially initialized kioctx.
	 */
	if (!mutex_trylock(&ctx->ring_lock)) {
		rc = -EAGAIN;
		goto out;
	}

	idx = src->index;
	if (idx < (pgoff_t)ctx->nr_pages) {
		/* Make sure the old folio hasn't already been changed */
		if (ctx->ring_pages[idx] != &src->page)
			rc = -EAGAIN;
	} else
		rc = -EINVAL;

	if (rc != 0)
		goto out_unlock;

	/* Writeback must be complete */
	BUG_ON(folio_test_writeback(src));
	folio_get(dst);

	rc = folio_migrate_mapping(mapping, dst, src, 1);
	if (rc != MIGRATEPAGE_SUCCESS) {
		folio_put(dst);
		goto out_unlock;
	}

	/* Take completion_lock to prevent other writes to the ring buffer
	 * while the old folio is copied to the new. This prevents new
	 * events from being lost.
	 */
	spin_lock_irqsave(&ctx->completion_lock, flags);
	folio_migrate_copy(dst, src);
	BUG_ON(ctx->ring_pages[idx] != &src->page);
	ctx->ring_pages[idx] = &dst->page;
	spin_unlock_irqrestore(&ctx->completion_lock, flags);

	/* The old folio is no longer accessible. */
	folio_put(src);

out_unlock:
	mutex_unlock(&ctx->ring_lock);
out:
	spin_unlock(&mapping->private_lock);
	return rc;
}
#else
#define aio_migrate_folio NULL
#endif

static const struct address_space_operations aio_ctx_aops = {
	.dirty_folio = noop_dirty_folio,
	.migrate_folio = aio_migrate_folio,
};

static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
{
	struct aio_ring *ring;
	struct mm_struct *mm = current->mm;
	unsigned long size, unused;
	int nr_pages;
	int i;
	struct file *file;

	/* Compensate for the ring buffer's head/tail overlap entry */
	nr_events += 2; /* 1 is required, 2 for good luck */

	size = sizeof(struct aio_ring);
	size += sizeof(struct io_event) * nr_events;

	nr_pages = PFN_UP(size);
	if (nr_pages < 0)
		return -EINVAL;

	file = aio_private_file(ctx, nr_pages);
	if (IS_ERR(file)) {
		ctx->aio_ring_file = NULL;
		return -ENOMEM;
	}

	ctx->aio_ring_file = file;
	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
			/ sizeof(struct io_event);

	ctx->ring_pages = ctx->internal_pages;
	if (nr_pages > AIO_RING_PAGES) {
		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
					  GFP_KERNEL);
		if (!ctx->ring_pages) {
			put_aio_ring_file(ctx);
			return -ENOMEM;
		}
	}

	for (i = 0; i < nr_pages; i++) {
		struct page *page;
		page = find_or_create_page(file->f_mapping,
					   i, GFP_USER | __GFP_ZERO);
		if (!page)
			break;
		pr_debug("pid(%d) page[%d]->count=%d\n",
			 current->pid, i, page_count(page));
		SetPageUptodate(page);
		unlock_page(page);

		ctx->ring_pages[i] = page;
	}
	ctx->nr_pages = i;

	if (unlikely(i != nr_pages)) {
		aio_free_ring(ctx);
		return -ENOMEM;
	}

	ctx->mmap_size = nr_pages * PAGE_SIZE;
	pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);

	if (mmap_write_lock_killable(mm)) {
		ctx->mmap_size = 0;
		aio_free_ring(ctx);
		return -EINTR;
	}

	ctx->mmap_base = do_mmap(ctx->aio_ring_file, 0, ctx->mmap_size,
				 PROT_READ | PROT_WRITE,
				 MAP_SHARED, 0, 0, &unused, NULL);
	mmap_write_unlock(mm);
	if (IS_ERR((void *)ctx->mmap_base)) {
		ctx->mmap_size = 0;
		aio_free_ring(ctx);
		return -ENOMEM;
	}

	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);

	ctx->user_id = ctx->mmap_base;
	ctx->nr_events = nr_events; /* trusted copy */

	ring = page_address(ctx->ring_pages[0]);
	ring->nr = nr_events;	/* user copy */
	ring->id = ~0U;
	ring->head = ring->tail = 0;
	ring->magic = AIO_RING_MAGIC;
	ring->compat_features = AIO_RING_COMPAT_FEATURES;
	ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
	ring->header_length = sizeof(struct aio_ring);
	flush_dcache_page(ctx->ring_pages[0]);

	return 0;
}

#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event))
#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
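
/*
 * Worked example, assuming 4 KiB pages with 32-byte io_events and a 32-byte
 * ring header (other configurations shift the numbers): AIO_EVENTS_PER_PAGE
 * is 128, AIO_EVENTS_FIRST_PAGE is 127 because the header occupies one slot
 * of page 0, and AIO_EVENTS_OFFSET is 1.  Event index i therefore lives at
 * pos = i + AIO_EVENTS_OFFSET, i.e. page pos / AIO_EVENTS_PER_PAGE, slot
 * pos % AIO_EVENTS_PER_PAGE: event 0 is page 0 slot 1, event 200 is page 1
 * slot 73.  This is the same arithmetic aio_complete() uses further down.
 */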

void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
{
	struct aio_kiocb *req;
	struct kioctx *ctx;
	unsigned long flags;

	/*
	 * kiocb didn't come from aio or is neither a read nor a write, hence
	 * ignore it.
	 */
	if (!(iocb->ki_flags & IOCB_AIO_RW))
		return;

	req = container_of(iocb, struct aio_kiocb, rw);

	if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
		return;

	ctx = req->ki_ctx;

	spin_lock_irqsave(&ctx->ctx_lock, flags);
	list_add_tail(&req->ki_list, &ctx->active_reqs);
	req->ki_cancel = cancel;
	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
}
EXPORT_SYMBOL(kiocb_set_cancel_fn);

/*
 * free_ioctx() should be RCU delayed to synchronize against the RCU
 * protected lookup_ioctx() and also needs process context to call
 * aio_free_ring(). Use rcu_work.
 */
static void free_ioctx(struct work_struct *work)
{
	struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx,
					  free_rwork);
	pr_debug("freeing %p\n", ctx);

	aio_free_ring(ctx);
	free_percpu(ctx->cpu);
	percpu_ref_exit(&ctx->reqs);
	percpu_ref_exit(&ctx->users);
	kmem_cache_free(kioctx_cachep, ctx);
}

static void free_ioctx_reqs(struct percpu_ref *ref)
{
	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);

	/* At this point we know that there are no in-flight requests */
	if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
		complete(&ctx->rq_wait->comp);

	/* Synchronize against RCU protected table->table[] dereferences */
	INIT_RCU_WORK(&ctx->free_rwork, free_ioctx);
	queue_rcu_work(system_wq, &ctx->free_rwork);
}

/*
 * When this function runs, the kioctx has been removed from the "hash table"
 * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
 * now it's safe to cancel any that need to be.
 */
static void free_ioctx_users(struct percpu_ref *ref)
{
	struct kioctx *ctx = container_of(ref, struct kioctx, users);
	struct aio_kiocb *req;

	spin_lock_irq(&ctx->ctx_lock);

	while (!list_empty(&ctx->active_reqs)) {
		req = list_first_entry(&ctx->active_reqs,
				       struct aio_kiocb, ki_list);
		req->ki_cancel(&req->rw);
		list_del_init(&req->ki_list);
	}

	spin_unlock_irq(&ctx->ctx_lock);

	percpu_ref_kill(&ctx->reqs);
	percpu_ref_put(&ctx->reqs);
}

static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
{
	unsigned i, new_nr;
	struct kioctx_table *table, *old;
	struct aio_ring *ring;

	spin_lock(&mm->ioctx_lock);
	table = rcu_dereference_raw(mm->ioctx_table);

	while (1) {
		if (table)
			for (i = 0; i < table->nr; i++)
				if (!rcu_access_pointer(table->table[i])) {
					ctx->id = i;
					rcu_assign_pointer(table->table[i], ctx);
					spin_unlock(&mm->ioctx_lock);

					/* While kioctx setup is in progress,
					 * we are protected from page migration
					 * changing ring_pages by ->ring_lock.
					 */
					ring = page_address(ctx->ring_pages[0]);
					ring->id = ctx->id;
					return 0;
				}

		new_nr = (table ? table->nr : 1) * 4;
		spin_unlock(&mm->ioctx_lock);

		table = kzalloc(struct_size(table, table, new_nr), GFP_KERNEL);
		if (!table)
			return -ENOMEM;

		table->nr = new_nr;

		spin_lock(&mm->ioctx_lock);
		old = rcu_dereference_raw(mm->ioctx_table);

		if (!old) {
			rcu_assign_pointer(mm->ioctx_table, table);
		} else if (table->nr > old->nr) {
			memcpy(table->table, old->table,
			       old->nr * sizeof(struct kioctx *));

			rcu_assign_pointer(mm->ioctx_table, table);
			kfree_rcu(old, rcu);
		} else {
			kfree(table);
			table = old;
		}
	}
}

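/*
 * Note on ioctx_add_table() above: the first io_setup() in a process
 * allocates a 4-entry table (new_nr = 1 * 4) and will normally claim id 0;
 * whenever every slot is taken the table is reallocated at four times the
 * size (4, 16, 64, ...).  The chosen index is published to userspace through
 * ring->id, which is what lookup_ioctx() reads back with get_user() to find
 * the context again.
 */
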
static void aio_nr_sub(unsigned nr)
{
	spin_lock(&aio_nr_lock);
	if (WARN_ON(aio_nr - nr > aio_nr))
		aio_nr = 0;
	else
		aio_nr -= nr;
	spin_unlock(&aio_nr_lock);
}

/* ioctx_alloc
 * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed.
 */
static struct kioctx *ioctx_alloc(unsigned nr_events)
{
	struct mm_struct *mm = current->mm;
	struct kioctx *ctx;
	int err = -ENOMEM;

	/*
	 * Store the original nr_events -- what userspace passed to io_setup(),
	 * for counting against the global limit -- before it changes.
	 */
	unsigned int max_reqs = nr_events;

	/*
	 * We keep track of the number of available ringbuffer slots, to prevent
	 * overflow (reqs_available), and we also use percpu counters for this.
	 *
	 * So, since up to half the slots might be on other CPUs' percpu counters
	 * and unavailable, double nr_events so userspace sees the number it
	 * expected. Additionally, we move req_batch slots to/from the percpu
	 * counters at a time, so make sure that isn't 0:
	 */
	nr_events = max(nr_events, num_possible_cpus() * 4);
	nr_events *= 2;

	/* Prevent overflows */
	if (nr_events > (0x10000000U / sizeof(struct io_event))) {
		pr_debug("ENOMEM: nr_events too high\n");
		return ERR_PTR(-EINVAL);
	}

	if (!nr_events || (unsigned long)max_reqs > aio_max_nr)
		return ERR_PTR(-EAGAIN);

	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
	if (!ctx)
		return ERR_PTR(-ENOMEM);

	ctx->max_reqs = max_reqs;

	spin_lock_init(&ctx->ctx_lock);
	spin_lock_init(&ctx->completion_lock);
	mutex_init(&ctx->ring_lock);
	/* Protect against page migration throughout kioctx setup by keeping
	 * the ring_lock mutex held until setup is complete. */
	mutex_lock(&ctx->ring_lock);
	init_waitqueue_head(&ctx->wait);

	INIT_LIST_HEAD(&ctx->active_reqs);

	if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
		goto err;

	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))
		goto err;

	ctx->cpu = alloc_percpu(struct kioctx_cpu);
	if (!ctx->cpu)
		goto err;

	err = aio_setup_ring(ctx, nr_events);
	if (err < 0)
		goto err;

	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
	ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
	if (ctx->req_batch < 1)
		ctx->req_batch = 1;

	/* limit the number of system wide aios */
	spin_lock(&aio_nr_lock);
	if (aio_nr + ctx->max_reqs > aio_max_nr ||
	    aio_nr + ctx->max_reqs < aio_nr) {
		spin_unlock(&aio_nr_lock);
		err = -EAGAIN;
		goto err_ctx;
	}
	aio_nr += ctx->max_reqs;
	spin_unlock(&aio_nr_lock);

	percpu_ref_get(&ctx->users);	/* io_setup() will drop this ref */
	percpu_ref_get(&ctx->reqs);	/* free_ioctx_users() will drop this */

	err = ioctx_add_table(ctx, mm);
	if (err)
		goto err_cleanup;

	/* Release the ring_lock mutex now that all setup is complete. */
	mutex_unlock(&ctx->ring_lock);

	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
		 ctx, ctx->user_id, mm, ctx->nr_events);
	return ctx;

err_cleanup:
	aio_nr_sub(ctx->max_reqs);
err_ctx:
	atomic_set(&ctx->dead, 1);
	if (ctx->mmap_size)
		vm_munmap(ctx->mmap_base, ctx->mmap_size);
	aio_free_ring(ctx);
err:
	mutex_unlock(&ctx->ring_lock);
	free_percpu(ctx->cpu);
	percpu_ref_exit(&ctx->reqs);
	percpu_ref_exit(&ctx->users);
	kmem_cache_free(kioctx_cachep, ctx);
	pr_debug("error allocating ioctx %d\n", err);
	return ERR_PTR(err);
}
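
/*
 * Rough sizing example for ioctx_alloc() (assuming 4 KiB pages, 32-byte
 * io_events and 4 possible CPUs; exact numbers depend on the configuration):
 * io_setup(128, &ctx) stores max_reqs = 128, inflates nr_events to
 * max(128, 4 * 4) * 2 = 256, and aio_setup_ring() rounds 256 + 2 events plus
 * the header up to 3 pages, ending with ctx->nr_events = 383.  reqs_available
 * then starts at 382, req_batch at 382 / (4 * 4) = 23, and only the original
 * 128 is charged against aio_max_nr.
 */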

/* kill_ioctx
 * Cancels all outstanding aio requests on an aio context. Used
 * when the processes owning a context have all exited to encourage
 * the rapid destruction of the kioctx.
 */
static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
		      struct ctx_rq_wait *wait)
{
	struct kioctx_table *table;

	spin_lock(&mm->ioctx_lock);
	if (atomic_xchg(&ctx->dead, 1)) {
		spin_unlock(&mm->ioctx_lock);
		return -EINVAL;
	}

	table = rcu_dereference_raw(mm->ioctx_table);
	WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id]));
	RCU_INIT_POINTER(table->table[ctx->id], NULL);
	spin_unlock(&mm->ioctx_lock);

	/* free_ioctx_reqs() will do the necessary RCU synchronization */
	wake_up_all(&ctx->wait);

	/*
	 * It'd be more correct to do this in free_ioctx(), after all
	 * the outstanding kiocbs have finished - but by then io_destroy
	 * has already returned, so io_setup() could potentially return
	 * -EAGAIN with no ioctxs actually in use (as far as userspace
	 * could tell).
	 */
	aio_nr_sub(ctx->max_reqs);

	if (ctx->mmap_size)
		vm_munmap(ctx->mmap_base, ctx->mmap_size);

	ctx->rq_wait = wait;
	percpu_ref_kill(&ctx->users);
	return 0;
}

/*
 * exit_aio: called when the last user of mm goes away. At this point, there is
 * no way for any new requests to be submitted or any of the io_* syscalls to be
 * called on the context.
 *
 * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on
 * them.
 */
void exit_aio(struct mm_struct *mm)
{
	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
	struct ctx_rq_wait wait;
	int i, skipped;

	if (!table)
		return;

	atomic_set(&wait.count, table->nr);
	init_completion(&wait.comp);

	skipped = 0;
	for (i = 0; i < table->nr; ++i) {
		struct kioctx *ctx =
			rcu_dereference_protected(table->table[i], true);

		if (!ctx) {
			skipped++;
			continue;
		}

		/*
		 * We don't need to bother with munmap() here - exit_mmap(mm)
		 * is coming and it'll unmap everything. And we simply can't,
		 * this is not necessarily our ->mm.
		 * Since kill_ioctx() uses non-zero ->mmap_size as indicator
		 * that it needs to unmap the area, just set it to 0.
		 */
		ctx->mmap_size = 0;
		kill_ioctx(mm, ctx, &wait);
	}

	if (!atomic_sub_and_test(skipped, &wait.count)) {
		/* Wait until all IO for the context is done. */
		wait_for_completion(&wait.comp);
	}

	RCU_INIT_POINTER(mm->ioctx_table, NULL);
	kfree(table);
}

static void put_reqs_available(struct kioctx *ctx, unsigned nr)
{
	struct kioctx_cpu *kcpu;
	unsigned long flags;

	local_irq_save(flags);
	kcpu = this_cpu_ptr(ctx->cpu);
	kcpu->reqs_available += nr;

	while (kcpu->reqs_available >= ctx->req_batch * 2) {
		kcpu->reqs_available -= ctx->req_batch;
		atomic_add(ctx->req_batch, &ctx->reqs_available);
	}

	local_irq_restore(flags);
}

static bool __get_reqs_available(struct kioctx *ctx)
{
	struct kioctx_cpu *kcpu;
	bool ret = false;
	unsigned long flags;

	local_irq_save(flags);
	kcpu = this_cpu_ptr(ctx->cpu);
	if (!kcpu->reqs_available) {
		int avail = atomic_read(&ctx->reqs_available);

		do {
			if (avail < ctx->req_batch)
				goto out;
		} while (!atomic_try_cmpxchg(&ctx->reqs_available,
					     &avail, avail - ctx->req_batch));

		kcpu->reqs_available += ctx->req_batch;
	}

	ret = true;
	kcpu->reqs_available--;
out:
	local_irq_restore(flags);
	return ret;
}
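
/*
 * __get_reqs_available() and put_reqs_available() batch updates to the shared
 * ctx->reqs_available counter: a CPU whose local cache is empty pulls
 * req_batch slots from the global atomic in one go, and only hands a batch
 * back once its local count reaches 2 * req_batch.  With req_batch = 23 (see
 * the sizing example after ioctx_alloc()), a submitter touches the shared
 * cacheline roughly once per 23 allocations instead of on every request,
 * which is also why ioctx_alloc() doubled nr_events.
 */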

/* refill_reqs_available
 * Updates the reqs_available reference counts used for tracking the
 * number of free slots in the completion ring. This can be called
 * from aio_complete() (to optimistically update reqs_available) or
 * from aio_get_req() (the "we're out of events" case). It must be
 * called holding ctx->completion_lock.
 */
static void refill_reqs_available(struct kioctx *ctx, unsigned head,
				  unsigned tail)
{
	unsigned events_in_ring, completed;

	/* Clamp head since userland can write to it. */
	head %= ctx->nr_events;
	if (head <= tail)
		events_in_ring = tail - head;
	else
		events_in_ring = ctx->nr_events - (head - tail);

	completed = ctx->completed_events;
	if (events_in_ring < completed)
		completed -= events_in_ring;
	else
		completed = 0;

	if (!completed)
		return;

	ctx->completed_events -= completed;
	put_reqs_available(ctx, completed);
}
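
/*
 * Illustrative numbers for the wraparound math above: with nr_events = 383,
 * head = 380 and tail = 5, events_in_ring = 383 - (380 - 5) = 8.  If
 * completed_events is 20, then 20 - 8 = 12 of those completions are no longer
 * visible in the ring, so 12 slots are handed back via put_reqs_available()
 * and completed_events drops to 8.
 */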

/* user_refill_reqs_available
 * Called to refill reqs_available when aio_get_req() runs out of
 * space in the completion ring.
 */
static void user_refill_reqs_available(struct kioctx *ctx)
{
	spin_lock_irq(&ctx->completion_lock);
	if (ctx->completed_events) {
		struct aio_ring *ring;
		unsigned head;

		/* Access of ring->head may race with aio_read_events_ring()
		 * here, but that's okay: whether we read the old version
		 * or the new version, either will be valid. The important
1031d856f32aSBenjamin LaHaise * part is that head cannot pass tail since we prevent
1032d856f32aSBenjamin LaHaise * aio_complete() from updating tail by holding
1033d856f32aSBenjamin LaHaise * ctx->completion_lock. Even if head is invalid, the check
1034d856f32aSBenjamin LaHaise * against ctx->completed_events below will make sure we do the
1035d856f32aSBenjamin LaHaise * safe/right thing.
1036d856f32aSBenjamin LaHaise */
10375c075c5bSFabio M. De Francesco ring = page_address(ctx->ring_pages[0]);
1038d856f32aSBenjamin LaHaise head = ring->head;
1039d856f32aSBenjamin LaHaise
1040d856f32aSBenjamin LaHaise refill_reqs_available(ctx, head, ctx->tail);
1041d856f32aSBenjamin LaHaise }
1042d856f32aSBenjamin LaHaise
1043d856f32aSBenjamin LaHaise spin_unlock_irq(&ctx->completion_lock);
1044d856f32aSBenjamin LaHaise }
1045d856f32aSBenjamin LaHaise
get_reqs_available(struct kioctx * ctx)1046432c7997SChristoph Hellwig static bool get_reqs_available(struct kioctx *ctx)
1047432c7997SChristoph Hellwig {
1048432c7997SChristoph Hellwig if (__get_reqs_available(ctx))
1049432c7997SChristoph Hellwig return true;
1050432c7997SChristoph Hellwig user_refill_reqs_available(ctx);
1051432c7997SChristoph Hellwig return __get_reqs_available(ctx);
1052432c7997SChristoph Hellwig }
1053432c7997SChristoph Hellwig
10541da177e4SLinus Torvalds /* aio_get_req
105557282d8fSKent Overstreet * Allocate a slot for an aio request.
105657282d8fSKent Overstreet * Returns NULL if no requests are free.
1057b53119f1SLinus Torvalds *
1058b53119f1SLinus Torvalds * The refcount is initialized to 2 - one for the async op completion,
1059b53119f1SLinus Torvalds * one for the synchronous code that does this.
10601da177e4SLinus Torvalds */
aio_get_req(struct kioctx * ctx)106104b2fa9fSChristoph Hellwig static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
10621da177e4SLinus Torvalds {
106304b2fa9fSChristoph Hellwig struct aio_kiocb *req;
1064a1c8eae7SKent Overstreet
10652bc4ca9bSJens Axboe req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
10661da177e4SLinus Torvalds if (unlikely(!req))
1067432c7997SChristoph Hellwig return NULL;
10681da177e4SLinus Torvalds
1069fa0ca2aeSAl Viro if (unlikely(!get_reqs_available(ctx))) {
10706af1c849SWei Yongjun kmem_cache_free(kiocb_cachep, req);
1071fa0ca2aeSAl Viro return NULL;
1072fa0ca2aeSAl Viro }
1073fa0ca2aeSAl Viro
1074e34ecee2SKent Overstreet percpu_ref_get(&ctx->reqs);
10752bc4ca9bSJens Axboe req->ki_ctx = ctx;
107675321b50SChristoph Hellwig INIT_LIST_HEAD(&req->ki_list);
1077b53119f1SLinus Torvalds refcount_set(&req->ki_refcnt, 2);
10782bc4ca9bSJens Axboe req->ki_eventfd = NULL;
10791da177e4SLinus Torvalds return req;
10801da177e4SLinus Torvalds }
10811da177e4SLinus Torvalds
lookup_ioctx(unsigned long ctx_id)1082d5470b59SAdrian Bunk static struct kioctx *lookup_ioctx(unsigned long ctx_id)
10831da177e4SLinus Torvalds {
1084db446a08SBenjamin LaHaise struct aio_ring __user *ring = (void __user *)ctx_id;
1085abf137ddSJens Axboe struct mm_struct *mm = current->mm;
108665c24491SJeff Moyer struct kioctx *ctx, *ret = NULL;
1087db446a08SBenjamin LaHaise struct kioctx_table *table;
1088db446a08SBenjamin LaHaise unsigned id;
1089db446a08SBenjamin LaHaise
1090db446a08SBenjamin LaHaise if (get_user(id, &ring->id))
1091db446a08SBenjamin LaHaise return NULL;
10921da177e4SLinus Torvalds
1093abf137ddSJens Axboe rcu_read_lock();
1094db446a08SBenjamin LaHaise table = rcu_dereference(mm->ioctx_table);
1095abf137ddSJens Axboe
1096db446a08SBenjamin LaHaise if (!table || id >= table->nr)
1097db446a08SBenjamin LaHaise goto out;
1098db446a08SBenjamin LaHaise
1099a538e3ffSJeff Moyer id = array_index_nospec(id, table->nr);
1100d0264c01STejun Heo ctx = rcu_dereference(table->table[id]);
1101f30d704fSBenjamin LaHaise if (ctx && ctx->user_id == ctx_id) {
1102baf10564SAl Viro if (percpu_ref_tryget_live(&ctx->users))
110365c24491SJeff Moyer ret = ctx;
11041da177e4SLinus Torvalds }
1105db446a08SBenjamin LaHaise out:
1106abf137ddSJens Axboe rcu_read_unlock();
110765c24491SJeff Moyer return ret;
11081da177e4SLinus Torvalds }
11091da177e4SLinus Torvalds
1110b53119f1SLinus Torvalds static inline void iocb_destroy(struct aio_kiocb *iocb)
11119018ccc4SChristoph Hellwig {
111274259703SAl Viro if (iocb->ki_eventfd)
111374259703SAl Viro eventfd_ctx_put(iocb->ki_eventfd);
111484c4e1f8SLinus Torvalds if (iocb->ki_filp)
111584c4e1f8SLinus Torvalds fput(iocb->ki_filp);
11169018ccc4SChristoph Hellwig percpu_ref_put(&iocb->ki_ctx->reqs);
11179018ccc4SChristoph Hellwig kmem_cache_free(kiocb_cachep, iocb);
11189018ccc4SChristoph Hellwig }
1119b53119f1SLinus Torvalds
11201da177e4SLinus Torvalds /* aio_complete
11211da177e4SLinus Torvalds * Called when the io request on the given iocb is complete.
11221da177e4SLinus Torvalds */
11232bb874c0SAl Viro static void aio_complete(struct aio_kiocb *iocb)
11241da177e4SLinus Torvalds {
11251da177e4SLinus Torvalds struct kioctx *ctx = iocb->ki_ctx;
11261da177e4SLinus Torvalds struct aio_ring *ring;
112721b40200SKent Overstreet struct io_event *ev_page, *event;
1128d856f32aSBenjamin LaHaise unsigned tail, pos, head;
11291da177e4SLinus Torvalds unsigned long flags;
11301da177e4SLinus Torvalds
11311da177e4SLinus Torvalds /*
11320460fef2SKent Overstreet * Add a completion event to the ring buffer. Must be done holding
11334b30f07eSTang Chen * ctx->completion_lock to prevent other code from messing with the tail
11340460fef2SKent Overstreet * pointer since we might be called from irq context.
11350460fef2SKent Overstreet */
11360460fef2SKent Overstreet spin_lock_irqsave(&ctx->completion_lock, flags);
11370460fef2SKent Overstreet
113858c85dc2SKent Overstreet tail = ctx->tail;
113921b40200SKent Overstreet pos = tail + AIO_EVENTS_OFFSET;
114021b40200SKent Overstreet
114158c85dc2SKent Overstreet if (++tail >= ctx->nr_events)
11424bf69b2aSKen Chen tail = 0;
11431da177e4SLinus Torvalds
11445c075c5bSFabio M. De Francesco ev_page = page_address(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
114521b40200SKent Overstreet event = ev_page + pos % AIO_EVENTS_PER_PAGE;
114621b40200SKent Overstreet
1147a9339b78SAl Viro *event = iocb->ki_res;
11481da177e4SLinus Torvalds
114958c85dc2SKent Overstreet flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
115021b40200SKent Overstreet
1151a9339b78SAl Viro pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb,
1152a9339b78SAl Viro (void __user *)(unsigned long)iocb->ki_res.obj,
1153a9339b78SAl Viro iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2);
11541da177e4SLinus Torvalds
11551da177e4SLinus Torvalds /* after flagging the request as done, we
11561da177e4SLinus Torvalds * must never even look at it again
11571da177e4SLinus Torvalds */
11581da177e4SLinus Torvalds smp_wmb(); /* make event visible before updating tail */
11591da177e4SLinus Torvalds
116058c85dc2SKent Overstreet ctx->tail = tail;
116121b40200SKent Overstreet
11625c075c5bSFabio M. De Francesco ring = page_address(ctx->ring_pages[0]);
1163d856f32aSBenjamin LaHaise head = ring->head;
11641da177e4SLinus Torvalds ring->tail = tail;
116558c85dc2SKent Overstreet flush_dcache_page(ctx->ring_pages[0]);
11661da177e4SLinus Torvalds
1167d856f32aSBenjamin LaHaise ctx->completed_events++;
1168d856f32aSBenjamin LaHaise if (ctx->completed_events > 1)
1169d856f32aSBenjamin LaHaise refill_reqs_available(ctx, head, tail);
11700460fef2SKent Overstreet spin_unlock_irqrestore(&ctx->completion_lock, flags);
11710460fef2SKent Overstreet
117221b40200SKent Overstreet pr_debug("added to ring %p at [%u]\n", iocb, tail);
11738d1c98b0SDavide Libenzi
11748d1c98b0SDavide Libenzi /*
11758d1c98b0SDavide Libenzi * Check if the user asked us to deliver the result through an
11768d1c98b0SDavide Libenzi * eventfd. The eventfd_signal() function is safe to be called
11778d1c98b0SDavide Libenzi * from IRQ context.
11788d1c98b0SDavide Libenzi */
117974259703SAl Viro if (iocb->ki_eventfd)
11808d1c98b0SDavide Libenzi eventfd_signal(iocb->ki_eventfd, 1);
11818d1c98b0SDavide Libenzi
11826cb2a210SQuentin Barnes /*
11836cb2a210SQuentin Barnes * We have to order our ring_info tail store above and test
11846cb2a210SQuentin Barnes * of the wait list below outside the wait lock. This is
11856cb2a210SQuentin Barnes * like in wake_up_bit() where clearing a bit has to be
11866cb2a210SQuentin Barnes * ordered with the unlocked test.
11876cb2a210SQuentin Barnes */
11886cb2a210SQuentin Barnes smp_mb();
11896cb2a210SQuentin Barnes
11901da177e4SLinus Torvalds if (waitqueue_active(&ctx->wait))
11911da177e4SLinus Torvalds wake_up(&ctx->wait);
11922bb874c0SAl Viro }
11932bb874c0SAl Viro
11942bb874c0SAl Viro static inline void iocb_put(struct aio_kiocb *iocb)
11952bb874c0SAl Viro {
11962bb874c0SAl Viro if (refcount_dec_and_test(&iocb->ki_refcnt)) {
11972bb874c0SAl Viro aio_complete(iocb);
11982bb874c0SAl Viro iocb_destroy(iocb);
11992bb874c0SAl Viro }
12001da177e4SLinus Torvalds }
12011da177e4SLinus Torvalds
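/*
 * Illustrative userspace sketch (not kernel code, excluded from the build by
 * the #if 0 guard): the eventfd delivery described in the comment in
 * aio_complete() above is requested by setting IOCB_FLAG_RESFD and putting
 * the eventfd in aio_resfd.  The helper name and error handling are
 * assumptions for illustration; the io_event itself is still reaped from the
 * completion ring via io_getevents().
 */
#if 0
#include <linux/aio_abi.h>
#include <stdint.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Submit one read and block on the eventfd until aio_complete() signals it. */
static int read_with_eventfd_notification(aio_context_t ctx, int fd,
					   void *buf, size_t len)
{
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	uint64_t nr_completed;
	long got = -1;
	int efd = eventfd(0, 0);

	if (efd < 0)
		return -1;

	memset(&cb, 0, sizeof(cb));
	cb.aio_lio_opcode = IOCB_CMD_PREAD;
	cb.aio_fildes     = fd;
	cb.aio_buf        = (uintptr_t)buf;
	cb.aio_nbytes     = len;
	cb.aio_flags      = IOCB_FLAG_RESFD;	/* signal efd on completion */
	cb.aio_resfd      = efd;

	/* The eventfd counter is bumped once per completed iocb. */
	if (syscall(__NR_io_submit, ctx, 1, cbs) == 1 &&
	    read(efd, &nr_completed, sizeof(nr_completed)) == sizeof(nr_completed))
		got = syscall(__NR_io_getevents, ctx, 1, 1, &ev, NULL);

	close(efd);
	return got == 1 ? 0 : -1;
}
#endif
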
12022be4e7deSGu Zheng /* aio_read_events_ring
12031da177e4SLinus Torvalds * Pull an event off of the ioctx's event ring. Returns the number of
1204a31ad380SKent Overstreet * events fetched
12051da177e4SLinus Torvalds */
1206a31ad380SKent Overstreet static long aio_read_events_ring(struct kioctx *ctx,
1207a31ad380SKent Overstreet struct io_event __user *event, long nr)
12081da177e4SLinus Torvalds {
12091da177e4SLinus Torvalds struct aio_ring *ring;
12105ffac122SKent Overstreet unsigned head, tail, pos;
1211a31ad380SKent Overstreet long ret = 0;
1212a31ad380SKent Overstreet int copy_ret;
1213a31ad380SKent Overstreet
12149c9ce763SDave Chinner /*
12159c9ce763SDave Chinner * The mutex can block and wake us up and that will cause
12169c9ce763SDave Chinner * wait_event_interruptible_hrtimeout() to schedule without sleeping
12179c9ce763SDave Chinner * and repeat. This should be rare enough that it doesn't cause
12189c9ce763SDave Chinner * performance issues. See the comment in read_events() for more detail.
12199c9ce763SDave Chinner */
12209c9ce763SDave Chinner sched_annotate_sleep();
122158c85dc2SKent Overstreet mutex_lock(&ctx->ring_lock);
12221da177e4SLinus Torvalds
1223fa8a53c3SBenjamin LaHaise /* Access to ->ring_pages here is protected by ctx->ring_lock. */
12245c075c5bSFabio M. De Francesco ring = page_address(ctx->ring_pages[0]);
1225a31ad380SKent Overstreet head = ring->head;
12265ffac122SKent Overstreet tail = ring->tail;
12271da177e4SLinus Torvalds
12282ff396beSJeff Moyer /*
12292ff396beSJeff Moyer * Ensure that once we've read the current tail pointer, that
12302ff396beSJeff Moyer * we also see the events that were stored up to the tail.
12312ff396beSJeff Moyer */
12322ff396beSJeff Moyer smp_rmb();
12332ff396beSJeff Moyer
12345ffac122SKent Overstreet pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
1235a31ad380SKent Overstreet
12365ffac122SKent Overstreet if (head == tail)
12371da177e4SLinus Torvalds goto out;
12381da177e4SLinus Torvalds
1239edfbbf38SBenjamin LaHaise head %= ctx->nr_events;
1240edfbbf38SBenjamin LaHaise tail %= ctx->nr_events;
1241edfbbf38SBenjamin LaHaise
1242a31ad380SKent Overstreet while (ret < nr) {
1243a31ad380SKent Overstreet long avail;
1244a31ad380SKent Overstreet struct io_event *ev;
1245a31ad380SKent Overstreet struct page *page;
12461da177e4SLinus Torvalds
12475ffac122SKent Overstreet avail = (head <= tail ? tail : ctx->nr_events) - head;
12485ffac122SKent Overstreet if (head == tail)
1249a31ad380SKent Overstreet break;
1250a31ad380SKent Overstreet
1251a31ad380SKent Overstreet pos = head + AIO_EVENTS_OFFSET;
125258c85dc2SKent Overstreet page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
1253a31ad380SKent Overstreet pos %= AIO_EVENTS_PER_PAGE;
1254a31ad380SKent Overstreet
1255d2988bd4SAl Viro avail = min(avail, nr - ret);
1256d2988bd4SAl Viro avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos);
1257d2988bd4SAl Viro
12585c075c5bSFabio M. De Francesco ev = page_address(page);
1259a31ad380SKent Overstreet copy_ret = copy_to_user(event + ret, ev + pos,
1260a31ad380SKent Overstreet sizeof(*ev) * avail);
1261a31ad380SKent Overstreet
1262a31ad380SKent Overstreet if (unlikely(copy_ret)) {
1263a31ad380SKent Overstreet ret = -EFAULT;
1264a31ad380SKent Overstreet goto out;
12651da177e4SLinus Torvalds }
12661da177e4SLinus Torvalds
1267a31ad380SKent Overstreet ret += avail;
1268a31ad380SKent Overstreet head += avail;
126958c85dc2SKent Overstreet head %= ctx->nr_events;
1270a31ad380SKent Overstreet }
1271a31ad380SKent Overstreet
12725c075c5bSFabio M. De Francesco ring = page_address(ctx->ring_pages[0]);
1273a31ad380SKent Overstreet ring->head = head;
127458c85dc2SKent Overstreet flush_dcache_page(ctx->ring_pages[0]);
1275a31ad380SKent Overstreet
12765ffac122SKent Overstreet pr_debug("%li h%u t%u\n", ret, head, tail);
1277a31ad380SKent Overstreet out:
127858c85dc2SKent Overstreet mutex_unlock(&ctx->ring_lock);
1279a31ad380SKent Overstreet
12801da177e4SLinus Torvalds return ret;
12811da177e4SLinus Torvalds }
12821da177e4SLinus Torvalds
1283a31ad380SKent Overstreet static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
1284a31ad380SKent Overstreet struct io_event __user *event, long *i)
12851da177e4SLinus Torvalds {
1286a31ad380SKent Overstreet long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
12871da177e4SLinus Torvalds
1288a31ad380SKent Overstreet if (ret > 0)
1289a31ad380SKent Overstreet *i += ret;
1290a31ad380SKent Overstreet
1291a31ad380SKent Overstreet if (unlikely(atomic_read(&ctx->dead)))
1292a31ad380SKent Overstreet ret = -EINVAL;
1293a31ad380SKent Overstreet
1294a31ad380SKent Overstreet if (!*i)
1295a31ad380SKent Overstreet *i = ret;
1296a31ad380SKent Overstreet
1297a31ad380SKent Overstreet return ret < 0 || *i >= min_nr;
12981da177e4SLinus Torvalds }
12991da177e4SLinus Torvalds
1300a31ad380SKent Overstreet static long read_events(struct kioctx *ctx, long min_nr, long nr,
13011da177e4SLinus Torvalds struct io_event __user *event,
1302fa2e62a5SDeepa Dinamani ktime_t until)
13031da177e4SLinus Torvalds {
1304a31ad380SKent Overstreet long ret = 0;
13051da177e4SLinus Torvalds
1306a31ad380SKent Overstreet /*
1307a31ad380SKent Overstreet * Note that aio_read_events() is being called as the conditional - i.e.
1308a31ad380SKent Overstreet * we're calling it after prepare_to_wait() has set task state to
1309a31ad380SKent Overstreet * TASK_INTERRUPTIBLE.
1310a31ad380SKent Overstreet *
1311a31ad380SKent Overstreet * But aio_read_events() can block, and if it blocks it's going to flip
1312a31ad380SKent Overstreet * the task state back to TASK_RUNNING.
1313a31ad380SKent Overstreet *
1314a31ad380SKent Overstreet * This should be ok, provided it doesn't flip the state back to
1315a31ad380SKent Overstreet * TASK_RUNNING and return 0 too much - that causes us to spin. That
1316a31ad380SKent Overstreet * will only happen if the mutex_lock() call blocks, and we then find
1317a31ad380SKent Overstreet * the ringbuffer empty. So in practice we should be ok, but it's
1318a31ad380SKent Overstreet * something to be aware of when touching this code.
1319a31ad380SKent Overstreet */
13202456e855SThomas Gleixner if (until == 0)
13215f785de5SFam Zheng aio_read_events(ctx, min_nr, nr, event, &ret);
13225f785de5SFam Zheng else
1323a31ad380SKent Overstreet wait_event_interruptible_hrtimeout(ctx->wait,
13245f785de5SFam Zheng aio_read_events(ctx, min_nr, nr, event, &ret),
13255f785de5SFam Zheng until);
1326a31ad380SKent Overstreet return ret;
13271da177e4SLinus Torvalds }
13281da177e4SLinus Torvalds
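/*
 * Illustrative userspace sketch (not kernel code, excluded from the build by
 * the #if 0 guard): how the min_nr/nr/timeout handling in read_events() above
 * looks from the io_getevents() side.  The helper name and the one-second
 * timeout are assumptions for illustration.
 */
#if 0
#include <linux/aio_abi.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

/*
 * Reap between min_nr and nr completions, waiting at most one second.
 * A NULL timeout means "wait indefinitely" (KTIME_MAX in do_io_getevents());
 * a zeroed timespec means "check the ring once and return immediately".
 */
static long reap_some(aio_context_t ctx, struct io_event *events,
		      long min_nr, long nr)
{
	struct timespec timeout = { .tv_sec = 1, .tv_nsec = 0 };
	long got = syscall(__NR_io_getevents, ctx, min_nr, nr, events, &timeout);

	if (got < 0) {
		perror("io_getevents");
		return got;
	}
	for (long i = 0; i < got; i++)
		printf("iocb %#llx: res=%lld res2=%lld\n",
		       (unsigned long long)events[i].obj,
		       (long long)events[i].res, (long long)events[i].res2);
	return got;
}
#endif
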
13291da177e4SLinus Torvalds /* sys_io_setup:
13301da177e4SLinus Torvalds * Create an aio_context capable of receiving at least nr_events.
13311da177e4SLinus Torvalds * ctxp must not point to an aio_context that already exists, and
13321da177e4SLinus Torvalds * must be initialized to 0 prior to the call. On successful
13331da177e4SLinus Torvalds * creation of the aio_context, *ctxp is filled in with the resulting
13341da177e4SLinus Torvalds * handle. May fail with -EINVAL if *ctxp is not initialized,
13351da177e4SLinus Torvalds * or if the specified nr_events exceeds internal limits. May fail
13361da177e4SLinus Torvalds * with -EAGAIN if the specified nr_events exceeds the user's limit
13371da177e4SLinus Torvalds * of available events. May fail with -ENOMEM if insufficient kernel
13381da177e4SLinus Torvalds * resources are available. May fail with -EFAULT if an invalid
13391da177e4SLinus Torvalds * pointer is passed for ctxp. Will fail with -ENOSYS if not
13401da177e4SLinus Torvalds * implemented.
13411da177e4SLinus Torvalds */
1342002c8976SHeiko Carstens SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
13431da177e4SLinus Torvalds {
13441da177e4SLinus Torvalds struct kioctx *ioctx = NULL;
13451da177e4SLinus Torvalds unsigned long ctx;
13461da177e4SLinus Torvalds long ret;
13471da177e4SLinus Torvalds
13481da177e4SLinus Torvalds ret = get_user(ctx, ctxp);
13491da177e4SLinus Torvalds if (unlikely(ret))
13501da177e4SLinus Torvalds goto out;
13511da177e4SLinus Torvalds
13521da177e4SLinus Torvalds ret = -EINVAL;
1353d55b5fdaSZach Brown if (unlikely(ctx || nr_events == 0)) {
1354acd88d4eSKinglong Mee pr_debug("EINVAL: ctx %lu nr_events %u\n",
1355d55b5fdaSZach Brown ctx, nr_events);
13561da177e4SLinus Torvalds goto out;
13571da177e4SLinus Torvalds }
13581da177e4SLinus Torvalds
13591da177e4SLinus Torvalds ioctx = ioctx_alloc(nr_events);
13601da177e4SLinus Torvalds ret = PTR_ERR(ioctx);
13611da177e4SLinus Torvalds if (!IS_ERR(ioctx)) {
13621da177e4SLinus Torvalds ret = put_user(ioctx->user_id, ctxp);
1363a2e1859aSAl Viro if (ret)
1364e02ba72aSAnatol Pomozov kill_ioctx(current->mm, ioctx, NULL);
1365723be6e3SKent Overstreet percpu_ref_put(&ioctx->users);
13661da177e4SLinus Torvalds }
13671da177e4SLinus Torvalds
13681da177e4SLinus Torvalds out:
13691da177e4SLinus Torvalds return ret;
13701da177e4SLinus Torvalds }
13711da177e4SLinus Torvalds
1372c00d2c7eSAl Viro #ifdef CONFIG_COMPAT
1373c00d2c7eSAl Viro COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p)
1374c00d2c7eSAl Viro {
1375c00d2c7eSAl Viro struct kioctx *ioctx = NULL;
1376c00d2c7eSAl Viro unsigned long ctx;
1377c00d2c7eSAl Viro long ret;
1378c00d2c7eSAl Viro
1379c00d2c7eSAl Viro ret = get_user(ctx, ctx32p);
1380c00d2c7eSAl Viro if (unlikely(ret))
1381c00d2c7eSAl Viro goto out;
1382c00d2c7eSAl Viro
1383c00d2c7eSAl Viro ret = -EINVAL;
1384c00d2c7eSAl Viro if (unlikely(ctx || nr_events == 0)) {
1385c00d2c7eSAl Viro pr_debug("EINVAL: ctx %lu nr_events %u\n",
1386c00d2c7eSAl Viro ctx, nr_events);
1387c00d2c7eSAl Viro goto out;
1388c00d2c7eSAl Viro }
1389c00d2c7eSAl Viro
1390c00d2c7eSAl Viro ioctx = ioctx_alloc(nr_events);
1391c00d2c7eSAl Viro ret = PTR_ERR(ioctx);
1392c00d2c7eSAl Viro if (!IS_ERR(ioctx)) {
1393c00d2c7eSAl Viro /* truncating is ok because it's a user address */
1394c00d2c7eSAl Viro ret = put_user((u32)ioctx->user_id, ctx32p);
1395c00d2c7eSAl Viro if (ret)
1396c00d2c7eSAl Viro kill_ioctx(current->mm, ioctx, NULL);
1397c00d2c7eSAl Viro percpu_ref_put(&ioctx->users);
1398c00d2c7eSAl Viro }
1399c00d2c7eSAl Viro
1400c00d2c7eSAl Viro out:
1401c00d2c7eSAl Viro return ret;
1402c00d2c7eSAl Viro }
1403c00d2c7eSAl Viro #endif
1404c00d2c7eSAl Viro
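/*
 * Illustrative userspace sketch (not kernel code, excluded from the build by
 * the #if 0 guard): the io_setup()/io_destroy() lifecycle documented above
 * and below, driven through the raw syscall numbers.  The event count of 128
 * is an arbitrary example value.
 */
#if 0
#include <linux/aio_abi.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	aio_context_t ctx = 0;	/* must be zero-initialized before io_setup() */

	if (syscall(__NR_io_setup, 128 /* nr_events */, &ctx) < 0) {
		/* Typical failures: EAGAIN (aio-max-nr exceeded), EINVAL, EFAULT. */
		perror("io_setup");
		return 1;
	}
	printf("aio context handle: %#llx\n", (unsigned long long)ctx);

	/* ... io_submit()/io_getevents() would go here ... */

	/* Blocks until in-flight requests are done, then tears the ring down. */
	if (syscall(__NR_io_destroy, ctx) < 0)
		perror("io_destroy");
	return 0;
}
#endif
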
14051da177e4SLinus Torvalds /* sys_io_destroy:
14061da177e4SLinus Torvalds * Destroy the aio_context specified. May cancel any outstanding
14071da177e4SLinus Torvalds * AIOs and block on completion. Will fail with -ENOSYS if not
1408642b5123SSatoru Takeuchi * implemented. May fail with -EINVAL if the context pointed to
14091da177e4SLinus Torvalds * is invalid.
14101da177e4SLinus Torvalds */
1411002c8976SHeiko Carstens SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
14121da177e4SLinus Torvalds {
14131da177e4SLinus Torvalds struct kioctx *ioctx = lookup_ioctx(ctx);
14141da177e4SLinus Torvalds if (likely(NULL != ioctx)) {
1415dc48e56dSJens Axboe struct ctx_rq_wait wait;
1416fb2d4483SBenjamin LaHaise int ret;
1417e02ba72aSAnatol Pomozov
1418dc48e56dSJens Axboe init_completion(&wait.comp);
1419dc48e56dSJens Axboe atomic_set(&wait.count, 1);
1420dc48e56dSJens Axboe
1421e02ba72aSAnatol Pomozov /* Pass requests_done to kill_ioctx() where it can be set
1422e02ba72aSAnatol Pomozov * in a thread-safe way. If we try to set it here then we have
1423e02ba72aSAnatol Pomozov * a race condition if two io_destroy() calls run simultaneously.
1424e02ba72aSAnatol Pomozov */
1425dc48e56dSJens Axboe ret = kill_ioctx(current->mm, ioctx, &wait);
1426723be6e3SKent Overstreet percpu_ref_put(&ioctx->users);
1427e02ba72aSAnatol Pomozov
1428e02ba72aSAnatol Pomozov /* Wait until all IO for the context is done. Otherwise the kernel
1429e02ba72aSAnatol Pomozov * keeps using user-space buffers even if the user thinks the context
1430e02ba72aSAnatol Pomozov * is destroyed.
1431e02ba72aSAnatol Pomozov */
1432fb2d4483SBenjamin LaHaise if (!ret)
1433dc48e56dSJens Axboe wait_for_completion(&wait.comp);
1434e02ba72aSAnatol Pomozov
1435fb2d4483SBenjamin LaHaise return ret;
14361da177e4SLinus Torvalds }
1437acd88d4eSKinglong Mee pr_debug("EINVAL: invalid context id\n");
14381da177e4SLinus Torvalds return -EINVAL;
14391da177e4SLinus Torvalds }
14401da177e4SLinus Torvalds
14413c96c7f4SAl Viro static void aio_remove_iocb(struct aio_kiocb *iocb)
14423c96c7f4SAl Viro {
14433c96c7f4SAl Viro struct kioctx *ctx = iocb->ki_ctx;
14443c96c7f4SAl Viro unsigned long flags;
14453c96c7f4SAl Viro
14463c96c7f4SAl Viro spin_lock_irqsave(&ctx->ctx_lock, flags);
14473c96c7f4SAl Viro list_del(&iocb->ki_list);
14483c96c7f4SAl Viro spin_unlock_irqrestore(&ctx->ctx_lock, flags);
14493c96c7f4SAl Viro }
14503c96c7f4SAl Viro
14516b19b766SJens Axboe static void aio_complete_rw(struct kiocb *kiocb, long res)
145254843f87SChristoph Hellwig {
145354843f87SChristoph Hellwig struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);
145454843f87SChristoph Hellwig
14553c96c7f4SAl Viro if (!list_empty_careful(&iocb->ki_list))
14563c96c7f4SAl Viro aio_remove_iocb(iocb);
14573c96c7f4SAl Viro
145854843f87SChristoph Hellwig if (kiocb->ki_flags & IOCB_WRITE) {
145954843f87SChristoph Hellwig struct inode *inode = file_inode(kiocb->ki_filp);
146054843f87SChristoph Hellwig
146154843f87SChristoph Hellwig if (S_ISREG(inode->i_mode))
14628c3cfa80SAmir Goldstein kiocb_end_write(kiocb);
146354843f87SChristoph Hellwig }
146454843f87SChristoph Hellwig
14652bb874c0SAl Viro iocb->ki_res.res = res;
14666b19b766SJens Axboe iocb->ki_res.res2 = 0;
14672bb874c0SAl Viro iocb_put(iocb);
146854843f87SChristoph Hellwig }
146954843f87SChristoph Hellwig
147088a6f18bSJens Axboe static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
147154843f87SChristoph Hellwig {
147254843f87SChristoph Hellwig int ret;
147354843f87SChristoph Hellwig
147454843f87SChristoph Hellwig req->ki_complete = aio_complete_rw;
1475ec51f8eeSMike Marshall req->private = NULL;
147654843f87SChristoph Hellwig req->ki_pos = iocb->aio_offset;
1477e7e23fc5SBart Van Assche req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW;
147854843f87SChristoph Hellwig if (iocb->aio_flags & IOCB_FLAG_RESFD)
147954843f87SChristoph Hellwig req->ki_flags |= IOCB_EVENTFD;
1480d9a08a9eSAdam Manzanares if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
1481d9a08a9eSAdam Manzanares /*
1482d9a08a9eSAdam Manzanares * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then
1483d9a08a9eSAdam Manzanares * aio_reqprio is interpreted as an I/O scheduling
1484d9a08a9eSAdam Manzanares * class and priority.
1485d9a08a9eSAdam Manzanares */
1486d9a08a9eSAdam Manzanares ret = ioprio_check_cap(iocb->aio_reqprio);
1487d9a08a9eSAdam Manzanares if (ret) {
14889a6d9a62SAdam Manzanares pr_debug("aio ioprio check cap error: %d\n", ret);
148984c4e1f8SLinus Torvalds return ret;
1490d9a08a9eSAdam Manzanares }
1491d9a08a9eSAdam Manzanares
1492d9a08a9eSAdam Manzanares req->ki_ioprio = iocb->aio_reqprio;
1493d9a08a9eSAdam Manzanares } else
149476dc8913SDamien Le Moal req->ki_ioprio = get_current_ioprio();
1495d9a08a9eSAdam Manzanares
149654843f87SChristoph Hellwig ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
149754843f87SChristoph Hellwig if (unlikely(ret))
149884c4e1f8SLinus Torvalds return ret;
1499154989e4SChristoph Hellwig
1500154989e4SChristoph Hellwig req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */
1501154989e4SChristoph Hellwig return 0;
150254843f87SChristoph Hellwig }
150354843f87SChristoph Hellwig
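/*
 * Illustrative userspace sketch (not kernel code, excluded from the build by
 * the #if 0 guard): how a submitter requests the per-request I/O priority
 * that aio_prep_rw() above checks with ioprio_check_cap().  The helper name
 * and the EX_* constants are assumptions; the encoding is meant to mirror
 * IOPRIO_PRIO_VALUE()/IOPRIO_CLASS_BE from the uapi <linux/ioprio.h>, so
 * double-check against that header rather than trusting the literals here.
 */
#if 0
#include <linux/aio_abi.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define EX_IOPRIO_CLASS_SHIFT	13	/* scheduling class lives in the top bits */
#define EX_IOPRIO_CLASS_BE	2	/* best-effort scheduling class */

static void prep_pread_with_ioprio(struct iocb *cb, int fd,
				   void *buf, size_t len, int64_t off)
{
	memset(cb, 0, sizeof(*cb));
	cb->aio_lio_opcode = IOCB_CMD_PREAD;
	cb->aio_fildes     = fd;
	cb->aio_buf        = (uintptr_t)buf;
	cb->aio_nbytes     = len;
	cb->aio_offset     = off;
	cb->aio_flags      = IOCB_FLAG_IOPRIO;
	/* Best-effort class, level 4; ioprio_check_cap() rejects values the
	 * caller is not allowed to use (e.g. realtime without privilege). */
	cb->aio_reqprio    = (EX_IOPRIO_CLASS_BE << EX_IOPRIO_CLASS_SHIFT) | 4;
}
#endif
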
150487e5e6daSJens Axboe static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
150587e5e6daSJens Axboe struct iovec **iovec, bool vectored, bool compat,
150687e5e6daSJens Axboe struct iov_iter *iter)
1507eed4e51fSBadari Pulavarty {
150889319d31SChristoph Hellwig void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf;
150989319d31SChristoph Hellwig size_t len = iocb->aio_nbytes;
1510eed4e51fSBadari Pulavarty
151189319d31SChristoph Hellwig if (!vectored) {
151289319d31SChristoph Hellwig ssize_t ret = import_single_range(rw, buf, len, *iovec, iter);
151389319d31SChristoph Hellwig *iovec = NULL;
151441ef4eb8SKent Overstreet return ret;
15158bc92afcSKent Overstreet }
151689cd35c5SChristoph Hellwig
151789cd35c5SChristoph Hellwig return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat);
15181da177e4SLinus Torvalds }
15191da177e4SLinus Torvalds
15209061d14aSAl Viro static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
152189319d31SChristoph Hellwig {
152289319d31SChristoph Hellwig switch (ret) {
152389319d31SChristoph Hellwig case -EIOCBQUEUED:
15249061d14aSAl Viro break;
152589319d31SChristoph Hellwig case -ERESTARTSYS:
152689319d31SChristoph Hellwig case -ERESTARTNOINTR:
152789319d31SChristoph Hellwig case -ERESTARTNOHAND:
152889319d31SChristoph Hellwig case -ERESTART_RESTARTBLOCK:
152941ef4eb8SKent Overstreet /*
153041ef4eb8SKent Overstreet * There's no easy way to restart the syscall since other AIOs
153141ef4eb8SKent Overstreet * may already be running. Just fail this IO with EINTR.
153241ef4eb8SKent Overstreet */
153341ef4eb8SKent Overstreet ret = -EINTR;
1534df561f66SGustavo A. R. Silva fallthrough;
153589319d31SChristoph Hellwig default:
15366b19b766SJens Axboe req->ki_complete(req, ret);
153789319d31SChristoph Hellwig }
153841ef4eb8SKent Overstreet }
15391da177e4SLinus Torvalds
1540958c13ceSAl Viro static int aio_read(struct kiocb *req, const struct iocb *iocb,
154188a6f18bSJens Axboe bool vectored, bool compat)
154289319d31SChristoph Hellwig {
154389319d31SChristoph Hellwig struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
154489319d31SChristoph Hellwig struct iov_iter iter;
154554843f87SChristoph Hellwig struct file *file;
1546958c13ceSAl Viro int ret;
154789319d31SChristoph Hellwig
154854843f87SChristoph Hellwig ret = aio_prep_rw(req, iocb);
154954843f87SChristoph Hellwig if (ret)
155054843f87SChristoph Hellwig return ret;
155154843f87SChristoph Hellwig file = req->ki_filp;
155289319d31SChristoph Hellwig if (unlikely(!(file->f_mode & FMODE_READ)))
155384c4e1f8SLinus Torvalds return -EBADF;
155489319d31SChristoph Hellwig if (unlikely(!file->f_op->read_iter))
155584c4e1f8SLinus Torvalds return -EINVAL;
155689319d31SChristoph Hellwig
1557de4eda9dSAl Viro ret = aio_setup_rw(ITER_DEST, iocb, &iovec, vectored, compat, &iter);
155887e5e6daSJens Axboe if (ret < 0)
155984c4e1f8SLinus Torvalds return ret;
156089319d31SChristoph Hellwig ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
156189319d31SChristoph Hellwig if (!ret)
15629061d14aSAl Viro aio_rw_done(req, call_read_iter(file, req, &iter));
156389319d31SChristoph Hellwig kfree(iovec);
156489319d31SChristoph Hellwig return ret;
156589319d31SChristoph Hellwig }
156689319d31SChristoph Hellwig
1567958c13ceSAl Viro static int aio_write(struct kiocb *req, const struct iocb *iocb,
156888a6f18bSJens Axboe bool vectored, bool compat)
156989319d31SChristoph Hellwig {
157089319d31SChristoph Hellwig struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
157189319d31SChristoph Hellwig struct iov_iter iter;
157254843f87SChristoph Hellwig struct file *file;
1573958c13ceSAl Viro int ret;
157489319d31SChristoph Hellwig
157554843f87SChristoph Hellwig ret = aio_prep_rw(req, iocb);
157654843f87SChristoph Hellwig if (ret)
157754843f87SChristoph Hellwig return ret;
157854843f87SChristoph Hellwig file = req->ki_filp;
157954843f87SChristoph Hellwig
158089319d31SChristoph Hellwig if (unlikely(!(file->f_mode & FMODE_WRITE)))
158184c4e1f8SLinus Torvalds return -EBADF;
158289319d31SChristoph Hellwig if (unlikely(!file->f_op->write_iter))
158384c4e1f8SLinus Torvalds return -EINVAL;
158489319d31SChristoph Hellwig
1585de4eda9dSAl Viro ret = aio_setup_rw(ITER_SOURCE, iocb, &iovec, vectored, compat, &iter);
158687e5e6daSJens Axboe if (ret < 0)
158784c4e1f8SLinus Torvalds return ret;
158889319d31SChristoph Hellwig ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
158989319d31SChristoph Hellwig if (!ret) {
15908c3cfa80SAmir Goldstein if (S_ISREG(file_inode(file)->i_mode))
15918c3cfa80SAmir Goldstein kiocb_start_write(req);
159292ce4728SChristoph Hellwig req->ki_flags |= IOCB_WRITE;
15939061d14aSAl Viro aio_rw_done(req, call_write_iter(file, req, &iter));
159492ce4728SChristoph Hellwig }
159589319d31SChristoph Hellwig kfree(iovec);
159689319d31SChristoph Hellwig return ret;
15971da177e4SLinus Torvalds }
15981da177e4SLinus Torvalds
1599a3c0d439SChristoph Hellwig static void aio_fsync_work(struct work_struct *work)
1600a3c0d439SChristoph Hellwig {
16012bb874c0SAl Viro struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work);
1602530f32fcSMiklos Szeredi const struct cred *old_cred = override_creds(iocb->fsync.creds);
1603a3c0d439SChristoph Hellwig
16042bb874c0SAl Viro iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync);
1605530f32fcSMiklos Szeredi revert_creds(old_cred);
1606530f32fcSMiklos Szeredi put_cred(iocb->fsync.creds);
16072bb874c0SAl Viro iocb_put(iocb);
1608a3c0d439SChristoph Hellwig }
1609a3c0d439SChristoph Hellwig
161088a6f18bSJens Axboe static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
161188a6f18bSJens Axboe bool datasync)
1612a3c0d439SChristoph Hellwig {
1613a3c0d439SChristoph Hellwig if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes ||
1614a3c0d439SChristoph Hellwig iocb->aio_rw_flags))
1615a3c0d439SChristoph Hellwig return -EINVAL;
1616a11e1d43SLinus Torvalds
161784c4e1f8SLinus Torvalds if (unlikely(!req->file->f_op->fsync))
1618a3c0d439SChristoph Hellwig return -EINVAL;
1619a3c0d439SChristoph Hellwig
1620530f32fcSMiklos Szeredi req->creds = prepare_creds();
1621530f32fcSMiklos Szeredi if (!req->creds)
1622530f32fcSMiklos Szeredi return -ENOMEM;
1623530f32fcSMiklos Szeredi
1624a3c0d439SChristoph Hellwig req->datasync = datasync;
1625a3c0d439SChristoph Hellwig INIT_WORK(&req->work, aio_fsync_work);
1626a3c0d439SChristoph Hellwig schedule_work(&req->work);
16279061d14aSAl Viro return 0;
1628a3c0d439SChristoph Hellwig }
1629a3c0d439SChristoph Hellwig
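/*
 * Illustrative userspace sketch (not kernel code, excluded from the build by
 * the #if 0 guard): submitting the IOCB_CMD_FDSYNC operation handled by
 * aio_fsync() above.  The helper name is an assumption for illustration.
 */
#if 0
#include <linux/aio_abi.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Queue an fdatasync-style flush.  As aio_fsync() enforces, aio_buf,
 * aio_offset, aio_nbytes and aio_rw_flags must all stay zero or the
 * submission fails with -EINVAL.
 */
static int submit_async_fdatasync(aio_context_t ctx, int fd)
{
	struct iocb cb, *cbs[1] = { &cb };

	memset(&cb, 0, sizeof(cb));
	cb.aio_lio_opcode = IOCB_CMD_FDSYNC;	/* IOCB_CMD_FSYNC for a full fsync */
	cb.aio_fildes     = fd;

	return syscall(__NR_io_submit, ctx, 1, cbs) == 1 ? 0 : -1;
}
#endif
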
163001d7a356SJens Axboe static void aio_poll_put_work(struct work_struct *work)
163101d7a356SJens Axboe {
163201d7a356SJens Axboe struct poll_iocb *req = container_of(work, struct poll_iocb, work);
163301d7a356SJens Axboe struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
163401d7a356SJens Axboe
163501d7a356SJens Axboe iocb_put(iocb);
163601d7a356SJens Axboe }
163701d7a356SJens Axboe
163850252e4bSEric Biggers /*
163950252e4bSEric Biggers * Safely lock the waitqueue which the request is on, synchronizing with the
164050252e4bSEric Biggers * case where the ->poll() provider decides to free its waitqueue early.
164150252e4bSEric Biggers *
164250252e4bSEric Biggers * Returns true on success, meaning that req->head->lock was locked, req->wait
164350252e4bSEric Biggers * is on req->head, and an RCU read lock was taken. Returns false if the
164450252e4bSEric Biggers * request was already removed from its waitqueue (which might no longer exist).
164550252e4bSEric Biggers */
164650252e4bSEric Biggers static bool poll_iocb_lock_wq(struct poll_iocb *req)
164750252e4bSEric Biggers {
164850252e4bSEric Biggers wait_queue_head_t *head;
164950252e4bSEric Biggers
165050252e4bSEric Biggers /*
165150252e4bSEric Biggers * While we hold the waitqueue lock and the waitqueue is nonempty,
165250252e4bSEric Biggers * wake_up_pollfree() will wait for us. However, taking the waitqueue
165350252e4bSEric Biggers * lock in the first place can race with the waitqueue being freed.
165450252e4bSEric Biggers *
165550252e4bSEric Biggers * We solve this as eventpoll does: by taking advantage of the fact that
165650252e4bSEric Biggers * all users of wake_up_pollfree() will RCU-delay the actual free. If
165750252e4bSEric Biggers * we enter rcu_read_lock() and see that the pointer to the queue is
165850252e4bSEric Biggers * non-NULL, we can then lock it without the memory being freed out from
165950252e4bSEric Biggers * under us, then check whether the request is still on the queue.
166050252e4bSEric Biggers *
166150252e4bSEric Biggers * Keep holding rcu_read_lock() as long as we hold the queue lock, in
166250252e4bSEric Biggers * case the caller deletes the entry from the queue, leaving it empty.
166350252e4bSEric Biggers * In that case, only RCU prevents the queue memory from being freed.
166450252e4bSEric Biggers */
166550252e4bSEric Biggers rcu_read_lock();
166650252e4bSEric Biggers head = smp_load_acquire(&req->head);
166750252e4bSEric Biggers if (head) {
166850252e4bSEric Biggers spin_lock(&head->lock);
166950252e4bSEric Biggers if (!list_empty(&req->wait.entry))
167050252e4bSEric Biggers return true;
167150252e4bSEric Biggers spin_unlock(&head->lock);
167250252e4bSEric Biggers }
167350252e4bSEric Biggers rcu_read_unlock();
167450252e4bSEric Biggers return false;
167550252e4bSEric Biggers }
167650252e4bSEric Biggers
167750252e4bSEric Biggers static void poll_iocb_unlock_wq(struct poll_iocb *req)
167850252e4bSEric Biggers {
167950252e4bSEric Biggers spin_unlock(&req->head->lock);
168050252e4bSEric Biggers rcu_read_unlock();
168150252e4bSEric Biggers }
168250252e4bSEric Biggers
1683bfe4037eSChristoph Hellwig static void aio_poll_complete_work(struct work_struct *work)
1684bfe4037eSChristoph Hellwig {
1685bfe4037eSChristoph Hellwig struct poll_iocb *req = container_of(work, struct poll_iocb, work);
1686bfe4037eSChristoph Hellwig struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
1687bfe4037eSChristoph Hellwig struct poll_table_struct pt = { ._key = req->events };
1688bfe4037eSChristoph Hellwig struct kioctx *ctx = iocb->ki_ctx;
1689bfe4037eSChristoph Hellwig __poll_t mask = 0;
1690bfe4037eSChristoph Hellwig
1691bfe4037eSChristoph Hellwig if (!READ_ONCE(req->cancelled))
1692bfe4037eSChristoph Hellwig mask = vfs_poll(req->file, &pt) & req->events;
1693bfe4037eSChristoph Hellwig
1694bfe4037eSChristoph Hellwig /*
1695bfe4037eSChristoph Hellwig * Note that ->ki_cancel callers also delete iocb from active_reqs after
1696bfe4037eSChristoph Hellwig * calling ->ki_cancel. We need the ctx_lock roundtrip here to
1697bfe4037eSChristoph Hellwig * synchronize with them. In the cancellation case the list_del_init
1698bfe4037eSChristoph Hellwig * itself is not actually needed, but harmless so we keep it in to
1699bfe4037eSChristoph Hellwig * avoid further branches in the fast path.
1700bfe4037eSChristoph Hellwig */
1701bfe4037eSChristoph Hellwig spin_lock_irq(&ctx->ctx_lock);
170250252e4bSEric Biggers if (poll_iocb_lock_wq(req)) {
1703bfe4037eSChristoph Hellwig if (!mask && !READ_ONCE(req->cancelled)) {
1704363bee27SEric Biggers /*
1705363bee27SEric Biggers * The request isn't actually ready to be completed yet.
1706363bee27SEric Biggers * Reschedule completion if another wakeup came in.
1707363bee27SEric Biggers */
1708363bee27SEric Biggers if (req->work_need_resched) {
1709363bee27SEric Biggers schedule_work(&req->work);
1710363bee27SEric Biggers req->work_need_resched = false;
1711363bee27SEric Biggers } else {
1712363bee27SEric Biggers req->work_scheduled = false;
1713363bee27SEric Biggers }
171450252e4bSEric Biggers poll_iocb_unlock_wq(req);
1715bfe4037eSChristoph Hellwig spin_unlock_irq(&ctx->ctx_lock);
1716bfe4037eSChristoph Hellwig return;
1717bfe4037eSChristoph Hellwig }
1718363bee27SEric Biggers list_del_init(&req->wait.entry);
171950252e4bSEric Biggers poll_iocb_unlock_wq(req);
172050252e4bSEric Biggers } /* else, POLLFREE has freed the waitqueue, so we must complete */
1721bfe4037eSChristoph Hellwig list_del_init(&iocb->ki_list);
1722af5c72b1SAl Viro iocb->ki_res.res = mangle_poll(mask);
1723bfe4037eSChristoph Hellwig spin_unlock_irq(&ctx->ctx_lock);
1724bfe4037eSChristoph Hellwig
1725af5c72b1SAl Viro iocb_put(iocb);
1726bfe4037eSChristoph Hellwig }
1727bfe4037eSChristoph Hellwig
1728bfe4037eSChristoph Hellwig /* assumes we are called with irqs disabled */
1729bfe4037eSChristoph Hellwig static int aio_poll_cancel(struct kiocb *iocb)
1730bfe4037eSChristoph Hellwig {
1731bfe4037eSChristoph Hellwig struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
1732bfe4037eSChristoph Hellwig struct poll_iocb *req = &aiocb->poll;
1733bfe4037eSChristoph Hellwig
173450252e4bSEric Biggers if (poll_iocb_lock_wq(req)) {
1735bfe4037eSChristoph Hellwig WRITE_ONCE(req->cancelled, true);
1736363bee27SEric Biggers if (!req->work_scheduled) {
1737bfe4037eSChristoph Hellwig schedule_work(&aiocb->poll.work);
1738363bee27SEric Biggers req->work_scheduled = true;
1739bfe4037eSChristoph Hellwig }
174050252e4bSEric Biggers poll_iocb_unlock_wq(req);
174150252e4bSEric Biggers } /* else, the request was force-cancelled by POLLFREE already */
1742bfe4037eSChristoph Hellwig
1743bfe4037eSChristoph Hellwig return 0;
1744bfe4037eSChristoph Hellwig }
1745bfe4037eSChristoph Hellwig
1746bfe4037eSChristoph Hellwig static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
1747bfe4037eSChristoph Hellwig void *key)
1748bfe4037eSChristoph Hellwig {
1749bfe4037eSChristoph Hellwig struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
1750e8693bcfSChristoph Hellwig struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
1751bfe4037eSChristoph Hellwig __poll_t mask = key_to_poll(key);
1752d3d6a18dSBart Van Assche unsigned long flags;
1753bfe4037eSChristoph Hellwig
1754bfe4037eSChristoph Hellwig /* for instances that support it check for an event match first: */
1755af5c72b1SAl Viro if (mask && !(mask & req->events))
1756bfe4037eSChristoph Hellwig return 0;
1757bfe4037eSChristoph Hellwig
1758363bee27SEric Biggers /*
1759363bee27SEric Biggers * Complete the request inline if possible. This requires that three
1760363bee27SEric Biggers * conditions be met:
1761363bee27SEric Biggers * 1. An event mask must have been passed. If a plain wakeup was done
1762363bee27SEric Biggers * instead, then mask == 0 and we have to call vfs_poll() to get
1763363bee27SEric Biggers * the events, so inline completion isn't possible.
1764363bee27SEric Biggers * 2. The completion work must not have already been scheduled.
1765363bee27SEric Biggers * 3. ctx_lock must not be busy. We have to use trylock because we
1766363bee27SEric Biggers * already hold the waitqueue lock, so this inverts the normal
1767363bee27SEric Biggers * locking order. Use irqsave/irqrestore because not all
1768363bee27SEric Biggers * filesystems (e.g. fuse) call this function with IRQs disabled,
1769363bee27SEric Biggers * yet IRQs have to be disabled before ctx_lock is obtained.
1770363bee27SEric Biggers */
1771363bee27SEric Biggers if (mask && !req->work_scheduled &&
1772363bee27SEric Biggers spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
177301d7a356SJens Axboe struct kioctx *ctx = iocb->ki_ctx;
177401d7a356SJens Axboe
1775363bee27SEric Biggers list_del_init(&req->wait.entry);
1776e8693bcfSChristoph Hellwig list_del(&iocb->ki_list);
1777af5c72b1SAl Viro iocb->ki_res.res = mangle_poll(mask);
17784b374986SXie Yongji if (iocb->ki_eventfd && !eventfd_signal_allowed()) {
177901d7a356SJens Axboe iocb = NULL;
178001d7a356SJens Axboe INIT_WORK(&req->work, aio_poll_put_work);
178101d7a356SJens Axboe schedule_work(&req->work);
178201d7a356SJens Axboe }
178301d7a356SJens Axboe spin_unlock_irqrestore(&ctx->ctx_lock, flags);
178401d7a356SJens Axboe if (iocb)
1785af5c72b1SAl Viro iocb_put(iocb);
1786af5c72b1SAl Viro } else {
1787363bee27SEric Biggers /*
1788363bee27SEric Biggers * Schedule the completion work if needed. If it was already
1789363bee27SEric Biggers * scheduled, record that another wakeup came in.
1790363bee27SEric Biggers *
1791363bee27SEric Biggers * Don't remove the request from the waitqueue here, as it might
1792363bee27SEric Biggers * not actually be complete yet (we won't know until vfs_poll()
179350252e4bSEric Biggers * is called), and we must not miss any wakeups. POLLFREE is an
179450252e4bSEric Biggers * exception to this; see below.
1795363bee27SEric Biggers */
1796363bee27SEric Biggers if (req->work_scheduled) {
1797363bee27SEric Biggers req->work_need_resched = true;
1798363bee27SEric Biggers } else {
1799bfe4037eSChristoph Hellwig schedule_work(&req->work);
1800363bee27SEric Biggers req->work_scheduled = true;
1801363bee27SEric Biggers }
180250252e4bSEric Biggers
180350252e4bSEric Biggers /*
180450252e4bSEric Biggers * If the waitqueue is being freed early but we can't complete
180550252e4bSEric Biggers * the request inline, we have to tear down the request as best
180650252e4bSEric Biggers * we can. That means immediately removing the request from its
180750252e4bSEric Biggers * waitqueue and preventing all further accesses to the
180850252e4bSEric Biggers * waitqueue via the request. We also need to schedule the
180950252e4bSEric Biggers * completion work (done above). Also mark the request as
181050252e4bSEric Biggers * cancelled, to potentially skip an unneeded call to ->poll().
181150252e4bSEric Biggers */
181250252e4bSEric Biggers if (mask & POLLFREE) {
181350252e4bSEric Biggers WRITE_ONCE(req->cancelled, true);
181450252e4bSEric Biggers list_del_init(&req->wait.entry);
181550252e4bSEric Biggers
181650252e4bSEric Biggers /*
181750252e4bSEric Biggers * Careful: this *must* be the last step, since as soon
181850252e4bSEric Biggers * as req->head is NULL'ed out, the request can be
181950252e4bSEric Biggers * completed and freed, since aio_poll_complete_work()
182050252e4bSEric Biggers * will no longer need to take the waitqueue lock.
182150252e4bSEric Biggers */
182250252e4bSEric Biggers smp_store_release(&req->head, NULL);
182350252e4bSEric Biggers }
1824af5c72b1SAl Viro }
1825bfe4037eSChristoph Hellwig return 1;
1826bfe4037eSChristoph Hellwig }
1827bfe4037eSChristoph Hellwig
1828bfe4037eSChristoph Hellwig struct aio_poll_table {
1829bfe4037eSChristoph Hellwig struct poll_table_struct pt;
1830bfe4037eSChristoph Hellwig struct aio_kiocb *iocb;
183150252e4bSEric Biggers bool queued;
1832bfe4037eSChristoph Hellwig int error;
1833bfe4037eSChristoph Hellwig };
1834bfe4037eSChristoph Hellwig
1835bfe4037eSChristoph Hellwig static void
1836bfe4037eSChristoph Hellwig aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
1837bfe4037eSChristoph Hellwig struct poll_table_struct *p)
1838bfe4037eSChristoph Hellwig {
1839bfe4037eSChristoph Hellwig struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);
1840bfe4037eSChristoph Hellwig
1841bfe4037eSChristoph Hellwig /* multiple wait queues per file are not supported */
184250252e4bSEric Biggers if (unlikely(pt->queued)) {
1843bfe4037eSChristoph Hellwig pt->error = -EINVAL;
1844bfe4037eSChristoph Hellwig return;
1845bfe4037eSChristoph Hellwig }
1846bfe4037eSChristoph Hellwig
184750252e4bSEric Biggers pt->queued = true;
1848bfe4037eSChristoph Hellwig pt->error = 0;
1849bfe4037eSChristoph Hellwig pt->iocb->poll.head = head;
1850bfe4037eSChristoph Hellwig add_wait_queue(head, &pt->iocb->poll.wait);
1851bfe4037eSChristoph Hellwig }
1852bfe4037eSChristoph Hellwig
1853958c13ceSAl Viro static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
1854bfe4037eSChristoph Hellwig {
1855bfe4037eSChristoph Hellwig struct kioctx *ctx = aiocb->ki_ctx;
1856bfe4037eSChristoph Hellwig struct poll_iocb *req = &aiocb->poll;
1857bfe4037eSChristoph Hellwig struct aio_poll_table apt;
1858af5c72b1SAl Viro bool cancel = false;
1859bfe4037eSChristoph Hellwig __poll_t mask;
1860bfe4037eSChristoph Hellwig
1861bfe4037eSChristoph Hellwig /* reject any unknown events outside the normal event mask. */
1862bfe4037eSChristoph Hellwig if ((u16)iocb->aio_buf != iocb->aio_buf)
1863bfe4037eSChristoph Hellwig return -EINVAL;
1864bfe4037eSChristoph Hellwig /* reject fields that are not defined for poll */
1865bfe4037eSChristoph Hellwig if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
1866bfe4037eSChristoph Hellwig return -EINVAL;
1867bfe4037eSChristoph Hellwig
1868bfe4037eSChristoph Hellwig INIT_WORK(&req->work, aio_poll_complete_work);
1869bfe4037eSChristoph Hellwig req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
1870bfe4037eSChristoph Hellwig
18712bc4ca9bSJens Axboe req->head = NULL;
18722bc4ca9bSJens Axboe req->cancelled = false;
1873363bee27SEric Biggers req->work_scheduled = false;
1874363bee27SEric Biggers req->work_need_resched = false;
18752bc4ca9bSJens Axboe
1876bfe4037eSChristoph Hellwig apt.pt._qproc = aio_poll_queue_proc;
1877bfe4037eSChristoph Hellwig apt.pt._key = req->events;
1878bfe4037eSChristoph Hellwig apt.iocb = aiocb;
187950252e4bSEric Biggers apt.queued = false;
1880bfe4037eSChristoph Hellwig apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
1881bfe4037eSChristoph Hellwig
1882bfe4037eSChristoph Hellwig /* initialize the list so that we can do list_empty checks */
1883bfe4037eSChristoph Hellwig INIT_LIST_HEAD(&req->wait.entry);
1884bfe4037eSChristoph Hellwig init_waitqueue_func_entry(&req->wait, aio_poll_wake);
1885bfe4037eSChristoph Hellwig
1886bfe4037eSChristoph Hellwig mask = vfs_poll(req->file, &apt.pt) & req->events;
1887bfe4037eSChristoph Hellwig spin_lock_irq(&ctx->ctx_lock);
188850252e4bSEric Biggers if (likely(apt.queued)) {
188950252e4bSEric Biggers bool on_queue = poll_iocb_lock_wq(req);
189050252e4bSEric Biggers
189150252e4bSEric Biggers if (!on_queue || req->work_scheduled) {
1892363bee27SEric Biggers /*
1893363bee27SEric Biggers * aio_poll_wake() already either scheduled the async
1894363bee27SEric Biggers * completion work, or completed the request inline.
1895363bee27SEric Biggers */
1896363bee27SEric Biggers if (apt.error) /* unsupported case: multiple queues */
1897af5c72b1SAl Viro cancel = true;
1898bfe4037eSChristoph Hellwig apt.error = 0;
1899af5c72b1SAl Viro mask = 0;
1900af5c72b1SAl Viro }
1901af5c72b1SAl Viro if (mask || apt.error) {
1902363bee27SEric Biggers /* Steal to complete synchronously. */
1903bfe4037eSChristoph Hellwig list_del_init(&req->wait.entry);
1904af5c72b1SAl Viro } else if (cancel) {
1905363bee27SEric Biggers /* Cancel if possible (may be too late though). */
1906af5c72b1SAl Viro WRITE_ONCE(req->cancelled, true);
190750252e4bSEric Biggers } else if (on_queue) {
1908363bee27SEric Biggers /*
1909363bee27SEric Biggers * Actually waiting for an event, so add the request to
1910363bee27SEric Biggers * active_reqs so that it can be cancelled if needed.
1911363bee27SEric Biggers */
1912bfe4037eSChristoph Hellwig list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
1913bfe4037eSChristoph Hellwig aiocb->ki_cancel = aio_poll_cancel;
1914bfe4037eSChristoph Hellwig }
191550252e4bSEric Biggers if (on_queue)
191650252e4bSEric Biggers poll_iocb_unlock_wq(req);
1917af5c72b1SAl Viro }
1918af5c72b1SAl Viro if (mask) { /* no async, we'd stolen it */
1919af5c72b1SAl Viro aiocb->ki_res.res = mangle_poll(mask);
1920af5c72b1SAl Viro apt.error = 0;
1921af5c72b1SAl Viro }
1922bfe4037eSChristoph Hellwig spin_unlock_irq(&ctx->ctx_lock);
1923bfe4037eSChristoph Hellwig if (mask)
1924af5c72b1SAl Viro iocb_put(aiocb);
1925af5c72b1SAl Viro return apt.error;
1926bfe4037eSChristoph Hellwig }
1927bfe4037eSChristoph Hellwig
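/*
 * Illustrative userspace sketch (not kernel code, excluded from the build by
 * the #if 0 guard): a one-shot IOCB_CMD_POLL request as handled by aio_poll()
 * above.  The helper name is an assumption for illustration.
 */
#if 0
#include <linux/aio_abi.h>
#include <poll.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * The wanted event mask goes in aio_buf (it must fit in 16 bits), the
 * offset/nbytes/rw_flags fields must stay zero, and the ready mask comes
 * back in io_event.res once the file wakes the request up.
 */
static int submit_oneshot_pollin(aio_context_t ctx, int fd)
{
	struct iocb cb, *cbs[1] = { &cb };

	memset(&cb, 0, sizeof(cb));
	cb.aio_lio_opcode = IOCB_CMD_POLL;
	cb.aio_fildes     = fd;
	cb.aio_buf        = POLLIN;

	return syscall(__NR_io_submit, ctx, 1, cbs) == 1 ? 0 : -1;
}
#endif
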
192888a6f18bSJens Axboe static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
19297316b49cSAl Viro struct iocb __user *user_iocb, struct aio_kiocb *req,
19307316b49cSAl Viro bool compat)
19311da177e4SLinus Torvalds {
193284c4e1f8SLinus Torvalds req->ki_filp = fget(iocb->aio_fildes);
193384c4e1f8SLinus Torvalds if (unlikely(!req->ki_filp))
19347316b49cSAl Viro return -EBADF;
193584c4e1f8SLinus Torvalds
193688a6f18bSJens Axboe if (iocb->aio_flags & IOCB_FLAG_RESFD) {
193774259703SAl Viro struct eventfd_ctx *eventfd;
19389c3060beSDavide Libenzi /*
19399c3060beSDavide Libenzi * If the IOCB_FLAG_RESFD flag of aio_flags is set, grab a
19409c3060beSDavide Libenzi * reference to the eventfd context now. The file descriptor must be
19419c3060beSDavide Libenzi * an eventfd() fd, and will be signaled for each completed
19429c3060beSDavide Libenzi * event using the eventfd_signal() function.
19439c3060beSDavide Libenzi */
194474259703SAl Viro eventfd = eventfd_ctx_fdget(iocb->aio_resfd);
19457316b49cSAl Viro if (IS_ERR(eventfd))
194618bfb9c6SDan Carpenter return PTR_ERR(eventfd);
19477316b49cSAl Viro
194874259703SAl Viro req->ki_eventfd = eventfd;
19499830f4beSGoldwyn Rodrigues }
19509830f4beSGoldwyn Rodrigues
19517316b49cSAl Viro if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) {
1952caf4167aSKent Overstreet pr_debug("EFAULT: aio_key\n");
19537316b49cSAl Viro return -EFAULT;
19541da177e4SLinus Torvalds }
19551da177e4SLinus Torvalds
1956a9339b78SAl Viro req->ki_res.obj = (u64)(unsigned long)user_iocb;
1957a9339b78SAl Viro req->ki_res.data = iocb->aio_data;
1958a9339b78SAl Viro req->ki_res.res = 0;
1959a9339b78SAl Viro req->ki_res.res2 = 0;
19601da177e4SLinus Torvalds
196188a6f18bSJens Axboe switch (iocb->aio_lio_opcode) {
196289319d31SChristoph Hellwig case IOCB_CMD_PREAD:
19637316b49cSAl Viro return aio_read(&req->rw, iocb, false, compat);
196489319d31SChristoph Hellwig case IOCB_CMD_PWRITE:
19657316b49cSAl Viro return aio_write(&req->rw, iocb, false, compat);
196689319d31SChristoph Hellwig case IOCB_CMD_PREADV:
19677316b49cSAl Viro return aio_read(&req->rw, iocb, true, compat);
196889319d31SChristoph Hellwig case IOCB_CMD_PWRITEV:
19697316b49cSAl Viro return aio_write(&req->rw, iocb, true, compat);
1970a3c0d439SChristoph Hellwig case IOCB_CMD_FSYNC:
19717316b49cSAl Viro return aio_fsync(&req->fsync, iocb, false);
1972a3c0d439SChristoph Hellwig case IOCB_CMD_FDSYNC:
19737316b49cSAl Viro return aio_fsync(&req->fsync, iocb, true);
1974bfe4037eSChristoph Hellwig case IOCB_CMD_POLL:
19757316b49cSAl Viro return aio_poll(req, iocb);
197689319d31SChristoph Hellwig default:
197788a6f18bSJens Axboe pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
19787316b49cSAl Viro return -EINVAL;
197989319d31SChristoph Hellwig }
19801da177e4SLinus Torvalds }
19811da177e4SLinus Torvalds
198288a6f18bSJens Axboe static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
198388a6f18bSJens Axboe bool compat)
198488a6f18bSJens Axboe {
19857316b49cSAl Viro struct aio_kiocb *req;
198688a6f18bSJens Axboe struct iocb iocb;
19877316b49cSAl Viro int err;
198888a6f18bSJens Axboe
198988a6f18bSJens Axboe if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
199088a6f18bSJens Axboe return -EFAULT;
199188a6f18bSJens Axboe
19927316b49cSAl Viro /* enforce forwards compatibility on users */
19937316b49cSAl Viro if (unlikely(iocb.aio_reserved2)) {
19947316b49cSAl Viro pr_debug("EINVAL: reserve field set\n");
19957316b49cSAl Viro return -EINVAL;
19967316b49cSAl Viro }
19977316b49cSAl Viro
19987316b49cSAl Viro /* prevent overflows */
19997316b49cSAl Viro if (unlikely(
20007316b49cSAl Viro (iocb.aio_buf != (unsigned long)iocb.aio_buf) ||
20017316b49cSAl Viro (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) ||
20027316b49cSAl Viro ((ssize_t)iocb.aio_nbytes < 0)
20037316b49cSAl Viro )) {
20047316b49cSAl Viro pr_debug("EINVAL: overflow check\n");
20057316b49cSAl Viro return -EINVAL;
20067316b49cSAl Viro }
20077316b49cSAl Viro
20087316b49cSAl Viro req = aio_get_req(ctx);
20097316b49cSAl Viro if (unlikely(!req))
20107316b49cSAl Viro return -EAGAIN;
20117316b49cSAl Viro
20127316b49cSAl Viro err = __io_submit_one(ctx, &iocb, user_iocb, req, compat);
20137316b49cSAl Viro
20147316b49cSAl Viro /* Done with the synchronous reference */
20157316b49cSAl Viro iocb_put(req);
20167316b49cSAl Viro
20177316b49cSAl Viro /*
20187316b49cSAl Viro * If err is 0, we'd either done aio_complete() ourselves or have
20197316b49cSAl Viro * arranged for that to be done asynchronously. Anything non-zero
20207316b49cSAl Viro * means that we need to destroy req ourselves.
20217316b49cSAl Viro */
20227316b49cSAl Viro if (unlikely(err)) {
20237316b49cSAl Viro iocb_destroy(req);
20247316b49cSAl Viro put_reqs_available(ctx, 1);
20257316b49cSAl Viro }
20267316b49cSAl Viro return err;
202788a6f18bSJens Axboe }
202888a6f18bSJens Axboe
20299d85cba7SJeff Moyer /* sys_io_submit:
20309d85cba7SJeff Moyer * Queue the nr iocbs pointed to by iocbpp for processing. Returns
20319d85cba7SJeff Moyer * the number of iocbs queued. May return -EINVAL if the aio_context
20329d85cba7SJeff Moyer * specified by ctx_id is invalid, if nr is < 0, if the iocb at
20339d85cba7SJeff Moyer * *iocbpp[0] is not properly initialized, or if the operation specified
20349d85cba7SJeff Moyer * is invalid for the file descriptor in the iocb. May fail with
20359d85cba7SJeff Moyer * -EFAULT if any of the data structures point to invalid data. May
20369d85cba7SJeff Moyer * fail with -EBADF if the file descriptor specified in the first
20379d85cba7SJeff Moyer * iocb is invalid. May fail with -EAGAIN if insufficient resources
20389d85cba7SJeff Moyer * are available to queue any iocbs. Will return 0 if nr is 0. Will
20399d85cba7SJeff Moyer * fail with -ENOSYS if not implemented.
20409d85cba7SJeff Moyer */
20419d85cba7SJeff Moyer SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
20429d85cba7SJeff Moyer struct iocb __user * __user *, iocbpp)
20439d85cba7SJeff Moyer {
204467ba049fSAl Viro struct kioctx *ctx;
204567ba049fSAl Viro long ret = 0;
204667ba049fSAl Viro int i = 0;
204767ba049fSAl Viro struct blk_plug plug;
204867ba049fSAl Viro
204967ba049fSAl Viro if (unlikely(nr < 0))
205067ba049fSAl Viro return -EINVAL;
205167ba049fSAl Viro
205267ba049fSAl Viro ctx = lookup_ioctx(ctx_id);
205367ba049fSAl Viro if (unlikely(!ctx)) {
205467ba049fSAl Viro pr_debug("EINVAL: invalid context id\n");
205567ba049fSAl Viro return -EINVAL;
205667ba049fSAl Viro }
205767ba049fSAl Viro
20581da92779SAl Viro if (nr > ctx->nr_events)
20591da92779SAl Viro nr = ctx->nr_events;
20601da92779SAl Viro
2061a79d40e9SJens Axboe if (nr > AIO_PLUG_THRESHOLD)
206267ba049fSAl Viro blk_start_plug(&plug);
206367ba049fSAl Viro for (i = 0; i < nr; i++) {
206467ba049fSAl Viro struct iocb __user *user_iocb;
206567ba049fSAl Viro
206667ba049fSAl Viro if (unlikely(get_user(user_iocb, iocbpp + i))) {
206767ba049fSAl Viro ret = -EFAULT;
206867ba049fSAl Viro break;
206967ba049fSAl Viro }
207067ba049fSAl Viro
207167ba049fSAl Viro ret = io_submit_one(ctx, user_iocb, false);
207267ba049fSAl Viro if (ret)
207367ba049fSAl Viro break;
207467ba049fSAl Viro }
2075a79d40e9SJens Axboe if (nr > AIO_PLUG_THRESHOLD)
207667ba049fSAl Viro blk_finish_plug(&plug);
207767ba049fSAl Viro
207867ba049fSAl Viro percpu_ref_put(&ctx->users);
207967ba049fSAl Viro return i ? i : ret;
20809d85cba7SJeff Moyer }
20819d85cba7SJeff Moyer
2082c00d2c7eSAl Viro #ifdef CONFIG_COMPAT
2083c00d2c7eSAl Viro COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
208467ba049fSAl Viro int, nr, compat_uptr_t __user *, iocbpp)
2085c00d2c7eSAl Viro {
208667ba049fSAl Viro struct kioctx *ctx;
208767ba049fSAl Viro long ret = 0;
208867ba049fSAl Viro int i = 0;
208967ba049fSAl Viro struct blk_plug plug;
2090c00d2c7eSAl Viro
2091c00d2c7eSAl Viro if (unlikely(nr < 0))
2092c00d2c7eSAl Viro return -EINVAL;
2093c00d2c7eSAl Viro
209467ba049fSAl Viro ctx = lookup_ioctx(ctx_id);
209567ba049fSAl Viro if (unlikely(!ctx)) {
209667ba049fSAl Viro pr_debug("EINVAL: invalid context id\n");
209767ba049fSAl Viro return -EINVAL;
209867ba049fSAl Viro }
209967ba049fSAl Viro
21001da92779SAl Viro if (nr > ctx->nr_events)
21011da92779SAl Viro nr = ctx->nr_events;
21021da92779SAl Viro
2103a79d40e9SJens Axboe if (nr > AIO_PLUG_THRESHOLD)
210467ba049fSAl Viro blk_start_plug(&plug);
210567ba049fSAl Viro for (i = 0; i < nr; i++) {
210667ba049fSAl Viro compat_uptr_t user_iocb;
210767ba049fSAl Viro
210867ba049fSAl Viro if (unlikely(get_user(user_iocb, iocbpp + i))) {
210967ba049fSAl Viro ret = -EFAULT;
211067ba049fSAl Viro break;
211167ba049fSAl Viro }
211267ba049fSAl Viro
211367ba049fSAl Viro ret = io_submit_one(ctx, compat_ptr(user_iocb), true);
211467ba049fSAl Viro if (ret)
211567ba049fSAl Viro break;
211667ba049fSAl Viro }
2117a79d40e9SJens Axboe if (nr > AIO_PLUG_THRESHOLD)
211867ba049fSAl Viro blk_finish_plug(&plug);
211967ba049fSAl Viro
212067ba049fSAl Viro percpu_ref_put(&ctx->users);
212167ba049fSAl Viro return i ? i : ret;
2122c00d2c7eSAl Viro }
2123c00d2c7eSAl Viro #endif
2124c00d2c7eSAl Viro
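/*
 * Illustrative userspace sketch (not kernel code, excluded from the build by
 * the #if 0 guard): queueing a single vectored write through io_submit() as
 * documented above.  The helper name and the tag convention are assumptions
 * for illustration.
 */
#if 0
#include <linux/aio_abi.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

/*
 * For IOCB_CMD_PREADV/PWRITEV, aio_buf points at the iovec array and
 * aio_nbytes is the number of iovecs (see aio_setup_rw() above).  The data
 * buffers referenced by iov must stay valid until the completion is reaped.
 */
static long submit_pwritev(aio_context_t ctx, int fd, const struct iovec *iov,
			   int iovcnt, int64_t off, uint64_t tag)
{
	struct iocb cb, *cbs[1] = { &cb };

	memset(&cb, 0, sizeof(cb));
	cb.aio_lio_opcode = IOCB_CMD_PWRITEV;
	cb.aio_fildes     = fd;
	cb.aio_buf        = (uintptr_t)iov;
	cb.aio_nbytes     = iovcnt;
	cb.aio_offset     = off;
	cb.aio_data       = tag;	/* echoed back in io_event.data */

	/* Returns the number of iocbs accepted (1) or -1 with errno set. */
	return syscall(__NR_io_submit, ctx, 1, cbs);
}
#endif
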
21251da177e4SLinus Torvalds /* sys_io_cancel:
21261da177e4SLinus Torvalds * Attempts to cancel an iocb previously passed to io_submit. If
21271da177e4SLinus Torvalds * the operation is successfully cancelled, the resulting event is
21281da177e4SLinus Torvalds * copied into the memory pointed to by result without being placed
21291da177e4SLinus Torvalds * into the completion queue and 0 is returned. May fail with
21301da177e4SLinus Torvalds * -EFAULT if any of the data structures pointed to are invalid.
21311da177e4SLinus Torvalds * May fail with -EINVAL if aio_context specified by ctx_id is
21321da177e4SLinus Torvalds * invalid. May fail with -EAGAIN if the iocb specified was not
21331da177e4SLinus Torvalds * cancelled. Will fail with -ENOSYS if not implemented.
21341da177e4SLinus Torvalds */
2135002c8976SHeiko Carstens SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
2136002c8976SHeiko Carstens struct io_event __user *, result)
21371da177e4SLinus Torvalds {
21381da177e4SLinus Torvalds struct kioctx *ctx;
213904b2fa9fSChristoph Hellwig struct aio_kiocb *kiocb;
2140888933f8SChristoph Hellwig int ret = -EINVAL;
21411da177e4SLinus Torvalds u32 key;
2142a9339b78SAl Viro u64 obj = (u64)(unsigned long)iocb;
21431da177e4SLinus Torvalds
2144f3a2752aSChristoph Hellwig if (unlikely(get_user(key, &iocb->aio_key)))
21451da177e4SLinus Torvalds return -EFAULT;
2146f3a2752aSChristoph Hellwig if (unlikely(key != KIOCB_KEY))
2147f3a2752aSChristoph Hellwig return -EINVAL;
21481da177e4SLinus Torvalds
21491da177e4SLinus Torvalds ctx = lookup_ioctx(ctx_id);
21501da177e4SLinus Torvalds if (unlikely(!ctx))
21511da177e4SLinus Torvalds return -EINVAL;
21521da177e4SLinus Torvalds
21531da177e4SLinus Torvalds spin_lock_irq(&ctx->ctx_lock);
2154833f4154SAl Viro /* TODO: use a hash or array, this sucks. */
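	/* ki_res.obj was set to the userspace iocb pointer at submission, so match it against 'obj'. */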
2155833f4154SAl Viro list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
2156a9339b78SAl Viro if (kiocb->ki_res.obj == obj) {
2157888933f8SChristoph Hellwig ret = kiocb->ki_cancel(&kiocb->rw);
2158888933f8SChristoph Hellwig list_del_init(&kiocb->ki_list);
2159833f4154SAl Viro break;
2160833f4154SAl Viro }
2161888933f8SChristoph Hellwig }
21621da177e4SLinus Torvalds spin_unlock_irq(&ctx->ctx_lock);
21631da177e4SLinus Torvalds
21641da177e4SLinus Torvalds if (!ret) {
2165bec68faaSKent Overstreet /*
2166bec68faaSKent Overstreet * The result argument is no longer used - the io_event is
2167bec68faaSKent Overstreet * always delivered via the ring buffer. -EINPROGRESS indicates
2168bec68faaSKent Overstreet * cancellation is in progress:
21691da177e4SLinus Torvalds */
2170bec68faaSKent Overstreet ret = -EINPROGRESS;
21711da177e4SLinus Torvalds }
21721da177e4SLinus Torvalds
2173723be6e3SKent Overstreet percpu_ref_put(&ctx->users);
21741da177e4SLinus Torvalds
21751da177e4SLinus Torvalds return ret;
21761da177e4SLinus Torvalds }
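/*
 * Illustrative userspace sketch of the cancellation interface above.  The
 * iocb pointer must be the very pointer that was passed to io_submit() (it
 * is matched via ki_res.obj), and the result argument is ignored by the
 * kernel even though the ABI still requires it.  In practice only requests
 * that registered a cancel callback (e.g. poll) appear on the active list;
 * everything else gets -EINVAL.  'ctx' and 'cb' are the placeholders carried
 * over from the earlier io_submit() sketch.
 *
 *	#include <linux/aio_abi.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <errno.h>
 *
 *	static int cancel_one(aio_context_t ctx, struct iocb *cb)
 *	{
 *		struct io_event dummy;	// required by the ABI, unused by the kernel
 *
 *		if (syscall(__NR_io_cancel, ctx, cb, &dummy) == 0)
 *			return 0;	// not reached with this implementation
 *		if (errno == EINPROGRESS)
 *			return 0;	// cancellation started; the completion
 *					// still arrives through the event ring
 *		return -errno;		// EINVAL: not cancellable or already done
 *	}
 */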
21771da177e4SLinus Torvalds
2178fa2e62a5SDeepa Dinamani static long do_io_getevents(aio_context_t ctx_id,
2179fa2e62a5SDeepa Dinamani long min_nr,
2180fa2e62a5SDeepa Dinamani long nr,
2181fa2e62a5SDeepa Dinamani struct io_event __user *events,
2182fa2e62a5SDeepa Dinamani struct timespec64 *ts)
2183fa2e62a5SDeepa Dinamani {
2184fa2e62a5SDeepa Dinamani ktime_t until = ts ? timespec64_to_ktime(*ts) : KTIME_MAX;
2185fa2e62a5SDeepa Dinamani struct kioctx *ioctx = lookup_ioctx(ctx_id);
2186fa2e62a5SDeepa Dinamani long ret = -EINVAL;
2187fa2e62a5SDeepa Dinamani
2188fa2e62a5SDeepa Dinamani if (likely(ioctx)) {
2189fa2e62a5SDeepa Dinamani if (likely(min_nr <= nr && min_nr >= 0))
2190fa2e62a5SDeepa Dinamani ret = read_events(ioctx, min_nr, nr, events, until);
2191fa2e62a5SDeepa Dinamani percpu_ref_put(&ioctx->users);
2192fa2e62a5SDeepa Dinamani }
2193fa2e62a5SDeepa Dinamani
2194fa2e62a5SDeepa Dinamani return ret;
2195fa2e62a5SDeepa Dinamani }
2196fa2e62a5SDeepa Dinamani
21971da177e4SLinus Torvalds /* io_getevents:
21981da177e4SLinus Torvalds * Attempts to read at least min_nr events and up to nr events from
2199642b5123SSatoru Takeuchi * the completion queue for the aio_context specified by ctx_id. If
2200642b5123SSatoru Takeuchi * it succeeds, the number of read events is returned. May fail with
2201642b5123SSatoru Takeuchi * -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is
2202642b5123SSatoru Takeuchi * out of range, if timeout is out of range. May fail with -EFAULT
2203642b5123SSatoru Takeuchi * if any of the memory specified is invalid. May return 0 or
2204642b5123SSatoru Takeuchi * < min_nr if the timeout specified by timeout has elapsed
2205642b5123SSatoru Takeuchi * before sufficient events are available, where timeout == NULL
2206642b5123SSatoru Takeuchi * specifies an infinite timeout. Note that the timeout pointed to by
22076900807cSJeff Moyer * timeout is relative. Will fail with -ENOSYS if not implemented.
22081da177e4SLinus Torvalds */
22093ca47e95SArnd Bergmann #ifdef CONFIG_64BIT
22107a35397fSDeepa Dinamani
2211002c8976SHeiko Carstens SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
2212002c8976SHeiko Carstens long, min_nr,
2213002c8976SHeiko Carstens long, nr,
2214002c8976SHeiko Carstens struct io_event __user *, events,
22157a35397fSDeepa Dinamani struct __kernel_timespec __user *, timeout)
22161da177e4SLinus Torvalds {
2217fa2e62a5SDeepa Dinamani struct timespec64 ts;
22187a074e96SChristoph Hellwig int ret;
22191da177e4SLinus Torvalds
22207a074e96SChristoph Hellwig if (timeout && unlikely(get_timespec64(&ts, timeout)))
2221fa2e62a5SDeepa Dinamani return -EFAULT;
22227a074e96SChristoph Hellwig
22237a074e96SChristoph Hellwig ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
22247a074e96SChristoph Hellwig if (!ret && signal_pending(current))
22257a074e96SChristoph Hellwig ret = -EINTR;
22267a074e96SChristoph Hellwig return ret;
22271da177e4SLinus Torvalds }
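/*
 * Illustrative userspace sketch of reaping completions through the syscall
 * above: wait for at least one event, accept up to eight, and give up after
 * a relative two-second timeout (timeout == NULL would block indefinitely).
 * The context is assumed to have been created with io_setup() and populated
 * by io_submit() as in the earlier sketches; a return of 0 simply means the
 * timeout elapsed first.
 *
 *	#include <linux/aio_abi.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *	#include <time.h>
 *
 *	static void reap(aio_context_t ctx)
 *	{
 *		struct io_event events[8];
 *		struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
 *		long i, n;
 *
 *		n = syscall(__NR_io_getevents, ctx, 1, 8, events, &ts);
 *		for (i = 0; i < n; i++)
 *			printf("iocb %#llx: res=%lld res2=%lld\n",
 *			       (unsigned long long)events[i].obj,
 *			       (long long)events[i].res,
 *			       (long long)events[i].res2);
 *	}
 */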
2228fa2e62a5SDeepa Dinamani
22297a35397fSDeepa Dinamani #endif
22307a35397fSDeepa Dinamani
22319ba546c0SChristoph Hellwig struct __aio_sigset {
22329ba546c0SChristoph Hellwig const sigset_t __user *sigmask;
22339ba546c0SChristoph Hellwig size_t sigsetsize;
22349ba546c0SChristoph Hellwig };
22359ba546c0SChristoph Hellwig
22367a074e96SChristoph Hellwig SYSCALL_DEFINE6(io_pgetevents,
22377a074e96SChristoph Hellwig aio_context_t, ctx_id,
22387a074e96SChristoph Hellwig long, min_nr,
22397a074e96SChristoph Hellwig long, nr,
22407a074e96SChristoph Hellwig struct io_event __user *, events,
22417a35397fSDeepa Dinamani struct __kernel_timespec __user *, timeout,
22427a074e96SChristoph Hellwig const struct __aio_sigset __user *, usig)
22437a074e96SChristoph Hellwig {
22447a074e96SChristoph Hellwig struct __aio_sigset ksig = { NULL, };
22457a074e96SChristoph Hellwig struct timespec64 ts;
224697abc889SOleg Nesterov bool interrupted;
22477a074e96SChristoph Hellwig int ret;
22487a074e96SChristoph Hellwig
22497a074e96SChristoph Hellwig if (timeout && unlikely(get_timespec64(&ts, timeout)))
22507a074e96SChristoph Hellwig return -EFAULT;
22517a074e96SChristoph Hellwig
22527a074e96SChristoph Hellwig if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
22537a074e96SChristoph Hellwig return -EFAULT;
22547a074e96SChristoph Hellwig
2255b772434bSOleg Nesterov ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
22567a35397fSDeepa Dinamani if (ret)
22577a35397fSDeepa Dinamani return ret;
22587a074e96SChristoph Hellwig
22597a074e96SChristoph Hellwig ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
226097abc889SOleg Nesterov
226197abc889SOleg Nesterov interrupted = signal_pending(current);
2262b772434bSOleg Nesterov restore_saved_sigmask_unless(interrupted);
226397abc889SOleg Nesterov if (interrupted && !ret)
22647a074e96SChristoph Hellwig ret = -ERESTARTNOHAND;
22657a074e96SChristoph Hellwig
22667a074e96SChristoph Hellwig return ret;
22671da177e4SLinus Torvalds }
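/*
 * Illustrative userspace sketch of the signal-aware variant above: like
 * ppoll() or pselect(), io_pgetevents() installs a temporary signal mask for
 * the duration of the wait and restores it afterwards, so a signal that is
 * normally blocked can still interrupt the sleep.  There is no glibc
 * wrapper, the sigset struct is declared locally to mirror the kernel's
 * struct __aio_sigset above, and the 8-byte kernel sigset size as well as
 * __NR_io_pgetevents are assumptions that hold on x86-64 and most other
 * 64-bit ABIs.
 *
 *	#include <linux/aio_abi.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <signal.h>
 *	#include <time.h>
 *
 *	struct aio_sigset {		// mirrors the kernel's struct __aio_sigset
 *		const sigset_t *sigmask;
 *		size_t sigsetsize;
 *	};
 *
 *	static long wait_events(aio_context_t ctx, struct io_event *evs,
 *				long nr, struct timespec *ts)
 *	{
 *		sigset_t allow;
 *		struct aio_sigset as;
 *
 *		sigfillset(&allow);
 *		sigdelset(&allow, SIGINT);	// assumes a SIGINT handler is installed
 *
 *		as.sigmask    = &allow;
 *		as.sigsetsize = 8;	// kernel sigset size (_NSIG / 8), not sizeof(sigset_t)
 *
 *		// A signal arriving before any event was reaped surfaces as
 *		// EINTR (the kernel's -ERESTARTNOHAND after the saved mask
 *		// has been restored).
 *		return syscall(__NR_io_pgetevents, ctx, 1, nr, evs, ts, &as);
 *	}
 */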
2268c00d2c7eSAl Viro
22697a35397fSDeepa Dinamani #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT)
22707a35397fSDeepa Dinamani
22717a35397fSDeepa Dinamani SYSCALL_DEFINE6(io_pgetevents_time32,
22727a35397fSDeepa Dinamani aio_context_t, ctx_id,
22737a35397fSDeepa Dinamani long, min_nr,
22747a35397fSDeepa Dinamani long, nr,
22757a35397fSDeepa Dinamani struct io_event __user *, events,
22767a35397fSDeepa Dinamani struct old_timespec32 __user *, timeout,
22777a35397fSDeepa Dinamani const struct __aio_sigset __user *, usig)
22787a35397fSDeepa Dinamani {
22797a35397fSDeepa Dinamani struct __aio_sigset ksig = { NULL, };
22807a35397fSDeepa Dinamani struct timespec64 ts;
228197abc889SOleg Nesterov bool interrupted;
22827a35397fSDeepa Dinamani int ret;
22837a35397fSDeepa Dinamani
22847a35397fSDeepa Dinamani if (timeout && unlikely(get_old_timespec32(&ts, timeout)))
22857a35397fSDeepa Dinamani return -EFAULT;
22867a35397fSDeepa Dinamani
22877a35397fSDeepa Dinamani if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
22887a35397fSDeepa Dinamani return -EFAULT;
22897a35397fSDeepa Dinamani
2291b772434bSOleg Nesterov ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
2292ded653ccSDeepa Dinamani if (ret)
2293ded653ccSDeepa Dinamani return ret;
22941da177e4SLinus Torvalds
22951da177e4SLinus Torvalds ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
229697abc889SOleg Nesterov
229797abc889SOleg Nesterov interrupted = signal_pending(current);
2298b772434bSOleg Nesterov restore_saved_sigmask_unless(interrupted);
229997abc889SOleg Nesterov if (interrupted && !ret)
23001da177e4SLinus Torvalds ret = -ERESTARTNOHAND;
23011da177e4SLinus Torvalds
23021da177e4SLinus Torvalds return ret;
23031da177e4SLinus Torvalds }
2304c00d2c7eSAl Viro
23057a35397fSDeepa Dinamani #endif
23067a35397fSDeepa Dinamani
23077a35397fSDeepa Dinamani #if defined(CONFIG_COMPAT_32BIT_TIME)
23087a35397fSDeepa Dinamani
23098dabe724SArnd Bergmann SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id,
23108dabe724SArnd Bergmann __s32, min_nr,
23118dabe724SArnd Bergmann __s32, nr,
2312c00d2c7eSAl Viro struct io_event __user *, events,
23139afc5eeeSArnd Bergmann struct old_timespec32 __user *, timeout)
2314c00d2c7eSAl Viro {
2315fa2e62a5SDeepa Dinamani struct timespec64 t;
23167a074e96SChristoph Hellwig int ret;
2317c00d2c7eSAl Viro
23189afc5eeeSArnd Bergmann if (timeout && get_old_timespec32(&t, timeout))
2319c00d2c7eSAl Viro return -EFAULT;
2320c00d2c7eSAl Viro
23217a074e96SChristoph Hellwig ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
23227a074e96SChristoph Hellwig if (!ret && signal_pending(current))
23237a074e96SChristoph Hellwig ret = -EINTR;
23247a074e96SChristoph Hellwig return ret;
2325c00d2c7eSAl Viro }
2326fa2e62a5SDeepa Dinamani
23277a35397fSDeepa Dinamani #endif
23287a35397fSDeepa Dinamani
23297a35397fSDeepa Dinamani #ifdef CONFIG_COMPAT
23307a074e96SChristoph Hellwig
23317a074e96SChristoph Hellwig struct __compat_aio_sigset {
233297eba80fSGuillem Jover compat_uptr_t sigmask;
23337a074e96SChristoph Hellwig compat_size_t sigsetsize;
23347a074e96SChristoph Hellwig };
23357a074e96SChristoph Hellwig
23367a35397fSDeepa Dinamani #if defined(CONFIG_COMPAT_32BIT_TIME)
23377a35397fSDeepa Dinamani
23387a074e96SChristoph Hellwig COMPAT_SYSCALL_DEFINE6(io_pgetevents,
23397a074e96SChristoph Hellwig compat_aio_context_t, ctx_id,
23407a074e96SChristoph Hellwig compat_long_t, min_nr,
23417a074e96SChristoph Hellwig compat_long_t, nr,
23427a074e96SChristoph Hellwig struct io_event __user *, events,
23439afc5eeeSArnd Bergmann struct old_timespec32 __user *, timeout,
23447a074e96SChristoph Hellwig const struct __compat_aio_sigset __user *, usig)
23457a074e96SChristoph Hellwig {
234697eba80fSGuillem Jover struct __compat_aio_sigset ksig = { 0, };
23477a074e96SChristoph Hellwig struct timespec64 t;
234897abc889SOleg Nesterov bool interrupted;
23497a074e96SChristoph Hellwig int ret;
23507a074e96SChristoph Hellwig
23519afc5eeeSArnd Bergmann if (timeout && get_old_timespec32(&t, timeout))
23527a074e96SChristoph Hellwig return -EFAULT;
23537a074e96SChristoph Hellwig
23547a074e96SChristoph Hellwig if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
23557a074e96SChristoph Hellwig return -EFAULT;
23567a074e96SChristoph Hellwig
235797eba80fSGuillem Jover ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize);
2358ded653ccSDeepa Dinamani if (ret)
2359ded653ccSDeepa Dinamani return ret;
23607a074e96SChristoph Hellwig
23617a074e96SChristoph Hellwig ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
236297abc889SOleg Nesterov
236397abc889SOleg Nesterov interrupted = signal_pending(current);
2364b772434bSOleg Nesterov restore_saved_sigmask_unless(interrupted);
236597abc889SOleg Nesterov if (interrupted && !ret)
23667a074e96SChristoph Hellwig ret = -ERESTARTNOHAND;
23677a074e96SChristoph Hellwig
23687a074e96SChristoph Hellwig return ret;
23691da177e4SLinus Torvalds }
23701da177e4SLinus Torvalds
23717a35397fSDeepa Dinamani #endif
23727a35397fSDeepa Dinamani
23737a35397fSDeepa Dinamani COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
23747a35397fSDeepa Dinamani compat_aio_context_t, ctx_id,
23757a35397fSDeepa Dinamani compat_long_t, min_nr,
23767a35397fSDeepa Dinamani compat_long_t, nr,
23777a35397fSDeepa Dinamani struct io_event __user *, events,
23787a35397fSDeepa Dinamani struct __kernel_timespec __user *, timeout,
23797a35397fSDeepa Dinamani const struct __compat_aio_sigset __user *, usig)
23807a35397fSDeepa Dinamani {
238197eba80fSGuillem Jover struct __compat_aio_sigset ksig = { 0, };
23827a35397fSDeepa Dinamani struct timespec64 t;
238397abc889SOleg Nesterov bool interrupted;
23847a35397fSDeepa Dinamani int ret;
23857a35397fSDeepa Dinamani
23867a35397fSDeepa Dinamani if (timeout && get_timespec64(&t, timeout))
23877a35397fSDeepa Dinamani return -EFAULT;
23887a35397fSDeepa Dinamani
23897a35397fSDeepa Dinamani if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
23907a35397fSDeepa Dinamani return -EFAULT;
23917a35397fSDeepa Dinamani
239297eba80fSGuillem Jover ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize);
23937a35397fSDeepa Dinamani if (ret)
23947a35397fSDeepa Dinamani return ret;
23957a35397fSDeepa Dinamani
23967a35397fSDeepa Dinamani ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
239797abc889SOleg Nesterov
239897abc889SOleg Nesterov interrupted = signal_pending(current);
2399b772434bSOleg Nesterov restore_saved_sigmask_unless(interrupted);
240097abc889SOleg Nesterov if (interrupted && !ret)
24017a35397fSDeepa Dinamani ret = -ERESTARTNOHAND;
24027a35397fSDeepa Dinamani
24031da177e4SLinus Torvalds return ret;
24041da177e4SLinus Torvalds }
24051da177e4SLinus Torvalds #endif