xref: /openbmc/linux/fs/aio.c (revision 46eeaa11bdd1bc9e077bdf741d32ca7235d263c6)
11da177e4SLinus Torvalds /*
21da177e4SLinus Torvalds  *	An async IO implementation for Linux
31da177e4SLinus Torvalds  *	Written by Benjamin LaHaise <bcrl@kvack.org>
41da177e4SLinus Torvalds  *
51da177e4SLinus Torvalds  *	Implements an efficient asynchronous io interface.
61da177e4SLinus Torvalds  *
71da177e4SLinus Torvalds  *	Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved.
8bfe4037eSChristoph Hellwig  *	Copyright 2018 Christoph Hellwig.
91da177e4SLinus Torvalds  *
101da177e4SLinus Torvalds  *	See ../COPYING for licensing terms.
111da177e4SLinus Torvalds  */
12caf4167aSKent Overstreet #define pr_fmt(fmt) "%s: " fmt, __func__
13caf4167aSKent Overstreet 
141da177e4SLinus Torvalds #include <linux/kernel.h>
151da177e4SLinus Torvalds #include <linux/init.h>
161da177e4SLinus Torvalds #include <linux/errno.h>
171da177e4SLinus Torvalds #include <linux/time.h>
181da177e4SLinus Torvalds #include <linux/aio_abi.h>
19630d9c47SPaul Gortmaker #include <linux/export.h>
201da177e4SLinus Torvalds #include <linux/syscalls.h>
21b9d128f1SJens Axboe #include <linux/backing-dev.h>
229018ccc4SChristoph Hellwig #include <linux/refcount.h>
23027445c3SBadari Pulavarty #include <linux/uio.h>
241da177e4SLinus Torvalds 
25174cd4b1SIngo Molnar #include <linux/sched/signal.h>
261da177e4SLinus Torvalds #include <linux/fs.h>
271da177e4SLinus Torvalds #include <linux/file.h>
281da177e4SLinus Torvalds #include <linux/mm.h>
291da177e4SLinus Torvalds #include <linux/mman.h>
30e1bdd5f2SKent Overstreet #include <linux/percpu.h>
311da177e4SLinus Torvalds #include <linux/slab.h>
321da177e4SLinus Torvalds #include <linux/timer.h>
331da177e4SLinus Torvalds #include <linux/aio.h>
341da177e4SLinus Torvalds #include <linux/highmem.h>
351da177e4SLinus Torvalds #include <linux/workqueue.h>
361da177e4SLinus Torvalds #include <linux/security.h>
379c3060beSDavide Libenzi #include <linux/eventfd.h>
38cfb1e33eSJeff Moyer #include <linux/blkdev.h>
399d85cba7SJeff Moyer #include <linux/compat.h>
4036bc08ccSGu Zheng #include <linux/migrate.h>
4136bc08ccSGu Zheng #include <linux/ramfs.h>
42723be6e3SKent Overstreet #include <linux/percpu-refcount.h>
4371ad7490SBenjamin LaHaise #include <linux/mount.h>
4452db59dfSDavid Howells #include <linux/pseudo_fs.h>
451da177e4SLinus Torvalds 
467c0f6ba6SLinus Torvalds #include <linux/uaccess.h>
47a538e3ffSJeff Moyer #include <linux/nospec.h>
481da177e4SLinus Torvalds 
4968d70d03SAl Viro #include "internal.h"
5068d70d03SAl Viro 
51f3a2752aSChristoph Hellwig #define KIOCB_KEY		0
52f3a2752aSChristoph Hellwig 
534e179bcaSKent Overstreet #define AIO_RING_MAGIC			0xa10a10a1
544e179bcaSKent Overstreet #define AIO_RING_COMPAT_FEATURES	1
554e179bcaSKent Overstreet #define AIO_RING_INCOMPAT_FEATURES	0
564e179bcaSKent Overstreet struct aio_ring {
574e179bcaSKent Overstreet 	unsigned	id;	/* kernel internal index number */
584e179bcaSKent Overstreet 	unsigned	nr;	/* number of io_events */
59fa8a53c3SBenjamin LaHaise 	unsigned	head;	/* Written to by userland or under ring_lock
60fa8a53c3SBenjamin LaHaise 				 * mutex by aio_read_events_ring(). */
614e179bcaSKent Overstreet 	unsigned	tail;
624e179bcaSKent Overstreet 
634e179bcaSKent Overstreet 	unsigned	magic;
644e179bcaSKent Overstreet 	unsigned	compat_features;
654e179bcaSKent Overstreet 	unsigned	incompat_features;
664e179bcaSKent Overstreet 	unsigned	header_length;	/* size of aio_ring */
674e179bcaSKent Overstreet 
684e179bcaSKent Overstreet 
69241cb28eSGustavo A. R. Silva 	struct io_event		io_events[];
704e179bcaSKent Overstreet }; /* 128 bytes + ring size */
714e179bcaSKent Overstreet 
72a79d40e9SJens Axboe /*
73a79d40e9SJens Axboe  * Plugging is meant to work with larger batches of IOs. If we don't
74a79d40e9SJens Axboe  * have more than the below, then don't bother setting up a plug.
75a79d40e9SJens Axboe  */
76a79d40e9SJens Axboe #define AIO_PLUG_THRESHOLD	2
77a79d40e9SJens Axboe 
784e179bcaSKent Overstreet #define AIO_RING_PAGES	8
794e179bcaSKent Overstreet 
80db446a08SBenjamin LaHaise struct kioctx_table {
81db446a08SBenjamin LaHaise 	struct rcu_head		rcu;
82db446a08SBenjamin LaHaise 	unsigned		nr;
83db7fcc88SKees Cook 	struct kioctx __rcu	*table[] __counted_by(nr);
84db446a08SBenjamin LaHaise };
85db446a08SBenjamin LaHaise 
86e1bdd5f2SKent Overstreet struct kioctx_cpu {
87e1bdd5f2SKent Overstreet 	unsigned		reqs_available;
88e1bdd5f2SKent Overstreet };
89e1bdd5f2SKent Overstreet 
90dc48e56dSJens Axboe struct ctx_rq_wait {
91dc48e56dSJens Axboe 	struct completion comp;
92dc48e56dSJens Axboe 	atomic_t count;
93dc48e56dSJens Axboe };
94dc48e56dSJens Axboe 
954e179bcaSKent Overstreet struct kioctx {
96723be6e3SKent Overstreet 	struct percpu_ref	users;
9736f55889SKent Overstreet 	atomic_t		dead;
984e179bcaSKent Overstreet 
99e34ecee2SKent Overstreet 	struct percpu_ref	reqs;
100e34ecee2SKent Overstreet 
1014e179bcaSKent Overstreet 	unsigned long		user_id;
1024e179bcaSKent Overstreet 
103e1bdd5f2SKent Overstreet 	struct __percpu kioctx_cpu *cpu;
104e1bdd5f2SKent Overstreet 
105e1bdd5f2SKent Overstreet 	/*
106e1bdd5f2SKent Overstreet 	 * For percpu reqs_available, number of slots we move to/from global
107e1bdd5f2SKent Overstreet 	 * counter at a time:
108e1bdd5f2SKent Overstreet 	 */
109e1bdd5f2SKent Overstreet 	unsigned		req_batch;
1103e845ce0SKent Overstreet 	/*
1113e845ce0SKent Overstreet 	 * This is what userspace passed to io_setup(), it's not used for
1123e845ce0SKent Overstreet 	 * anything but counting against the global max_reqs quota.
1133e845ce0SKent Overstreet 	 *
11458c85dc2SKent Overstreet 	 * The real limit is nr_events - 1, which will be larger (see
1153e845ce0SKent Overstreet 	 * aio_setup_ring())
1163e845ce0SKent Overstreet 	 */
1174e179bcaSKent Overstreet 	unsigned		max_reqs;
1184e179bcaSKent Overstreet 
11958c85dc2SKent Overstreet 	/* Size of ringbuffer, in units of struct io_event */
12058c85dc2SKent Overstreet 	unsigned		nr_events;
1214e179bcaSKent Overstreet 
12258c85dc2SKent Overstreet 	unsigned long		mmap_base;
12358c85dc2SKent Overstreet 	unsigned long		mmap_size;
12458c85dc2SKent Overstreet 
12558c85dc2SKent Overstreet 	struct page		**ring_pages;
12658c85dc2SKent Overstreet 	long			nr_pages;
12758c85dc2SKent Overstreet 
128f729863aSTejun Heo 	struct rcu_work		free_rwork;	/* see free_ioctx() */
1294e23bcaeSKent Overstreet 
130e02ba72aSAnatol Pomozov 	/*
131e02ba72aSAnatol Pomozov 	 * signals when all in-flight requests are done
132e02ba72aSAnatol Pomozov 	 */
133dc48e56dSJens Axboe 	struct ctx_rq_wait	*rq_wait;
134e02ba72aSAnatol Pomozov 
1354e23bcaeSKent Overstreet 	struct {
13634e83fc6SKent Overstreet 		/*
13734e83fc6SKent Overstreet 		 * This counts the number of available slots in the ringbuffer,
13834e83fc6SKent Overstreet 		 * so we avoid overflowing it: it's decremented (if positive)
13934e83fc6SKent Overstreet 		 * when allocating a kiocb and incremented when the resulting
14034e83fc6SKent Overstreet 		 * io_event is pulled off the ringbuffer.
141e1bdd5f2SKent Overstreet 		 *
142e1bdd5f2SKent Overstreet 		 * We batch accesses to it with a percpu version.
14334e83fc6SKent Overstreet 		 */
14434e83fc6SKent Overstreet 		atomic_t	reqs_available;
1454e23bcaeSKent Overstreet 	} ____cacheline_aligned_in_smp;
1464e23bcaeSKent Overstreet 
1474e23bcaeSKent Overstreet 	struct {
1484e23bcaeSKent Overstreet 		spinlock_t	ctx_lock;
1494e23bcaeSKent Overstreet 		struct list_head active_reqs;	/* used for cancellation */
1504e23bcaeSKent Overstreet 	} ____cacheline_aligned_in_smp;
1514e23bcaeSKent Overstreet 
15258c85dc2SKent Overstreet 	struct {
15358c85dc2SKent Overstreet 		struct mutex	ring_lock;
1544e23bcaeSKent Overstreet 		wait_queue_head_t wait;
1554e23bcaeSKent Overstreet 	} ____cacheline_aligned_in_smp;
15658c85dc2SKent Overstreet 
15758c85dc2SKent Overstreet 	struct {
15858c85dc2SKent Overstreet 		unsigned	tail;
159d856f32aSBenjamin LaHaise 		unsigned	completed_events;
1600460fef2SKent Overstreet 		spinlock_t	completion_lock;
1614e23bcaeSKent Overstreet 	} ____cacheline_aligned_in_smp;
16258c85dc2SKent Overstreet 
16358c85dc2SKent Overstreet 	struct page		*internal_pages[AIO_RING_PAGES];
16436bc08ccSGu Zheng 	struct file		*aio_ring_file;
165db446a08SBenjamin LaHaise 
166db446a08SBenjamin LaHaise 	unsigned		id;
1674e179bcaSKent Overstreet };
1684e179bcaSKent Overstreet 
16984c4e1f8SLinus Torvalds /*
17084c4e1f8SLinus Torvalds  * First field must be the file pointer in all the
17184c4e1f8SLinus Torvalds  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
17284c4e1f8SLinus Torvalds  */
173a3c0d439SChristoph Hellwig struct fsync_iocb {
174a3c0d439SChristoph Hellwig 	struct file		*file;
17584c4e1f8SLinus Torvalds 	struct work_struct	work;
176a3c0d439SChristoph Hellwig 	bool			datasync;
177530f32fcSMiklos Szeredi 	struct cred		*creds;
178a3c0d439SChristoph Hellwig };
179a3c0d439SChristoph Hellwig 
180bfe4037eSChristoph Hellwig struct poll_iocb {
181bfe4037eSChristoph Hellwig 	struct file		*file;
182bfe4037eSChristoph Hellwig 	struct wait_queue_head	*head;
183bfe4037eSChristoph Hellwig 	__poll_t		events;
184bfe4037eSChristoph Hellwig 	bool			cancelled;
185363bee27SEric Biggers 	bool			work_scheduled;
186363bee27SEric Biggers 	bool			work_need_resched;
187bfe4037eSChristoph Hellwig 	struct wait_queue_entry	wait;
188bfe4037eSChristoph Hellwig 	struct work_struct	work;
189bfe4037eSChristoph Hellwig };
190bfe4037eSChristoph Hellwig 
19184c4e1f8SLinus Torvalds /*
19284c4e1f8SLinus Torvalds  * NOTE! Each of the iocb union members has the file pointer
19384c4e1f8SLinus Torvalds  * as the first entry in their struct definition. So you can
19484c4e1f8SLinus Torvalds  * access the file pointer through any of the sub-structs,
19584c4e1f8SLinus Torvalds  * or directly as just 'ki_filp' in this struct.
19684c4e1f8SLinus Torvalds  */
19704b2fa9fSChristoph Hellwig struct aio_kiocb {
19854843f87SChristoph Hellwig 	union {
19984c4e1f8SLinus Torvalds 		struct file		*ki_filp;
20054843f87SChristoph Hellwig 		struct kiocb		rw;
201a3c0d439SChristoph Hellwig 		struct fsync_iocb	fsync;
202bfe4037eSChristoph Hellwig 		struct poll_iocb	poll;
20354843f87SChristoph Hellwig 	};
20404b2fa9fSChristoph Hellwig 
20504b2fa9fSChristoph Hellwig 	struct kioctx		*ki_ctx;
20604b2fa9fSChristoph Hellwig 	kiocb_cancel_fn		*ki_cancel;
20704b2fa9fSChristoph Hellwig 
208a9339b78SAl Viro 	struct io_event		ki_res;
20904b2fa9fSChristoph Hellwig 
21004b2fa9fSChristoph Hellwig 	struct list_head	ki_list;	/* the aio core uses this
21104b2fa9fSChristoph Hellwig 						 * for cancellation */
2129018ccc4SChristoph Hellwig 	refcount_t		ki_refcnt;
21304b2fa9fSChristoph Hellwig 
21404b2fa9fSChristoph Hellwig 	/*
21504b2fa9fSChristoph Hellwig 	 * If the aio_resfd field of the userspace iocb is not zero,
21604b2fa9fSChristoph Hellwig 	 * this is the underlying eventfd context to deliver events to.
21704b2fa9fSChristoph Hellwig 	 */
21804b2fa9fSChristoph Hellwig 	struct eventfd_ctx	*ki_eventfd;
21904b2fa9fSChristoph Hellwig };
22004b2fa9fSChristoph Hellwig 
2211da177e4SLinus Torvalds /*------ sysctl variables----*/
222d55b5fdaSZach Brown static DEFINE_SPINLOCK(aio_nr_lock);
22386b12b6cSXiaoming Ni static unsigned long aio_nr;		/* current system wide number of aio requests */
22486b12b6cSXiaoming Ni static unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
2251da177e4SLinus Torvalds /*----end sysctl variables---*/
22686b12b6cSXiaoming Ni #ifdef CONFIG_SYSCTL
22786b12b6cSXiaoming Ni static struct ctl_table aio_sysctls[] = {
22886b12b6cSXiaoming Ni 	{
22986b12b6cSXiaoming Ni 		.procname	= "aio-nr",
23086b12b6cSXiaoming Ni 		.data		= &aio_nr,
23186b12b6cSXiaoming Ni 		.maxlen		= sizeof(aio_nr),
23286b12b6cSXiaoming Ni 		.mode		= 0444,
23386b12b6cSXiaoming Ni 		.proc_handler	= proc_doulongvec_minmax,
23486b12b6cSXiaoming Ni 	},
23586b12b6cSXiaoming Ni 	{
23686b12b6cSXiaoming Ni 		.procname	= "aio-max-nr",
23786b12b6cSXiaoming Ni 		.data		= &aio_max_nr,
23886b12b6cSXiaoming Ni 		.maxlen		= sizeof(aio_max_nr),
23986b12b6cSXiaoming Ni 		.mode		= 0644,
24086b12b6cSXiaoming Ni 		.proc_handler	= proc_doulongvec_minmax,
24186b12b6cSXiaoming Ni 	},
24286b12b6cSXiaoming Ni 	{}
24386b12b6cSXiaoming Ni };
24486b12b6cSXiaoming Ni 
aio_sysctl_init(void)24586b12b6cSXiaoming Ni static void __init aio_sysctl_init(void)
24686b12b6cSXiaoming Ni {
24786b12b6cSXiaoming Ni 	register_sysctl_init("fs", aio_sysctls);
24886b12b6cSXiaoming Ni }
24986b12b6cSXiaoming Ni #else
25086b12b6cSXiaoming Ni #define aio_sysctl_init() do { } while (0)
25186b12b6cSXiaoming Ni #endif
2521da177e4SLinus Torvalds 
253e18b890bSChristoph Lameter static struct kmem_cache	*kiocb_cachep;
254e18b890bSChristoph Lameter static struct kmem_cache	*kioctx_cachep;
2551da177e4SLinus Torvalds 
25671ad7490SBenjamin LaHaise static struct vfsmount *aio_mnt;
25771ad7490SBenjamin LaHaise 
25871ad7490SBenjamin LaHaise static const struct file_operations aio_ring_fops;
25971ad7490SBenjamin LaHaise static const struct address_space_operations aio_ctx_aops;
26071ad7490SBenjamin LaHaise 
aio_private_file(struct kioctx * ctx,loff_t nr_pages)26171ad7490SBenjamin LaHaise static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
26271ad7490SBenjamin LaHaise {
26371ad7490SBenjamin LaHaise 	struct file *file;
26471ad7490SBenjamin LaHaise 	struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb);
2657f62656bSDan Carpenter 	if (IS_ERR(inode))
2667f62656bSDan Carpenter 		return ERR_CAST(inode);
26771ad7490SBenjamin LaHaise 
26871ad7490SBenjamin LaHaise 	inode->i_mapping->a_ops = &aio_ctx_aops;
26971ad7490SBenjamin LaHaise 	inode->i_mapping->private_data = ctx;
27071ad7490SBenjamin LaHaise 	inode->i_size = PAGE_SIZE * nr_pages;
27171ad7490SBenjamin LaHaise 
272d93aa9d8SAl Viro 	file = alloc_file_pseudo(inode, aio_mnt, "[aio]",
273d93aa9d8SAl Viro 				O_RDWR, &aio_ring_fops);
274c9c554f2SAl Viro 	if (IS_ERR(file))
27571ad7490SBenjamin LaHaise 		iput(inode);
27671ad7490SBenjamin LaHaise 	return file;
27771ad7490SBenjamin LaHaise }
27871ad7490SBenjamin LaHaise 
aio_init_fs_context(struct fs_context * fc)27952db59dfSDavid Howells static int aio_init_fs_context(struct fs_context *fc)
28071ad7490SBenjamin LaHaise {
28152db59dfSDavid Howells 	if (!init_pseudo(fc, AIO_RING_MAGIC))
28252db59dfSDavid Howells 		return -ENOMEM;
28352db59dfSDavid Howells 	fc->s_iflags |= SB_I_NOEXEC;
28452db59dfSDavid Howells 	return 0;
28571ad7490SBenjamin LaHaise }
28671ad7490SBenjamin LaHaise 
2871da177e4SLinus Torvalds /* aio_setup
2881da177e4SLinus Torvalds  *	Creates the slab caches used by the aio routines, panic on
2891da177e4SLinus Torvalds  *	failure as this is done early during the boot sequence.
2901da177e4SLinus Torvalds  */
aio_setup(void)2911da177e4SLinus Torvalds static int __init aio_setup(void)
2921da177e4SLinus Torvalds {
29371ad7490SBenjamin LaHaise 	static struct file_system_type aio_fs = {
29471ad7490SBenjamin LaHaise 		.name		= "aio",
29552db59dfSDavid Howells 		.init_fs_context = aio_init_fs_context,
29671ad7490SBenjamin LaHaise 		.kill_sb	= kill_anon_super,
29771ad7490SBenjamin LaHaise 	};
29871ad7490SBenjamin LaHaise 	aio_mnt = kern_mount(&aio_fs);
29971ad7490SBenjamin LaHaise 	if (IS_ERR(aio_mnt))
30071ad7490SBenjamin LaHaise 		panic("Failed to create aio fs mount.");
30171ad7490SBenjamin LaHaise 
30204b2fa9fSChristoph Hellwig 	kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
3030a31bd5fSChristoph Lameter 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
30486b12b6cSXiaoming Ni 	aio_sysctl_init();
3051da177e4SLinus Torvalds 	return 0;
3061da177e4SLinus Torvalds }
307385773e0SH Hartley Sweeten __initcall(aio_setup);
3081da177e4SLinus Torvalds 
put_aio_ring_file(struct kioctx * ctx)3095e9ae2e5SBenjamin LaHaise static void put_aio_ring_file(struct kioctx *ctx)
3105e9ae2e5SBenjamin LaHaise {
3115e9ae2e5SBenjamin LaHaise 	struct file *aio_ring_file = ctx->aio_ring_file;
312de04e769SRasmus Villemoes 	struct address_space *i_mapping;
313de04e769SRasmus Villemoes 
3145e9ae2e5SBenjamin LaHaise 	if (aio_ring_file) {
31545063097SAl Viro 		truncate_setsize(file_inode(aio_ring_file), 0);
3165e9ae2e5SBenjamin LaHaise 
3175e9ae2e5SBenjamin LaHaise 		/* Prevent further access to the kioctx from migratepages */
31845063097SAl Viro 		i_mapping = aio_ring_file->f_mapping;
319de04e769SRasmus Villemoes 		spin_lock(&i_mapping->private_lock);
320de04e769SRasmus Villemoes 		i_mapping->private_data = NULL;
3215e9ae2e5SBenjamin LaHaise 		ctx->aio_ring_file = NULL;
322de04e769SRasmus Villemoes 		spin_unlock(&i_mapping->private_lock);
3235e9ae2e5SBenjamin LaHaise 
3245e9ae2e5SBenjamin LaHaise 		fput(aio_ring_file);
3255e9ae2e5SBenjamin LaHaise 	}
3265e9ae2e5SBenjamin LaHaise }
3275e9ae2e5SBenjamin LaHaise 
aio_free_ring(struct kioctx * ctx)3281da177e4SLinus Torvalds static void aio_free_ring(struct kioctx *ctx)
3291da177e4SLinus Torvalds {
33036bc08ccSGu Zheng 	int i;
3311da177e4SLinus Torvalds 
332fa8a53c3SBenjamin LaHaise 	/* Disconnect the kiotx from the ring file.  This prevents future
333fa8a53c3SBenjamin LaHaise 	 * accesses to the kioctx from page migration.
334fa8a53c3SBenjamin LaHaise 	 */
335fa8a53c3SBenjamin LaHaise 	put_aio_ring_file(ctx);
336fa8a53c3SBenjamin LaHaise 
33736bc08ccSGu Zheng 	for (i = 0; i < ctx->nr_pages; i++) {
3388e321fefSBenjamin LaHaise 		struct page *page;
33936bc08ccSGu Zheng 		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
34036bc08ccSGu Zheng 				page_count(ctx->ring_pages[i]));
3418e321fefSBenjamin LaHaise 		page = ctx->ring_pages[i];
3428e321fefSBenjamin LaHaise 		if (!page)
3438e321fefSBenjamin LaHaise 			continue;
3448e321fefSBenjamin LaHaise 		ctx->ring_pages[i] = NULL;
3458e321fefSBenjamin LaHaise 		put_page(page);
34636bc08ccSGu Zheng 	}
3471da177e4SLinus Torvalds 
348ddb8c45bSSasha Levin 	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
34958c85dc2SKent Overstreet 		kfree(ctx->ring_pages);
350ddb8c45bSSasha Levin 		ctx->ring_pages = NULL;
351ddb8c45bSSasha Levin 	}
35236bc08ccSGu Zheng }
35336bc08ccSGu Zheng 
aio_ring_mremap(struct vm_area_struct * vma)35414d07113SBrian Geffon static int aio_ring_mremap(struct vm_area_struct *vma)
35536bc08ccSGu Zheng {
3565477e70aSOleg Nesterov 	struct file *file = vma->vm_file;
357e4a0d3e7SPavel Emelyanov 	struct mm_struct *mm = vma->vm_mm;
358e4a0d3e7SPavel Emelyanov 	struct kioctx_table *table;
359b2edffddSAl Viro 	int i, res = -EINVAL;
360e4a0d3e7SPavel Emelyanov 
361e4a0d3e7SPavel Emelyanov 	spin_lock(&mm->ioctx_lock);
362e4a0d3e7SPavel Emelyanov 	rcu_read_lock();
363e4a0d3e7SPavel Emelyanov 	table = rcu_dereference(mm->ioctx_table);
36481e9d6f8SSeth Jenkins 	if (!table)
36581e9d6f8SSeth Jenkins 		goto out_unlock;
36681e9d6f8SSeth Jenkins 
367e4a0d3e7SPavel Emelyanov 	for (i = 0; i < table->nr; i++) {
368e4a0d3e7SPavel Emelyanov 		struct kioctx *ctx;
369e4a0d3e7SPavel Emelyanov 
370d0264c01STejun Heo 		ctx = rcu_dereference(table->table[i]);
371e4a0d3e7SPavel Emelyanov 		if (ctx && ctx->aio_ring_file == file) {
372b2edffddSAl Viro 			if (!atomic_read(&ctx->dead)) {
373e4a0d3e7SPavel Emelyanov 				ctx->user_id = ctx->mmap_base = vma->vm_start;
374b2edffddSAl Viro 				res = 0;
375b2edffddSAl Viro 			}
376e4a0d3e7SPavel Emelyanov 			break;
377e4a0d3e7SPavel Emelyanov 		}
378e4a0d3e7SPavel Emelyanov 	}
379e4a0d3e7SPavel Emelyanov 
38081e9d6f8SSeth Jenkins out_unlock:
381e4a0d3e7SPavel Emelyanov 	rcu_read_unlock();
382e4a0d3e7SPavel Emelyanov 	spin_unlock(&mm->ioctx_lock);
383b2edffddSAl Viro 	return res;
384e4a0d3e7SPavel Emelyanov }
385e4a0d3e7SPavel Emelyanov 
3865477e70aSOleg Nesterov static const struct vm_operations_struct aio_ring_vm_ops = {
3875477e70aSOleg Nesterov 	.mremap		= aio_ring_mremap,
3885477e70aSOleg Nesterov #if IS_ENABLED(CONFIG_MMU)
3895477e70aSOleg Nesterov 	.fault		= filemap_fault,
3905477e70aSOleg Nesterov 	.map_pages	= filemap_map_pages,
3915477e70aSOleg Nesterov 	.page_mkwrite	= filemap_page_mkwrite,
3925477e70aSOleg Nesterov #endif
3935477e70aSOleg Nesterov };
3945477e70aSOleg Nesterov 
aio_ring_mmap(struct file * file,struct vm_area_struct * vma)3955477e70aSOleg Nesterov static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
3965477e70aSOleg Nesterov {
3971c71222eSSuren Baghdasaryan 	vm_flags_set(vma, VM_DONTEXPAND);
3985477e70aSOleg Nesterov 	vma->vm_ops = &aio_ring_vm_ops;
3995477e70aSOleg Nesterov 	return 0;
4005477e70aSOleg Nesterov }
4015477e70aSOleg Nesterov 
40236bc08ccSGu Zheng static const struct file_operations aio_ring_fops = {
40336bc08ccSGu Zheng 	.mmap = aio_ring_mmap,
40436bc08ccSGu Zheng };
40536bc08ccSGu Zheng 
4060c45355fSBenjamin LaHaise #if IS_ENABLED(CONFIG_MIGRATION)
aio_migrate_folio(struct address_space * mapping,struct folio * dst,struct folio * src,enum migrate_mode mode)4073648951cSMatthew Wilcox (Oracle) static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
4083648951cSMatthew Wilcox (Oracle) 			struct folio *src, enum migrate_mode mode)
40936bc08ccSGu Zheng {
4105e9ae2e5SBenjamin LaHaise 	struct kioctx *ctx;
41136bc08ccSGu Zheng 	unsigned long flags;
412fa8a53c3SBenjamin LaHaise 	pgoff_t idx;
41336bc08ccSGu Zheng 	int rc;
41436bc08ccSGu Zheng 
4152916ecc0SJérôme Glisse 	/*
4162916ecc0SJérôme Glisse 	 * We cannot support the _NO_COPY case here, because copy needs to
4172916ecc0SJérôme Glisse 	 * happen under the ctx->completion_lock. That does not work with the
4182916ecc0SJérôme Glisse 	 * migration workflow of MIGRATE_SYNC_NO_COPY.
4192916ecc0SJérôme Glisse 	 */
4202916ecc0SJérôme Glisse 	if (mode == MIGRATE_SYNC_NO_COPY)
4212916ecc0SJérôme Glisse 		return -EINVAL;
4222916ecc0SJérôme Glisse 
4238e321fefSBenjamin LaHaise 	rc = 0;
4248e321fefSBenjamin LaHaise 
425fa8a53c3SBenjamin LaHaise 	/* mapping->private_lock here protects against the kioctx teardown.  */
4268e321fefSBenjamin LaHaise 	spin_lock(&mapping->private_lock);
4278e321fefSBenjamin LaHaise 	ctx = mapping->private_data;
428fa8a53c3SBenjamin LaHaise 	if (!ctx) {
429fa8a53c3SBenjamin LaHaise 		rc = -EINVAL;
430fa8a53c3SBenjamin LaHaise 		goto out;
431fa8a53c3SBenjamin LaHaise 	}
432fa8a53c3SBenjamin LaHaise 
433fa8a53c3SBenjamin LaHaise 	/* The ring_lock mutex.  The prevents aio_read_events() from writing
434fa8a53c3SBenjamin LaHaise 	 * to the ring's head, and prevents page migration from mucking in
435fa8a53c3SBenjamin LaHaise 	 * a partially initialized kiotx.
436fa8a53c3SBenjamin LaHaise 	 */
437fa8a53c3SBenjamin LaHaise 	if (!mutex_trylock(&ctx->ring_lock)) {
438fa8a53c3SBenjamin LaHaise 		rc = -EAGAIN;
439fa8a53c3SBenjamin LaHaise 		goto out;
440fa8a53c3SBenjamin LaHaise 	}
441fa8a53c3SBenjamin LaHaise 
4423648951cSMatthew Wilcox (Oracle) 	idx = src->index;
4438e321fefSBenjamin LaHaise 	if (idx < (pgoff_t)ctx->nr_pages) {
4443648951cSMatthew Wilcox (Oracle) 		/* Make sure the old folio hasn't already been changed */
4453648951cSMatthew Wilcox (Oracle) 		if (ctx->ring_pages[idx] != &src->page)
4468e321fefSBenjamin LaHaise 			rc = -EAGAIN;
4478e321fefSBenjamin LaHaise 	} else
4488e321fefSBenjamin LaHaise 		rc = -EINVAL;
4498e321fefSBenjamin LaHaise 
4508e321fefSBenjamin LaHaise 	if (rc != 0)
451fa8a53c3SBenjamin LaHaise 		goto out_unlock;
4528e321fefSBenjamin LaHaise 
45336bc08ccSGu Zheng 	/* Writeback must be complete */
4543648951cSMatthew Wilcox (Oracle) 	BUG_ON(folio_test_writeback(src));
4553648951cSMatthew Wilcox (Oracle) 	folio_get(dst);
45636bc08ccSGu Zheng 
4573648951cSMatthew Wilcox (Oracle) 	rc = folio_migrate_mapping(mapping, dst, src, 1);
45836bc08ccSGu Zheng 	if (rc != MIGRATEPAGE_SUCCESS) {
4593648951cSMatthew Wilcox (Oracle) 		folio_put(dst);
460fa8a53c3SBenjamin LaHaise 		goto out_unlock;
46136bc08ccSGu Zheng 	}
46236bc08ccSGu Zheng 
463fa8a53c3SBenjamin LaHaise 	/* Take completion_lock to prevent other writes to the ring buffer
4643648951cSMatthew Wilcox (Oracle) 	 * while the old folio is copied to the new.  This prevents new
465fa8a53c3SBenjamin LaHaise 	 * events from being lost.
4665e9ae2e5SBenjamin LaHaise 	 */
46736bc08ccSGu Zheng 	spin_lock_irqsave(&ctx->completion_lock, flags);
4683648951cSMatthew Wilcox (Oracle) 	folio_migrate_copy(dst, src);
4693648951cSMatthew Wilcox (Oracle) 	BUG_ON(ctx->ring_pages[idx] != &src->page);
4703648951cSMatthew Wilcox (Oracle) 	ctx->ring_pages[idx] = &dst->page;
47136bc08ccSGu Zheng 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
47236bc08ccSGu Zheng 
4733648951cSMatthew Wilcox (Oracle) 	/* The old folio is no longer accessible. */
4743648951cSMatthew Wilcox (Oracle) 	folio_put(src);
4758e321fefSBenjamin LaHaise 
476fa8a53c3SBenjamin LaHaise out_unlock:
477fa8a53c3SBenjamin LaHaise 	mutex_unlock(&ctx->ring_lock);
478fa8a53c3SBenjamin LaHaise out:
479fa8a53c3SBenjamin LaHaise 	spin_unlock(&mapping->private_lock);
48036bc08ccSGu Zheng 	return rc;
48136bc08ccSGu Zheng }
4823648951cSMatthew Wilcox (Oracle) #else
4833648951cSMatthew Wilcox (Oracle) #define aio_migrate_folio NULL
4840c45355fSBenjamin LaHaise #endif
48536bc08ccSGu Zheng 
48636bc08ccSGu Zheng static const struct address_space_operations aio_ctx_aops = {
48746de8b97SMatthew Wilcox (Oracle) 	.dirty_folio	= noop_dirty_folio,
4883648951cSMatthew Wilcox (Oracle) 	.migrate_folio	= aio_migrate_folio,
48936bc08ccSGu Zheng };
4901da177e4SLinus Torvalds 
aio_setup_ring(struct kioctx * ctx,unsigned int nr_events)4912a8a9867SMauricio Faria de Oliveira static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
4921da177e4SLinus Torvalds {
4931da177e4SLinus Torvalds 	struct aio_ring *ring;
49441003a7bSZach Brown 	struct mm_struct *mm = current->mm;
4953dc9acb6SLinus Torvalds 	unsigned long size, unused;
4961da177e4SLinus Torvalds 	int nr_pages;
49736bc08ccSGu Zheng 	int i;
49836bc08ccSGu Zheng 	struct file *file;
4991da177e4SLinus Torvalds 
5001da177e4SLinus Torvalds 	/* Compensate for the ring buffer's head/tail overlap entry */
5011da177e4SLinus Torvalds 	nr_events += 2;	/* 1 is required, 2 for good luck */
5021da177e4SLinus Torvalds 
5031da177e4SLinus Torvalds 	size = sizeof(struct aio_ring);
5041da177e4SLinus Torvalds 	size += sizeof(struct io_event) * nr_events;
5051da177e4SLinus Torvalds 
50636bc08ccSGu Zheng 	nr_pages = PFN_UP(size);
5071da177e4SLinus Torvalds 	if (nr_pages < 0)
5081da177e4SLinus Torvalds 		return -EINVAL;
5091da177e4SLinus Torvalds 
51071ad7490SBenjamin LaHaise 	file = aio_private_file(ctx, nr_pages);
51136bc08ccSGu Zheng 	if (IS_ERR(file)) {
51236bc08ccSGu Zheng 		ctx->aio_ring_file = NULL;
513fa8a53c3SBenjamin LaHaise 		return -ENOMEM;
51436bc08ccSGu Zheng 	}
5151da177e4SLinus Torvalds 
51636bc08ccSGu Zheng 	ctx->aio_ring_file = file;
51736bc08ccSGu Zheng 	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
51836bc08ccSGu Zheng 			/ sizeof(struct io_event);
51936bc08ccSGu Zheng 
52058c85dc2SKent Overstreet 	ctx->ring_pages = ctx->internal_pages;
5211da177e4SLinus Torvalds 	if (nr_pages > AIO_RING_PAGES) {
52258c85dc2SKent Overstreet 		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
52358c85dc2SKent Overstreet 					  GFP_KERNEL);
524d1b94327SGu Zheng 		if (!ctx->ring_pages) {
525d1b94327SGu Zheng 			put_aio_ring_file(ctx);
5261da177e4SLinus Torvalds 			return -ENOMEM;
5271da177e4SLinus Torvalds 		}
528d1b94327SGu Zheng 	}
5291da177e4SLinus Torvalds 
5303dc9acb6SLinus Torvalds 	for (i = 0; i < nr_pages; i++) {
5313dc9acb6SLinus Torvalds 		struct page *page;
53245063097SAl Viro 		page = find_or_create_page(file->f_mapping,
5335c075c5bSFabio M. De Francesco 					   i, GFP_USER | __GFP_ZERO);
5343dc9acb6SLinus Torvalds 		if (!page)
5353dc9acb6SLinus Torvalds 			break;
5363dc9acb6SLinus Torvalds 		pr_debug("pid(%d) page[%d]->count=%d\n",
5373dc9acb6SLinus Torvalds 			 current->pid, i, page_count(page));
5383dc9acb6SLinus Torvalds 		SetPageUptodate(page);
5393dc9acb6SLinus Torvalds 		unlock_page(page);
5403dc9acb6SLinus Torvalds 
5413dc9acb6SLinus Torvalds 		ctx->ring_pages[i] = page;
5423dc9acb6SLinus Torvalds 	}
5433dc9acb6SLinus Torvalds 	ctx->nr_pages = i;
5443dc9acb6SLinus Torvalds 
5453dc9acb6SLinus Torvalds 	if (unlikely(i != nr_pages)) {
5463dc9acb6SLinus Torvalds 		aio_free_ring(ctx);
547fa8a53c3SBenjamin LaHaise 		return -ENOMEM;
5483dc9acb6SLinus Torvalds 	}
5493dc9acb6SLinus Torvalds 
55058c85dc2SKent Overstreet 	ctx->mmap_size = nr_pages * PAGE_SIZE;
55158c85dc2SKent Overstreet 	pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
55236bc08ccSGu Zheng 
553d8ed45c5SMichel Lespinasse 	if (mmap_write_lock_killable(mm)) {
554013373e8SMichal Hocko 		ctx->mmap_size = 0;
555013373e8SMichal Hocko 		aio_free_ring(ctx);
556013373e8SMichal Hocko 		return -EINTR;
557013373e8SMichal Hocko 	}
558013373e8SMichal Hocko 
55945e55300SPeter Collingbourne 	ctx->mmap_base = do_mmap(ctx->aio_ring_file, 0, ctx->mmap_size,
560e3fc629dSAl Viro 				 PROT_READ | PROT_WRITE,
561592b5fadSYu-cheng Yu 				 MAP_SHARED, 0, 0, &unused, NULL);
562d8ed45c5SMichel Lespinasse 	mmap_write_unlock(mm);
5633dc9acb6SLinus Torvalds 	if (IS_ERR((void *)ctx->mmap_base)) {
56458c85dc2SKent Overstreet 		ctx->mmap_size = 0;
5651da177e4SLinus Torvalds 		aio_free_ring(ctx);
566fa8a53c3SBenjamin LaHaise 		return -ENOMEM;
5671da177e4SLinus Torvalds 	}
5681da177e4SLinus Torvalds 
56958c85dc2SKent Overstreet 	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
570d6c355c7SBenjamin LaHaise 
57158c85dc2SKent Overstreet 	ctx->user_id = ctx->mmap_base;
57258c85dc2SKent Overstreet 	ctx->nr_events = nr_events; /* trusted copy */
5731da177e4SLinus Torvalds 
5745c075c5bSFabio M. De Francesco 	ring = page_address(ctx->ring_pages[0]);
5751da177e4SLinus Torvalds 	ring->nr = nr_events;	/* user copy */
576db446a08SBenjamin LaHaise 	ring->id = ~0U;
5771da177e4SLinus Torvalds 	ring->head = ring->tail = 0;
5781da177e4SLinus Torvalds 	ring->magic = AIO_RING_MAGIC;
5791da177e4SLinus Torvalds 	ring->compat_features = AIO_RING_COMPAT_FEATURES;
5801da177e4SLinus Torvalds 	ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
5811da177e4SLinus Torvalds 	ring->header_length = sizeof(struct aio_ring);
58258c85dc2SKent Overstreet 	flush_dcache_page(ctx->ring_pages[0]);
5831da177e4SLinus Torvalds 
5841da177e4SLinus Torvalds 	return 0;
5851da177e4SLinus Torvalds }
5861da177e4SLinus Torvalds 
5871da177e4SLinus Torvalds #define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))
5881da177e4SLinus Torvalds #define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
5891da177e4SLinus Torvalds #define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
5901da177e4SLinus Torvalds 
kiocb_set_cancel_fn(struct kiocb * iocb,kiocb_cancel_fn * cancel)59104b2fa9fSChristoph Hellwig void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
5920460fef2SKent Overstreet {
593*c01ed748SBart Van Assche 	struct aio_kiocb *req;
594*c01ed748SBart Van Assche 	struct kioctx *ctx;
5950460fef2SKent Overstreet 	unsigned long flags;
5960460fef2SKent Overstreet 
597e7e23fc5SBart Van Assche 	/*
598e7e23fc5SBart Van Assche 	 * kiocb didn't come from aio or is neither a read nor a write, hence
599e7e23fc5SBart Van Assche 	 * ignore it.
600e7e23fc5SBart Van Assche 	 */
601e7e23fc5SBart Van Assche 	if (!(iocb->ki_flags & IOCB_AIO_RW))
602e7e23fc5SBart Van Assche 		return;
603e7e23fc5SBart Van Assche 
604*c01ed748SBart Van Assche 	req = container_of(iocb, struct aio_kiocb, rw);
605*c01ed748SBart Van Assche 
60675321b50SChristoph Hellwig 	if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
60775321b50SChristoph Hellwig 		return;
60875321b50SChristoph Hellwig 
609*c01ed748SBart Van Assche 	ctx = req->ki_ctx;
610*c01ed748SBart Van Assche 
6110460fef2SKent Overstreet 	spin_lock_irqsave(&ctx->ctx_lock, flags);
61275321b50SChristoph Hellwig 	list_add_tail(&req->ki_list, &ctx->active_reqs);
6130460fef2SKent Overstreet 	req->ki_cancel = cancel;
6140460fef2SKent Overstreet 	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
6150460fef2SKent Overstreet }
6160460fef2SKent Overstreet EXPORT_SYMBOL(kiocb_set_cancel_fn);
6170460fef2SKent Overstreet 
618a6d7cff4STejun Heo /*
619a6d7cff4STejun Heo  * free_ioctx() should be RCU delayed to synchronize against the RCU
620a6d7cff4STejun Heo  * protected lookup_ioctx() and also needs process context to call
621f729863aSTejun Heo  * aio_free_ring().  Use rcu_work.
622a6d7cff4STejun Heo  */
free_ioctx(struct work_struct * work)623e34ecee2SKent Overstreet static void free_ioctx(struct work_struct *work)
62436f55889SKent Overstreet {
625f729863aSTejun Heo 	struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx,
626f729863aSTejun Heo 					  free_rwork);
627e34ecee2SKent Overstreet 	pr_debug("freeing %p\n", ctx);
628e34ecee2SKent Overstreet 
629e34ecee2SKent Overstreet 	aio_free_ring(ctx);
630e1bdd5f2SKent Overstreet 	free_percpu(ctx->cpu);
6319a1049daSTejun Heo 	percpu_ref_exit(&ctx->reqs);
6329a1049daSTejun Heo 	percpu_ref_exit(&ctx->users);
63336f55889SKent Overstreet 	kmem_cache_free(kioctx_cachep, ctx);
63436f55889SKent Overstreet }
63536f55889SKent Overstreet 
free_ioctx_reqs(struct percpu_ref * ref)636e34ecee2SKent Overstreet static void free_ioctx_reqs(struct percpu_ref *ref)
637e34ecee2SKent Overstreet {
638e34ecee2SKent Overstreet 	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
639e34ecee2SKent Overstreet 
640e02ba72aSAnatol Pomozov 	/* At this point we know that there are no any in-flight requests */
641dc48e56dSJens Axboe 	if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
642dc48e56dSJens Axboe 		complete(&ctx->rq_wait->comp);
643e02ba72aSAnatol Pomozov 
644a6d7cff4STejun Heo 	/* Synchronize against RCU protected table->table[] dereferences */
645f729863aSTejun Heo 	INIT_RCU_WORK(&ctx->free_rwork, free_ioctx);
646f729863aSTejun Heo 	queue_rcu_work(system_wq, &ctx->free_rwork);
647e34ecee2SKent Overstreet }
648e34ecee2SKent Overstreet 
64936f55889SKent Overstreet /*
65036f55889SKent Overstreet  * When this function runs, the kioctx has been removed from the "hash table"
65136f55889SKent Overstreet  * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
65236f55889SKent Overstreet  * now it's safe to cancel any that need to be.
65336f55889SKent Overstreet  */
free_ioctx_users(struct percpu_ref * ref)654e34ecee2SKent Overstreet static void free_ioctx_users(struct percpu_ref *ref)
65536f55889SKent Overstreet {
656e34ecee2SKent Overstreet 	struct kioctx *ctx = container_of(ref, struct kioctx, users);
65704b2fa9fSChristoph Hellwig 	struct aio_kiocb *req;
65836f55889SKent Overstreet 
65936f55889SKent Overstreet 	spin_lock_irq(&ctx->ctx_lock);
66036f55889SKent Overstreet 
66136f55889SKent Overstreet 	while (!list_empty(&ctx->active_reqs)) {
66236f55889SKent Overstreet 		req = list_first_entry(&ctx->active_reqs,
66304b2fa9fSChristoph Hellwig 				       struct aio_kiocb, ki_list);
664888933f8SChristoph Hellwig 		req->ki_cancel(&req->rw);
6654faa9996SAl Viro 		list_del_init(&req->ki_list);
66636f55889SKent Overstreet 	}
66736f55889SKent Overstreet 
66836f55889SKent Overstreet 	spin_unlock_irq(&ctx->ctx_lock);
66936f55889SKent Overstreet 
670e34ecee2SKent Overstreet 	percpu_ref_kill(&ctx->reqs);
671e34ecee2SKent Overstreet 	percpu_ref_put(&ctx->reqs);
67236f55889SKent Overstreet }
67336f55889SKent Overstreet 
ioctx_add_table(struct kioctx * ctx,struct mm_struct * mm)674db446a08SBenjamin LaHaise static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
675db446a08SBenjamin LaHaise {
676db446a08SBenjamin LaHaise 	unsigned i, new_nr;
677db446a08SBenjamin LaHaise 	struct kioctx_table *table, *old;
678db446a08SBenjamin LaHaise 	struct aio_ring *ring;
679db446a08SBenjamin LaHaise 
680db446a08SBenjamin LaHaise 	spin_lock(&mm->ioctx_lock);
681855ef0deSOleg Nesterov 	table = rcu_dereference_raw(mm->ioctx_table);
682db446a08SBenjamin LaHaise 
683db446a08SBenjamin LaHaise 	while (1) {
684db446a08SBenjamin LaHaise 		if (table)
685db446a08SBenjamin LaHaise 			for (i = 0; i < table->nr; i++)
686d0264c01STejun Heo 				if (!rcu_access_pointer(table->table[i])) {
687db446a08SBenjamin LaHaise 					ctx->id = i;
688d0264c01STejun Heo 					rcu_assign_pointer(table->table[i], ctx);
689db446a08SBenjamin LaHaise 					spin_unlock(&mm->ioctx_lock);
690db446a08SBenjamin LaHaise 
691fa8a53c3SBenjamin LaHaise 					/* While kioctx setup is in progress,
692fa8a53c3SBenjamin LaHaise 					 * we are protected from page migration
693fa8a53c3SBenjamin LaHaise 					 * changes ring_pages by ->ring_lock.
694fa8a53c3SBenjamin LaHaise 					 */
6955c075c5bSFabio M. De Francesco 					ring = page_address(ctx->ring_pages[0]);
696db446a08SBenjamin LaHaise 					ring->id = ctx->id;
697db446a08SBenjamin LaHaise 					return 0;
698db446a08SBenjamin LaHaise 				}
699db446a08SBenjamin LaHaise 
700db446a08SBenjamin LaHaise 		new_nr = (table ? table->nr : 1) * 4;
701db446a08SBenjamin LaHaise 		spin_unlock(&mm->ioctx_lock);
702db446a08SBenjamin LaHaise 
7036446c4fbSLen Baker 		table = kzalloc(struct_size(table, table, new_nr), GFP_KERNEL);
704db446a08SBenjamin LaHaise 		if (!table)
705db446a08SBenjamin LaHaise 			return -ENOMEM;
706db446a08SBenjamin LaHaise 
707db446a08SBenjamin LaHaise 		table->nr = new_nr;
708db446a08SBenjamin LaHaise 
709db446a08SBenjamin LaHaise 		spin_lock(&mm->ioctx_lock);
710855ef0deSOleg Nesterov 		old = rcu_dereference_raw(mm->ioctx_table);
711db446a08SBenjamin LaHaise 
712db446a08SBenjamin LaHaise 		if (!old) {
713db446a08SBenjamin LaHaise 			rcu_assign_pointer(mm->ioctx_table, table);
714db446a08SBenjamin LaHaise 		} else if (table->nr > old->nr) {
715db446a08SBenjamin LaHaise 			memcpy(table->table, old->table,
716db446a08SBenjamin LaHaise 			       old->nr * sizeof(struct kioctx *));
717db446a08SBenjamin LaHaise 
718db446a08SBenjamin LaHaise 			rcu_assign_pointer(mm->ioctx_table, table);
719db446a08SBenjamin LaHaise 			kfree_rcu(old, rcu);
720db446a08SBenjamin LaHaise 		} else {
721db446a08SBenjamin LaHaise 			kfree(table);
722db446a08SBenjamin LaHaise 			table = old;
723db446a08SBenjamin LaHaise 		}
724db446a08SBenjamin LaHaise 	}
725db446a08SBenjamin LaHaise }
726db446a08SBenjamin LaHaise 
aio_nr_sub(unsigned nr)727e34ecee2SKent Overstreet static void aio_nr_sub(unsigned nr)
728e34ecee2SKent Overstreet {
729e34ecee2SKent Overstreet 	spin_lock(&aio_nr_lock);
730e34ecee2SKent Overstreet 	if (WARN_ON(aio_nr - nr > aio_nr))
731e34ecee2SKent Overstreet 		aio_nr = 0;
732e34ecee2SKent Overstreet 	else
733e34ecee2SKent Overstreet 		aio_nr -= nr;
734e34ecee2SKent Overstreet 	spin_unlock(&aio_nr_lock);
735e34ecee2SKent Overstreet }
736e34ecee2SKent Overstreet 
7371da177e4SLinus Torvalds /* ioctx_alloc
7381da177e4SLinus Torvalds  *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
7391da177e4SLinus Torvalds  */
ioctx_alloc(unsigned nr_events)7401da177e4SLinus Torvalds static struct kioctx *ioctx_alloc(unsigned nr_events)
7411da177e4SLinus Torvalds {
74241003a7bSZach Brown 	struct mm_struct *mm = current->mm;
7431da177e4SLinus Torvalds 	struct kioctx *ctx;
744e23754f8SAl Viro 	int err = -ENOMEM;
7451da177e4SLinus Torvalds 
746e1bdd5f2SKent Overstreet 	/*
7472a8a9867SMauricio Faria de Oliveira 	 * Store the original nr_events -- what userspace passed to io_setup(),
7482a8a9867SMauricio Faria de Oliveira 	 * for counting against the global limit -- before it changes.
7492a8a9867SMauricio Faria de Oliveira 	 */
7502a8a9867SMauricio Faria de Oliveira 	unsigned int max_reqs = nr_events;
7512a8a9867SMauricio Faria de Oliveira 
7522a8a9867SMauricio Faria de Oliveira 	/*
753e1bdd5f2SKent Overstreet 	 * We keep track of the number of available ringbuffer slots, to prevent
754e1bdd5f2SKent Overstreet 	 * overflow (reqs_available), and we also use percpu counters for this.
755e1bdd5f2SKent Overstreet 	 *
756e1bdd5f2SKent Overstreet 	 * So since up to half the slots might be on other cpu's percpu counters
757e1bdd5f2SKent Overstreet 	 * and unavailable, double nr_events so userspace sees what they
758e1bdd5f2SKent Overstreet 	 * expected: additionally, we move req_batch slots to/from percpu
759e1bdd5f2SKent Overstreet 	 * counters at a time, so make sure that isn't 0:
760e1bdd5f2SKent Overstreet 	 */
761e1bdd5f2SKent Overstreet 	nr_events = max(nr_events, num_possible_cpus() * 4);
762e1bdd5f2SKent Overstreet 	nr_events *= 2;
763e1bdd5f2SKent Overstreet 
7641da177e4SLinus Torvalds 	/* Prevent overflows */
76508397acdSAl Viro 	if (nr_events > (0x10000000U / sizeof(struct io_event))) {
7661da177e4SLinus Torvalds 		pr_debug("ENOMEM: nr_events too high\n");
7671da177e4SLinus Torvalds 		return ERR_PTR(-EINVAL);
7681da177e4SLinus Torvalds 	}
7691da177e4SLinus Torvalds 
7702a8a9867SMauricio Faria de Oliveira 	if (!nr_events || (unsigned long)max_reqs > aio_max_nr)
7711da177e4SLinus Torvalds 		return ERR_PTR(-EAGAIN);
7721da177e4SLinus Torvalds 
773c3762229SRobert P. J. Day 	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
7741da177e4SLinus Torvalds 	if (!ctx)
7751da177e4SLinus Torvalds 		return ERR_PTR(-ENOMEM);
7761da177e4SLinus Torvalds 
7772a8a9867SMauricio Faria de Oliveira 	ctx->max_reqs = max_reqs;
7781da177e4SLinus Torvalds 
779fa8a53c3SBenjamin LaHaise 	spin_lock_init(&ctx->ctx_lock);
780fa8a53c3SBenjamin LaHaise 	spin_lock_init(&ctx->completion_lock);
781fa8a53c3SBenjamin LaHaise 	mutex_init(&ctx->ring_lock);
782fa8a53c3SBenjamin LaHaise 	/* Protect against page migration throughout kiotx setup by keeping
783fa8a53c3SBenjamin LaHaise 	 * the ring_lock mutex held until setup is complete. */
784fa8a53c3SBenjamin LaHaise 	mutex_lock(&ctx->ring_lock);
785fa8a53c3SBenjamin LaHaise 	init_waitqueue_head(&ctx->wait);
786fa8a53c3SBenjamin LaHaise 
787fa8a53c3SBenjamin LaHaise 	INIT_LIST_HEAD(&ctx->active_reqs);
788fa8a53c3SBenjamin LaHaise 
7892aad2a86STejun Heo 	if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
790e34ecee2SKent Overstreet 		goto err;
791e34ecee2SKent Overstreet 
7922aad2a86STejun Heo 	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))
793e34ecee2SKent Overstreet 		goto err;
794723be6e3SKent Overstreet 
795e1bdd5f2SKent Overstreet 	ctx->cpu = alloc_percpu(struct kioctx_cpu);
796e1bdd5f2SKent Overstreet 	if (!ctx->cpu)
797e34ecee2SKent Overstreet 		goto err;
7981da177e4SLinus Torvalds 
7992a8a9867SMauricio Faria de Oliveira 	err = aio_setup_ring(ctx, nr_events);
800fa8a53c3SBenjamin LaHaise 	if (err < 0)
801e34ecee2SKent Overstreet 		goto err;
802e1bdd5f2SKent Overstreet 
80334e83fc6SKent Overstreet 	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
804e1bdd5f2SKent Overstreet 	ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
8056878ea72SBenjamin LaHaise 	if (ctx->req_batch < 1)
8066878ea72SBenjamin LaHaise 		ctx->req_batch = 1;
80734e83fc6SKent Overstreet 
8081da177e4SLinus Torvalds 	/* limit the number of system wide aios */
8099fa1cb39SAl Viro 	spin_lock(&aio_nr_lock);
8102a8a9867SMauricio Faria de Oliveira 	if (aio_nr + ctx->max_reqs > aio_max_nr ||
8112a8a9867SMauricio Faria de Oliveira 	    aio_nr + ctx->max_reqs < aio_nr) {
8129fa1cb39SAl Viro 		spin_unlock(&aio_nr_lock);
813e34ecee2SKent Overstreet 		err = -EAGAIN;
814d1b94327SGu Zheng 		goto err_ctx;
8152dd542b7SAl Viro 	}
816d55b5fdaSZach Brown 	aio_nr += ctx->max_reqs;
8179fa1cb39SAl Viro 	spin_unlock(&aio_nr_lock);
8181da177e4SLinus Torvalds 
819723be6e3SKent Overstreet 	percpu_ref_get(&ctx->users);	/* io_setup() will drop this ref */
8201881686fSBenjamin LaHaise 	percpu_ref_get(&ctx->reqs);	/* free_ioctx_users() will drop this */
821723be6e3SKent Overstreet 
822da90382cSBenjamin LaHaise 	err = ioctx_add_table(ctx, mm);
823da90382cSBenjamin LaHaise 	if (err)
824e34ecee2SKent Overstreet 		goto err_cleanup;
825da90382cSBenjamin LaHaise 
826fa8a53c3SBenjamin LaHaise 	/* Release the ring_lock mutex now that all setup is complete. */
827fa8a53c3SBenjamin LaHaise 	mutex_unlock(&ctx->ring_lock);
828fa8a53c3SBenjamin LaHaise 
829caf4167aSKent Overstreet 	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
83058c85dc2SKent Overstreet 		 ctx, ctx->user_id, mm, ctx->nr_events);
8311da177e4SLinus Torvalds 	return ctx;
8321da177e4SLinus Torvalds 
833e34ecee2SKent Overstreet err_cleanup:
834e34ecee2SKent Overstreet 	aio_nr_sub(ctx->max_reqs);
835d1b94327SGu Zheng err_ctx:
836deeb8525SAl Viro 	atomic_set(&ctx->dead, 1);
837deeb8525SAl Viro 	if (ctx->mmap_size)
838deeb8525SAl Viro 		vm_munmap(ctx->mmap_base, ctx->mmap_size);
839d1b94327SGu Zheng 	aio_free_ring(ctx);
840e34ecee2SKent Overstreet err:
841fa8a53c3SBenjamin LaHaise 	mutex_unlock(&ctx->ring_lock);
842e1bdd5f2SKent Overstreet 	free_percpu(ctx->cpu);
8439a1049daSTejun Heo 	percpu_ref_exit(&ctx->reqs);
8449a1049daSTejun Heo 	percpu_ref_exit(&ctx->users);
8451da177e4SLinus Torvalds 	kmem_cache_free(kioctx_cachep, ctx);
846caf4167aSKent Overstreet 	pr_debug("error allocating ioctx %d\n", err);
847e23754f8SAl Viro 	return ERR_PTR(err);
8481da177e4SLinus Torvalds }
8491da177e4SLinus Torvalds 
85036f55889SKent Overstreet /* kill_ioctx
8511da177e4SLinus Torvalds  *	Cancels all outstanding aio requests on an aio context.  Used
8521da177e4SLinus Torvalds  *	when the processes owning a context have all exited to encourage
8531da177e4SLinus Torvalds  *	the rapid destruction of the kioctx.
8541da177e4SLinus Torvalds  */
kill_ioctx(struct mm_struct * mm,struct kioctx * ctx,struct ctx_rq_wait * wait)855fb2d4483SBenjamin LaHaise static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
856dc48e56dSJens Axboe 		      struct ctx_rq_wait *wait)
8571da177e4SLinus Torvalds {
858db446a08SBenjamin LaHaise 	struct kioctx_table *table;
859db446a08SBenjamin LaHaise 
860db446a08SBenjamin LaHaise 	spin_lock(&mm->ioctx_lock);
861b2edffddSAl Viro 	if (atomic_xchg(&ctx->dead, 1)) {
862b2edffddSAl Viro 		spin_unlock(&mm->ioctx_lock);
863b2edffddSAl Viro 		return -EINVAL;
864b2edffddSAl Viro 	}
865b2edffddSAl Viro 
866855ef0deSOleg Nesterov 	table = rcu_dereference_raw(mm->ioctx_table);
867d0264c01STejun Heo 	WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id]));
868d0264c01STejun Heo 	RCU_INIT_POINTER(table->table[ctx->id], NULL);
869db446a08SBenjamin LaHaise 	spin_unlock(&mm->ioctx_lock);
870db446a08SBenjamin LaHaise 
871a6d7cff4STejun Heo 	/* free_ioctx_reqs() will do the necessary RCU synchronization */
872723be6e3SKent Overstreet 	wake_up_all(&ctx->wait);
87306af121eSAl Viro 
87436f55889SKent Overstreet 	/*
8754fcc712fSKent Overstreet 	 * It'd be more correct to do this in free_ioctx(), after all
8764fcc712fSKent Overstreet 	 * the outstanding kiocbs have finished - but by then io_destroy
8774fcc712fSKent Overstreet 	 * has already returned, so io_setup() could potentially return
8784fcc712fSKent Overstreet 	 * -EAGAIN with no ioctxs actually in use (as far as userspace
8794fcc712fSKent Overstreet 	 *  could tell).
88036f55889SKent Overstreet 	 */
881e34ecee2SKent Overstreet 	aio_nr_sub(ctx->max_reqs);
8824fcc712fSKent Overstreet 
8834fcc712fSKent Overstreet 	if (ctx->mmap_size)
8844fcc712fSKent Overstreet 		vm_munmap(ctx->mmap_base, ctx->mmap_size);
8854fcc712fSKent Overstreet 
886dc48e56dSJens Axboe 	ctx->rq_wait = wait;
887723be6e3SKent Overstreet 	percpu_ref_kill(&ctx->users);
888fb2d4483SBenjamin LaHaise 	return 0;
8891da177e4SLinus Torvalds }
8901da177e4SLinus Torvalds 
89136f55889SKent Overstreet /*
89236f55889SKent Overstreet  * exit_aio: called when the last user of mm goes away.  At this point, there is
89336f55889SKent Overstreet  * no way for any new requests to be submited or any of the io_* syscalls to be
89436f55889SKent Overstreet  * called on the context.
89536f55889SKent Overstreet  *
89636f55889SKent Overstreet  * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on
89736f55889SKent Overstreet  * them.
8981da177e4SLinus Torvalds  */
exit_aio(struct mm_struct * mm)899fc9b52cdSHarvey Harrison void exit_aio(struct mm_struct *mm)
9001da177e4SLinus Torvalds {
9014b70ac5fSOleg Nesterov 	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
902dc48e56dSJens Axboe 	struct ctx_rq_wait wait;
903dc48e56dSJens Axboe 	int i, skipped;
904abf137ddSJens Axboe 
9054b70ac5fSOleg Nesterov 	if (!table)
906db446a08SBenjamin LaHaise 		return;
907db446a08SBenjamin LaHaise 
908dc48e56dSJens Axboe 	atomic_set(&wait.count, table->nr);
909dc48e56dSJens Axboe 	init_completion(&wait.comp);
910dc48e56dSJens Axboe 
911dc48e56dSJens Axboe 	skipped = 0;
9124b70ac5fSOleg Nesterov 	for (i = 0; i < table->nr; ++i) {
913d0264c01STejun Heo 		struct kioctx *ctx =
914d0264c01STejun Heo 			rcu_dereference_protected(table->table[i], true);
915db446a08SBenjamin LaHaise 
916dc48e56dSJens Axboe 		if (!ctx) {
917dc48e56dSJens Axboe 			skipped++;
9184b70ac5fSOleg Nesterov 			continue;
919dc48e56dSJens Axboe 		}
920dc48e56dSJens Axboe 
921936af157SAl Viro 		/*
9224b70ac5fSOleg Nesterov 		 * We don't need to bother with munmap() here - exit_mmap(mm)
9234b70ac5fSOleg Nesterov 		 * is coming and it'll unmap everything. And we simply can't,
9244b70ac5fSOleg Nesterov 		 * this is not necessarily our ->mm.
9254b70ac5fSOleg Nesterov 		 * Since kill_ioctx() uses non-zero ->mmap_size as indicator
9264b70ac5fSOleg Nesterov 		 * that it needs to unmap the area, just set it to 0.
927936af157SAl Viro 		 */
92858c85dc2SKent Overstreet 		ctx->mmap_size = 0;
929dc48e56dSJens Axboe 		kill_ioctx(mm, ctx, &wait);
930dc48e56dSJens Axboe 	}
93136f55889SKent Overstreet 
932dc48e56dSJens Axboe 	if (!atomic_sub_and_test(skipped, &wait.count)) {
9336098b45bSGu Zheng 		/* Wait until all IO for the context are done. */
934dc48e56dSJens Axboe 		wait_for_completion(&wait.comp);
9351da177e4SLinus Torvalds 	}
9364b70ac5fSOleg Nesterov 
9374b70ac5fSOleg Nesterov 	RCU_INIT_POINTER(mm->ioctx_table, NULL);
9384b70ac5fSOleg Nesterov 	kfree(table);
9391da177e4SLinus Torvalds }
9401da177e4SLinus Torvalds 
put_reqs_available(struct kioctx * ctx,unsigned nr)941e1bdd5f2SKent Overstreet static void put_reqs_available(struct kioctx *ctx, unsigned nr)
942e1bdd5f2SKent Overstreet {
943e1bdd5f2SKent Overstreet 	struct kioctx_cpu *kcpu;
944263782c1SBenjamin LaHaise 	unsigned long flags;
945e1bdd5f2SKent Overstreet 
946263782c1SBenjamin LaHaise 	local_irq_save(flags);
947be6fb451SBenjamin LaHaise 	kcpu = this_cpu_ptr(ctx->cpu);
948e1bdd5f2SKent Overstreet 	kcpu->reqs_available += nr;
949263782c1SBenjamin LaHaise 
950e1bdd5f2SKent Overstreet 	while (kcpu->reqs_available >= ctx->req_batch * 2) {
951e1bdd5f2SKent Overstreet 		kcpu->reqs_available -= ctx->req_batch;
952e1bdd5f2SKent Overstreet 		atomic_add(ctx->req_batch, &ctx->reqs_available);
953e1bdd5f2SKent Overstreet 	}
954e1bdd5f2SKent Overstreet 
955263782c1SBenjamin LaHaise 	local_irq_restore(flags);
956e1bdd5f2SKent Overstreet }
957e1bdd5f2SKent Overstreet 
__get_reqs_available(struct kioctx * ctx)958432c7997SChristoph Hellwig static bool __get_reqs_available(struct kioctx *ctx)
959e1bdd5f2SKent Overstreet {
960e1bdd5f2SKent Overstreet 	struct kioctx_cpu *kcpu;
961e1bdd5f2SKent Overstreet 	bool ret = false;
962263782c1SBenjamin LaHaise 	unsigned long flags;
963e1bdd5f2SKent Overstreet 
964263782c1SBenjamin LaHaise 	local_irq_save(flags);
965be6fb451SBenjamin LaHaise 	kcpu = this_cpu_ptr(ctx->cpu);
966e1bdd5f2SKent Overstreet 	if (!kcpu->reqs_available) {
96738ace0d5SUros Bizjak 		int avail = atomic_read(&ctx->reqs_available);
968e1bdd5f2SKent Overstreet 
969e1bdd5f2SKent Overstreet 		do {
970e1bdd5f2SKent Overstreet 			if (avail < ctx->req_batch)
971e1bdd5f2SKent Overstreet 				goto out;
97238ace0d5SUros Bizjak 		} while (!atomic_try_cmpxchg(&ctx->reqs_available,
97338ace0d5SUros Bizjak 					     &avail, avail - ctx->req_batch));
974e1bdd5f2SKent Overstreet 
975e1bdd5f2SKent Overstreet 		kcpu->reqs_available += ctx->req_batch;
976e1bdd5f2SKent Overstreet 	}
977e1bdd5f2SKent Overstreet 
978e1bdd5f2SKent Overstreet 	ret = true;
979e1bdd5f2SKent Overstreet 	kcpu->reqs_available--;
980e1bdd5f2SKent Overstreet out:
981263782c1SBenjamin LaHaise 	local_irq_restore(flags);
982e1bdd5f2SKent Overstreet 	return ret;
983e1bdd5f2SKent Overstreet }
984e1bdd5f2SKent Overstreet 
985d856f32aSBenjamin LaHaise /* refill_reqs_available
986d856f32aSBenjamin LaHaise  *	Updates the reqs_available reference counts used for tracking the
987d856f32aSBenjamin LaHaise  *	number of free slots in the completion ring.  This can be called
988d856f32aSBenjamin LaHaise  *	from aio_complete() (to optimistically update reqs_available) or
989d856f32aSBenjamin LaHaise  *	from aio_get_req() (the we're out of events case).  It must be
990d856f32aSBenjamin LaHaise  *	called holding ctx->completion_lock.
991d856f32aSBenjamin LaHaise  */
refill_reqs_available(struct kioctx * ctx,unsigned head,unsigned tail)992d856f32aSBenjamin LaHaise static void refill_reqs_available(struct kioctx *ctx, unsigned head,
993d856f32aSBenjamin LaHaise                                   unsigned tail)
994d856f32aSBenjamin LaHaise {
995d856f32aSBenjamin LaHaise 	unsigned events_in_ring, completed;
996d856f32aSBenjamin LaHaise 
997d856f32aSBenjamin LaHaise 	/* Clamp head since userland can write to it. */
998d856f32aSBenjamin LaHaise 	head %= ctx->nr_events;
999d856f32aSBenjamin LaHaise 	if (head <= tail)
1000d856f32aSBenjamin LaHaise 		events_in_ring = tail - head;
1001d856f32aSBenjamin LaHaise 	else
1002d856f32aSBenjamin LaHaise 		events_in_ring = ctx->nr_events - (head - tail);
1003d856f32aSBenjamin LaHaise 
1004d856f32aSBenjamin LaHaise 	completed = ctx->completed_events;
1005d856f32aSBenjamin LaHaise 	if (events_in_ring < completed)
1006d856f32aSBenjamin LaHaise 		completed -= events_in_ring;
1007d856f32aSBenjamin LaHaise 	else
1008d856f32aSBenjamin LaHaise 		completed = 0;
1009d856f32aSBenjamin LaHaise 
1010d856f32aSBenjamin LaHaise 	if (!completed)
1011d856f32aSBenjamin LaHaise 		return;
1012d856f32aSBenjamin LaHaise 
1013d856f32aSBenjamin LaHaise 	ctx->completed_events -= completed;
1014d856f32aSBenjamin LaHaise 	put_reqs_available(ctx, completed);
1015d856f32aSBenjamin LaHaise }
1016d856f32aSBenjamin LaHaise 
1017d856f32aSBenjamin LaHaise /* user_refill_reqs_available
1018d856f32aSBenjamin LaHaise  *	Called to refill reqs_available when aio_get_req() encounters an
1019d856f32aSBenjamin LaHaise  *	out of space in the completion ring.
1020d856f32aSBenjamin LaHaise  */
user_refill_reqs_available(struct kioctx * ctx)1021d856f32aSBenjamin LaHaise static void user_refill_reqs_available(struct kioctx *ctx)
1022d856f32aSBenjamin LaHaise {
1023d856f32aSBenjamin LaHaise 	spin_lock_irq(&ctx->completion_lock);
1024d856f32aSBenjamin LaHaise 	if (ctx->completed_events) {
1025d856f32aSBenjamin LaHaise 		struct aio_ring *ring;
1026d856f32aSBenjamin LaHaise 		unsigned head;
1027d856f32aSBenjamin LaHaise 
1028d856f32aSBenjamin LaHaise 		/* Access of ring->head may race with aio_read_events_ring()
1029d856f32aSBenjamin LaHaise 		 * here, but that's okay since whether we read the old version
1030d856f32aSBenjamin LaHaise 		 * or the new version, and either will be valid.  The important
1031d856f32aSBenjamin LaHaise 		 * part is that head cannot pass tail since we prevent
1032d856f32aSBenjamin LaHaise 		 * aio_complete() from updating tail by holding
1033d856f32aSBenjamin LaHaise 		 * ctx->completion_lock.  Even if head is invalid, the check
1034d856f32aSBenjamin LaHaise 		 * against ctx->completed_events below will make sure we do the
1035d856f32aSBenjamin LaHaise 		 * safe/right thing.
1036d856f32aSBenjamin LaHaise 		 */
10375c075c5bSFabio M. De Francesco 		ring = page_address(ctx->ring_pages[0]);
1038d856f32aSBenjamin LaHaise 		head = ring->head;
1039d856f32aSBenjamin LaHaise 
1040d856f32aSBenjamin LaHaise 		refill_reqs_available(ctx, head, ctx->tail);
1041d856f32aSBenjamin LaHaise 	}
1042d856f32aSBenjamin LaHaise 
1043d856f32aSBenjamin LaHaise 	spin_unlock_irq(&ctx->completion_lock);
1044d856f32aSBenjamin LaHaise }
1045d856f32aSBenjamin LaHaise 
get_reqs_available(struct kioctx * ctx)1046432c7997SChristoph Hellwig static bool get_reqs_available(struct kioctx *ctx)
1047432c7997SChristoph Hellwig {
1048432c7997SChristoph Hellwig 	if (__get_reqs_available(ctx))
1049432c7997SChristoph Hellwig 		return true;
1050432c7997SChristoph Hellwig 	user_refill_reqs_available(ctx);
1051432c7997SChristoph Hellwig 	return __get_reqs_available(ctx);
1052432c7997SChristoph Hellwig }
1053432c7997SChristoph Hellwig 
10541da177e4SLinus Torvalds /* aio_get_req
105557282d8fSKent Overstreet  *	Allocate a slot for an aio request.
105657282d8fSKent Overstreet  * Returns NULL if no requests are free.
1057b53119f1SLinus Torvalds  *
1058b53119f1SLinus Torvalds  * The refcount is initialized to 2 - one for the async op completion,
1059b53119f1SLinus Torvalds  * one for the synchronous code that does this.
10601da177e4SLinus Torvalds  */
aio_get_req(struct kioctx * ctx)106104b2fa9fSChristoph Hellwig static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
10621da177e4SLinus Torvalds {
106304b2fa9fSChristoph Hellwig 	struct aio_kiocb *req;
1064a1c8eae7SKent Overstreet 
10652bc4ca9bSJens Axboe 	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
10661da177e4SLinus Torvalds 	if (unlikely(!req))
1067432c7997SChristoph Hellwig 		return NULL;
10681da177e4SLinus Torvalds 
1069fa0ca2aeSAl Viro 	if (unlikely(!get_reqs_available(ctx))) {
10706af1c849SWei Yongjun 		kmem_cache_free(kiocb_cachep, req);
1071fa0ca2aeSAl Viro 		return NULL;
1072fa0ca2aeSAl Viro 	}
1073fa0ca2aeSAl Viro 
1074e34ecee2SKent Overstreet 	percpu_ref_get(&ctx->reqs);
10752bc4ca9bSJens Axboe 	req->ki_ctx = ctx;
107675321b50SChristoph Hellwig 	INIT_LIST_HEAD(&req->ki_list);
1077b53119f1SLinus Torvalds 	refcount_set(&req->ki_refcnt, 2);
10782bc4ca9bSJens Axboe 	req->ki_eventfd = NULL;
10791da177e4SLinus Torvalds 	return req;
10801da177e4SLinus Torvalds }
10811da177e4SLinus Torvalds 
lookup_ioctx(unsigned long ctx_id)1082d5470b59SAdrian Bunk static struct kioctx *lookup_ioctx(unsigned long ctx_id)
10831da177e4SLinus Torvalds {
1084db446a08SBenjamin LaHaise 	struct aio_ring __user *ring  = (void __user *)ctx_id;
1085abf137ddSJens Axboe 	struct mm_struct *mm = current->mm;
108665c24491SJeff Moyer 	struct kioctx *ctx, *ret = NULL;
1087db446a08SBenjamin LaHaise 	struct kioctx_table *table;
1088db446a08SBenjamin LaHaise 	unsigned id;
1089db446a08SBenjamin LaHaise 
1090db446a08SBenjamin LaHaise 	if (get_user(id, &ring->id))
1091db446a08SBenjamin LaHaise 		return NULL;
10921da177e4SLinus Torvalds 
1093abf137ddSJens Axboe 	rcu_read_lock();
1094db446a08SBenjamin LaHaise 	table = rcu_dereference(mm->ioctx_table);
1095abf137ddSJens Axboe 
1096db446a08SBenjamin LaHaise 	if (!table || id >= table->nr)
1097db446a08SBenjamin LaHaise 		goto out;
1098db446a08SBenjamin LaHaise 
1099a538e3ffSJeff Moyer 	id = array_index_nospec(id, table->nr);
1100d0264c01STejun Heo 	ctx = rcu_dereference(table->table[id]);
1101f30d704fSBenjamin LaHaise 	if (ctx && ctx->user_id == ctx_id) {
1102baf10564SAl Viro 		if (percpu_ref_tryget_live(&ctx->users))
110365c24491SJeff Moyer 			ret = ctx;
11041da177e4SLinus Torvalds 	}
1105db446a08SBenjamin LaHaise out:
1106abf137ddSJens Axboe 	rcu_read_unlock();
110765c24491SJeff Moyer 	return ret;
11081da177e4SLinus Torvalds }
11091da177e4SLinus Torvalds 
iocb_destroy(struct aio_kiocb * iocb)1110b53119f1SLinus Torvalds static inline void iocb_destroy(struct aio_kiocb *iocb)
11119018ccc4SChristoph Hellwig {
111274259703SAl Viro 	if (iocb->ki_eventfd)
111374259703SAl Viro 		eventfd_ctx_put(iocb->ki_eventfd);
111484c4e1f8SLinus Torvalds 	if (iocb->ki_filp)
111584c4e1f8SLinus Torvalds 		fput(iocb->ki_filp);
11169018ccc4SChristoph Hellwig 	percpu_ref_put(&iocb->ki_ctx->reqs);
11179018ccc4SChristoph Hellwig 	kmem_cache_free(kiocb_cachep, iocb);
11189018ccc4SChristoph Hellwig }
1119b53119f1SLinus Torvalds 
11201da177e4SLinus Torvalds /* aio_complete
11211da177e4SLinus Torvalds  *	Called when the io request on the given iocb is complete.
11221da177e4SLinus Torvalds  */
aio_complete(struct aio_kiocb * iocb)11232bb874c0SAl Viro static void aio_complete(struct aio_kiocb *iocb)
11241da177e4SLinus Torvalds {
11251da177e4SLinus Torvalds 	struct kioctx	*ctx = iocb->ki_ctx;
11261da177e4SLinus Torvalds 	struct aio_ring	*ring;
112721b40200SKent Overstreet 	struct io_event	*ev_page, *event;
1128d856f32aSBenjamin LaHaise 	unsigned tail, pos, head;
11291da177e4SLinus Torvalds 	unsigned long	flags;
11301da177e4SLinus Torvalds 
11311da177e4SLinus Torvalds 	/*
11320460fef2SKent Overstreet 	 * Add a completion event to the ring buffer. Must be done holding
11334b30f07eSTang Chen 	 * ctx->completion_lock to prevent other code from messing with the tail
11340460fef2SKent Overstreet 	 * pointer since we might be called from irq context.
11350460fef2SKent Overstreet 	 */
11360460fef2SKent Overstreet 	spin_lock_irqsave(&ctx->completion_lock, flags);
11370460fef2SKent Overstreet 
113858c85dc2SKent Overstreet 	tail = ctx->tail;
113921b40200SKent Overstreet 	pos = tail + AIO_EVENTS_OFFSET;
114021b40200SKent Overstreet 
114158c85dc2SKent Overstreet 	if (++tail >= ctx->nr_events)
11424bf69b2aSKen Chen 		tail = 0;
11431da177e4SLinus Torvalds 
11445c075c5bSFabio M. De Francesco 	ev_page = page_address(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
114521b40200SKent Overstreet 	event = ev_page + pos % AIO_EVENTS_PER_PAGE;
114621b40200SKent Overstreet 
1147a9339b78SAl Viro 	*event = iocb->ki_res;
11481da177e4SLinus Torvalds 
114958c85dc2SKent Overstreet 	flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
115021b40200SKent Overstreet 
1151a9339b78SAl Viro 	pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb,
1152a9339b78SAl Viro 		 (void __user *)(unsigned long)iocb->ki_res.obj,
1153a9339b78SAl Viro 		 iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2);
11541da177e4SLinus Torvalds 
11551da177e4SLinus Torvalds 	/* after flagging the request as done, we
11561da177e4SLinus Torvalds 	 * must never even look at it again
11571da177e4SLinus Torvalds 	 */
11581da177e4SLinus Torvalds 	smp_wmb();	/* make event visible before updating tail */
11591da177e4SLinus Torvalds 
116058c85dc2SKent Overstreet 	ctx->tail = tail;
116121b40200SKent Overstreet 
11625c075c5bSFabio M. De Francesco 	ring = page_address(ctx->ring_pages[0]);
1163d856f32aSBenjamin LaHaise 	head = ring->head;
11641da177e4SLinus Torvalds 	ring->tail = tail;
116558c85dc2SKent Overstreet 	flush_dcache_page(ctx->ring_pages[0]);
11661da177e4SLinus Torvalds 
1167d856f32aSBenjamin LaHaise 	ctx->completed_events++;
1168d856f32aSBenjamin LaHaise 	if (ctx->completed_events > 1)
1169d856f32aSBenjamin LaHaise 		refill_reqs_available(ctx, head, tail);
11700460fef2SKent Overstreet 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
11710460fef2SKent Overstreet 
117221b40200SKent Overstreet 	pr_debug("added to ring %p at [%u]\n", iocb, tail);
11738d1c98b0SDavide Libenzi 
11748d1c98b0SDavide Libenzi 	/*
11758d1c98b0SDavide Libenzi 	 * Check if the user asked us to deliver the result through an
11768d1c98b0SDavide Libenzi 	 * eventfd. The eventfd_signal() function is safe to be called
11778d1c98b0SDavide Libenzi 	 * from IRQ context.
11788d1c98b0SDavide Libenzi 	 */
117974259703SAl Viro 	if (iocb->ki_eventfd)
11808d1c98b0SDavide Libenzi 		eventfd_signal(iocb->ki_eventfd, 1);
11818d1c98b0SDavide Libenzi 
11826cb2a210SQuentin Barnes 	/*
11836cb2a210SQuentin Barnes 	 * We have to order our ring_info tail store above and test
11846cb2a210SQuentin Barnes 	 * of the wait list below outside the wait lock.  This is
11856cb2a210SQuentin Barnes 	 * like in wake_up_bit() where clearing a bit has to be
11866cb2a210SQuentin Barnes 	 * ordered with the unlocked test.
11876cb2a210SQuentin Barnes 	 */
11886cb2a210SQuentin Barnes 	smp_mb();
11896cb2a210SQuentin Barnes 
11901da177e4SLinus Torvalds 	if (waitqueue_active(&ctx->wait))
11911da177e4SLinus Torvalds 		wake_up(&ctx->wait);
11922bb874c0SAl Viro }
11932bb874c0SAl Viro 
iocb_put(struct aio_kiocb * iocb)11942bb874c0SAl Viro static inline void iocb_put(struct aio_kiocb *iocb)
11952bb874c0SAl Viro {
11962bb874c0SAl Viro 	if (refcount_dec_and_test(&iocb->ki_refcnt)) {
11972bb874c0SAl Viro 		aio_complete(iocb);
11982bb874c0SAl Viro 		iocb_destroy(iocb);
11992bb874c0SAl Viro 	}
12001da177e4SLinus Torvalds }
12011da177e4SLinus Torvalds 
12022be4e7deSGu Zheng /* aio_read_events_ring
12031da177e4SLinus Torvalds  *	Pull an event off of the ioctx's event ring.  Returns the number of
1204a31ad380SKent Overstreet  *	events fetched
12051da177e4SLinus Torvalds  */
aio_read_events_ring(struct kioctx * ctx,struct io_event __user * event,long nr)1206a31ad380SKent Overstreet static long aio_read_events_ring(struct kioctx *ctx,
1207a31ad380SKent Overstreet 				 struct io_event __user *event, long nr)
12081da177e4SLinus Torvalds {
12091da177e4SLinus Torvalds 	struct aio_ring *ring;
12105ffac122SKent Overstreet 	unsigned head, tail, pos;
1211a31ad380SKent Overstreet 	long ret = 0;
1212a31ad380SKent Overstreet 	int copy_ret;
1213a31ad380SKent Overstreet 
12149c9ce763SDave Chinner 	/*
12159c9ce763SDave Chinner 	 * The mutex can block and wake us up and that will cause
12169c9ce763SDave Chinner 	 * wait_event_interruptible_hrtimeout() to schedule without sleeping
12179c9ce763SDave Chinner 	 * and repeat. This should be rare enough that it doesn't cause
12189c9ce763SDave Chinner 	 * peformance issues. See the comment in read_events() for more detail.
12199c9ce763SDave Chinner 	 */
12209c9ce763SDave Chinner 	sched_annotate_sleep();
122158c85dc2SKent Overstreet 	mutex_lock(&ctx->ring_lock);
12221da177e4SLinus Torvalds 
1223fa8a53c3SBenjamin LaHaise 	/* Access to ->ring_pages here is protected by ctx->ring_lock. */
12245c075c5bSFabio M. De Francesco 	ring = page_address(ctx->ring_pages[0]);
1225a31ad380SKent Overstreet 	head = ring->head;
12265ffac122SKent Overstreet 	tail = ring->tail;
12271da177e4SLinus Torvalds 
12282ff396beSJeff Moyer 	/*
12292ff396beSJeff Moyer 	 * Ensure that once we've read the current tail pointer, that
12302ff396beSJeff Moyer 	 * we also see the events that were stored up to the tail.
12312ff396beSJeff Moyer 	 */
12322ff396beSJeff Moyer 	smp_rmb();
12332ff396beSJeff Moyer 
12345ffac122SKent Overstreet 	pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
1235a31ad380SKent Overstreet 
12365ffac122SKent Overstreet 	if (head == tail)
12371da177e4SLinus Torvalds 		goto out;
12381da177e4SLinus Torvalds 
1239edfbbf38SBenjamin LaHaise 	head %= ctx->nr_events;
1240edfbbf38SBenjamin LaHaise 	tail %= ctx->nr_events;
1241edfbbf38SBenjamin LaHaise 
1242a31ad380SKent Overstreet 	while (ret < nr) {
1243a31ad380SKent Overstreet 		long avail;
1244a31ad380SKent Overstreet 		struct io_event *ev;
1245a31ad380SKent Overstreet 		struct page *page;
12461da177e4SLinus Torvalds 
12475ffac122SKent Overstreet 		avail = (head <= tail ?  tail : ctx->nr_events) - head;
12485ffac122SKent Overstreet 		if (head == tail)
1249a31ad380SKent Overstreet 			break;
1250a31ad380SKent Overstreet 
1251a31ad380SKent Overstreet 		pos = head + AIO_EVENTS_OFFSET;
125258c85dc2SKent Overstreet 		page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
1253a31ad380SKent Overstreet 		pos %= AIO_EVENTS_PER_PAGE;
1254a31ad380SKent Overstreet 
1255d2988bd4SAl Viro 		avail = min(avail, nr - ret);
1256d2988bd4SAl Viro 		avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos);
1257d2988bd4SAl Viro 
12585c075c5bSFabio M. De Francesco 		ev = page_address(page);
1259a31ad380SKent Overstreet 		copy_ret = copy_to_user(event + ret, ev + pos,
1260a31ad380SKent Overstreet 					sizeof(*ev) * avail);
1261a31ad380SKent Overstreet 
1262a31ad380SKent Overstreet 		if (unlikely(copy_ret)) {
1263a31ad380SKent Overstreet 			ret = -EFAULT;
1264a31ad380SKent Overstreet 			goto out;
12651da177e4SLinus Torvalds 		}
12661da177e4SLinus Torvalds 
1267a31ad380SKent Overstreet 		ret += avail;
1268a31ad380SKent Overstreet 		head += avail;
126958c85dc2SKent Overstreet 		head %= ctx->nr_events;
1270a31ad380SKent Overstreet 	}
1271a31ad380SKent Overstreet 
12725c075c5bSFabio M. De Francesco 	ring = page_address(ctx->ring_pages[0]);
1273a31ad380SKent Overstreet 	ring->head = head;
127458c85dc2SKent Overstreet 	flush_dcache_page(ctx->ring_pages[0]);
1275a31ad380SKent Overstreet 
12765ffac122SKent Overstreet 	pr_debug("%li  h%u t%u\n", ret, head, tail);
1277a31ad380SKent Overstreet out:
127858c85dc2SKent Overstreet 	mutex_unlock(&ctx->ring_lock);
1279a31ad380SKent Overstreet 
12801da177e4SLinus Torvalds 	return ret;
12811da177e4SLinus Torvalds }
12821da177e4SLinus Torvalds 
aio_read_events(struct kioctx * ctx,long min_nr,long nr,struct io_event __user * event,long * i)1283a31ad380SKent Overstreet static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
1284a31ad380SKent Overstreet 			    struct io_event __user *event, long *i)
12851da177e4SLinus Torvalds {
1286a31ad380SKent Overstreet 	long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
12871da177e4SLinus Torvalds 
1288a31ad380SKent Overstreet 	if (ret > 0)
1289a31ad380SKent Overstreet 		*i += ret;
1290a31ad380SKent Overstreet 
1291a31ad380SKent Overstreet 	if (unlikely(atomic_read(&ctx->dead)))
1292a31ad380SKent Overstreet 		ret = -EINVAL;
1293a31ad380SKent Overstreet 
1294a31ad380SKent Overstreet 	if (!*i)
1295a31ad380SKent Overstreet 		*i = ret;
1296a31ad380SKent Overstreet 
1297a31ad380SKent Overstreet 	return ret < 0 || *i >= min_nr;
12981da177e4SLinus Torvalds }
12991da177e4SLinus Torvalds 
read_events(struct kioctx * ctx,long min_nr,long nr,struct io_event __user * event,ktime_t until)1300a31ad380SKent Overstreet static long read_events(struct kioctx *ctx, long min_nr, long nr,
13011da177e4SLinus Torvalds 			struct io_event __user *event,
1302fa2e62a5SDeepa Dinamani 			ktime_t until)
13031da177e4SLinus Torvalds {
1304a31ad380SKent Overstreet 	long ret = 0;
13051da177e4SLinus Torvalds 
1306a31ad380SKent Overstreet 	/*
1307a31ad380SKent Overstreet 	 * Note that aio_read_events() is being called as the conditional - i.e.
1308a31ad380SKent Overstreet 	 * we're calling it after prepare_to_wait() has set task state to
1309a31ad380SKent Overstreet 	 * TASK_INTERRUPTIBLE.
1310a31ad380SKent Overstreet 	 *
1311a31ad380SKent Overstreet 	 * But aio_read_events() can block, and if it blocks it's going to flip
1312a31ad380SKent Overstreet 	 * the task state back to TASK_RUNNING.
1313a31ad380SKent Overstreet 	 *
1314a31ad380SKent Overstreet 	 * This should be ok, provided it doesn't flip the state back to
1315a31ad380SKent Overstreet 	 * TASK_RUNNING and return 0 too much - that causes us to spin. That
1316a31ad380SKent Overstreet 	 * will only happen if the mutex_lock() call blocks, and we then find
1317a31ad380SKent Overstreet 	 * the ringbuffer empty. So in practice we should be ok, but it's
1318a31ad380SKent Overstreet 	 * something to be aware of when touching this code.
1319a31ad380SKent Overstreet 	 */
13202456e855SThomas Gleixner 	if (until == 0)
13215f785de5SFam Zheng 		aio_read_events(ctx, min_nr, nr, event, &ret);
13225f785de5SFam Zheng 	else
1323a31ad380SKent Overstreet 		wait_event_interruptible_hrtimeout(ctx->wait,
13245f785de5SFam Zheng 				aio_read_events(ctx, min_nr, nr, event, &ret),
13255f785de5SFam Zheng 				until);
1326a31ad380SKent Overstreet 	return ret;
13271da177e4SLinus Torvalds }
13281da177e4SLinus Torvalds 
13291da177e4SLinus Torvalds /* sys_io_setup:
13301da177e4SLinus Torvalds  *	Create an aio_context capable of receiving at least nr_events.
13311da177e4SLinus Torvalds  *	ctxp must not point to an aio_context that already exists, and
13321da177e4SLinus Torvalds  *	must be initialized to 0 prior to the call.  On successful
13331da177e4SLinus Torvalds  *	creation of the aio_context, *ctxp is filled in with the resulting
13341da177e4SLinus Torvalds  *	handle.  May fail with -EINVAL if *ctxp is not initialized,
13351da177e4SLinus Torvalds  *	if the specified nr_events exceeds internal limits.  May fail
13361da177e4SLinus Torvalds  *	with -EAGAIN if the specified nr_events exceeds the user's limit
13371da177e4SLinus Torvalds  *	of available events.  May fail with -ENOMEM if insufficient kernel
13381da177e4SLinus Torvalds  *	resources are available.  May fail with -EFAULT if an invalid
13391da177e4SLinus Torvalds  *	pointer is passed for ctxp.  Will fail with -ENOSYS if not
13401da177e4SLinus Torvalds  *	implemented.
13411da177e4SLinus Torvalds  */
SYSCALL_DEFINE2(io_setup,unsigned,nr_events,aio_context_t __user *,ctxp)1342002c8976SHeiko Carstens SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
13431da177e4SLinus Torvalds {
13441da177e4SLinus Torvalds 	struct kioctx *ioctx = NULL;
13451da177e4SLinus Torvalds 	unsigned long ctx;
13461da177e4SLinus Torvalds 	long ret;
13471da177e4SLinus Torvalds 
13481da177e4SLinus Torvalds 	ret = get_user(ctx, ctxp);
13491da177e4SLinus Torvalds 	if (unlikely(ret))
13501da177e4SLinus Torvalds 		goto out;
13511da177e4SLinus Torvalds 
13521da177e4SLinus Torvalds 	ret = -EINVAL;
1353d55b5fdaSZach Brown 	if (unlikely(ctx || nr_events == 0)) {
1354acd88d4eSKinglong Mee 		pr_debug("EINVAL: ctx %lu nr_events %u\n",
1355d55b5fdaSZach Brown 		         ctx, nr_events);
13561da177e4SLinus Torvalds 		goto out;
13571da177e4SLinus Torvalds 	}
13581da177e4SLinus Torvalds 
13591da177e4SLinus Torvalds 	ioctx = ioctx_alloc(nr_events);
13601da177e4SLinus Torvalds 	ret = PTR_ERR(ioctx);
13611da177e4SLinus Torvalds 	if (!IS_ERR(ioctx)) {
13621da177e4SLinus Torvalds 		ret = put_user(ioctx->user_id, ctxp);
1363a2e1859aSAl Viro 		if (ret)
1364e02ba72aSAnatol Pomozov 			kill_ioctx(current->mm, ioctx, NULL);
1365723be6e3SKent Overstreet 		percpu_ref_put(&ioctx->users);
13661da177e4SLinus Torvalds 	}
13671da177e4SLinus Torvalds 
13681da177e4SLinus Torvalds out:
13691da177e4SLinus Torvalds 	return ret;
13701da177e4SLinus Torvalds }
13711da177e4SLinus Torvalds 
1372c00d2c7eSAl Viro #ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE2(io_setup,unsigned,nr_events,u32 __user *,ctx32p)1373c00d2c7eSAl Viro COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p)
1374c00d2c7eSAl Viro {
1375c00d2c7eSAl Viro 	struct kioctx *ioctx = NULL;
1376c00d2c7eSAl Viro 	unsigned long ctx;
1377c00d2c7eSAl Viro 	long ret;
1378c00d2c7eSAl Viro 
1379c00d2c7eSAl Viro 	ret = get_user(ctx, ctx32p);
1380c00d2c7eSAl Viro 	if (unlikely(ret))
1381c00d2c7eSAl Viro 		goto out;
1382c00d2c7eSAl Viro 
1383c00d2c7eSAl Viro 	ret = -EINVAL;
1384c00d2c7eSAl Viro 	if (unlikely(ctx || nr_events == 0)) {
1385c00d2c7eSAl Viro 		pr_debug("EINVAL: ctx %lu nr_events %u\n",
1386c00d2c7eSAl Viro 		         ctx, nr_events);
1387c00d2c7eSAl Viro 		goto out;
1388c00d2c7eSAl Viro 	}
1389c00d2c7eSAl Viro 
1390c00d2c7eSAl Viro 	ioctx = ioctx_alloc(nr_events);
1391c00d2c7eSAl Viro 	ret = PTR_ERR(ioctx);
1392c00d2c7eSAl Viro 	if (!IS_ERR(ioctx)) {
1393c00d2c7eSAl Viro 		/* truncating is ok because it's a user address */
1394c00d2c7eSAl Viro 		ret = put_user((u32)ioctx->user_id, ctx32p);
1395c00d2c7eSAl Viro 		if (ret)
1396c00d2c7eSAl Viro 			kill_ioctx(current->mm, ioctx, NULL);
1397c00d2c7eSAl Viro 		percpu_ref_put(&ioctx->users);
1398c00d2c7eSAl Viro 	}
1399c00d2c7eSAl Viro 
1400c00d2c7eSAl Viro out:
1401c00d2c7eSAl Viro 	return ret;
1402c00d2c7eSAl Viro }
1403c00d2c7eSAl Viro #endif
1404c00d2c7eSAl Viro 
14051da177e4SLinus Torvalds /* sys_io_destroy:
14061da177e4SLinus Torvalds  *	Destroy the aio_context specified.  May cancel any outstanding
14071da177e4SLinus Torvalds  *	AIOs and block on completion.  Will fail with -ENOSYS if not
1408642b5123SSatoru Takeuchi  *	implemented.  May fail with -EINVAL if the context pointed to
14091da177e4SLinus Torvalds  *	is invalid.
14101da177e4SLinus Torvalds  */
SYSCALL_DEFINE1(io_destroy,aio_context_t,ctx)1411002c8976SHeiko Carstens SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
14121da177e4SLinus Torvalds {
14131da177e4SLinus Torvalds 	struct kioctx *ioctx = lookup_ioctx(ctx);
14141da177e4SLinus Torvalds 	if (likely(NULL != ioctx)) {
1415dc48e56dSJens Axboe 		struct ctx_rq_wait wait;
1416fb2d4483SBenjamin LaHaise 		int ret;
1417e02ba72aSAnatol Pomozov 
1418dc48e56dSJens Axboe 		init_completion(&wait.comp);
1419dc48e56dSJens Axboe 		atomic_set(&wait.count, 1);
1420dc48e56dSJens Axboe 
1421e02ba72aSAnatol Pomozov 		/* Pass requests_done to kill_ioctx() where it can be set
1422e02ba72aSAnatol Pomozov 		 * in a thread-safe way. If we try to set it here then we have
1423e02ba72aSAnatol Pomozov 		 * a race condition if two io_destroy() called simultaneously.
1424e02ba72aSAnatol Pomozov 		 */
1425dc48e56dSJens Axboe 		ret = kill_ioctx(current->mm, ioctx, &wait);
1426723be6e3SKent Overstreet 		percpu_ref_put(&ioctx->users);
1427e02ba72aSAnatol Pomozov 
1428e02ba72aSAnatol Pomozov 		/* Wait until all IO for the context are done. Otherwise kernel
1429e02ba72aSAnatol Pomozov 		 * keep using user-space buffers even if user thinks the context
1430e02ba72aSAnatol Pomozov 		 * is destroyed.
1431e02ba72aSAnatol Pomozov 		 */
1432fb2d4483SBenjamin LaHaise 		if (!ret)
1433dc48e56dSJens Axboe 			wait_for_completion(&wait.comp);
1434e02ba72aSAnatol Pomozov 
1435fb2d4483SBenjamin LaHaise 		return ret;
14361da177e4SLinus Torvalds 	}
1437acd88d4eSKinglong Mee 	pr_debug("EINVAL: invalid context id\n");
14381da177e4SLinus Torvalds 	return -EINVAL;
14391da177e4SLinus Torvalds }
14401da177e4SLinus Torvalds 
aio_remove_iocb(struct aio_kiocb * iocb)14413c96c7f4SAl Viro static void aio_remove_iocb(struct aio_kiocb *iocb)
14423c96c7f4SAl Viro {
14433c96c7f4SAl Viro 	struct kioctx *ctx = iocb->ki_ctx;
14443c96c7f4SAl Viro 	unsigned long flags;
14453c96c7f4SAl Viro 
14463c96c7f4SAl Viro 	spin_lock_irqsave(&ctx->ctx_lock, flags);
14473c96c7f4SAl Viro 	list_del(&iocb->ki_list);
14483c96c7f4SAl Viro 	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
14493c96c7f4SAl Viro }
14503c96c7f4SAl Viro 
aio_complete_rw(struct kiocb * kiocb,long res)14516b19b766SJens Axboe static void aio_complete_rw(struct kiocb *kiocb, long res)
145254843f87SChristoph Hellwig {
145354843f87SChristoph Hellwig 	struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);
145454843f87SChristoph Hellwig 
14553c96c7f4SAl Viro 	if (!list_empty_careful(&iocb->ki_list))
14563c96c7f4SAl Viro 		aio_remove_iocb(iocb);
14573c96c7f4SAl Viro 
145854843f87SChristoph Hellwig 	if (kiocb->ki_flags & IOCB_WRITE) {
145954843f87SChristoph Hellwig 		struct inode *inode = file_inode(kiocb->ki_filp);
146054843f87SChristoph Hellwig 
146154843f87SChristoph Hellwig 		if (S_ISREG(inode->i_mode))
14628c3cfa80SAmir Goldstein 			kiocb_end_write(kiocb);
146354843f87SChristoph Hellwig 	}
146454843f87SChristoph Hellwig 
14652bb874c0SAl Viro 	iocb->ki_res.res = res;
14666b19b766SJens Axboe 	iocb->ki_res.res2 = 0;
14672bb874c0SAl Viro 	iocb_put(iocb);
146854843f87SChristoph Hellwig }
146954843f87SChristoph Hellwig 
aio_prep_rw(struct kiocb * req,const struct iocb * iocb)147088a6f18bSJens Axboe static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
147154843f87SChristoph Hellwig {
147254843f87SChristoph Hellwig 	int ret;
147354843f87SChristoph Hellwig 
147454843f87SChristoph Hellwig 	req->ki_complete = aio_complete_rw;
1475ec51f8eeSMike Marshall 	req->private = NULL;
147654843f87SChristoph Hellwig 	req->ki_pos = iocb->aio_offset;
1477e7e23fc5SBart Van Assche 	req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW;
147854843f87SChristoph Hellwig 	if (iocb->aio_flags & IOCB_FLAG_RESFD)
147954843f87SChristoph Hellwig 		req->ki_flags |= IOCB_EVENTFD;
1480d9a08a9eSAdam Manzanares 	if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
1481d9a08a9eSAdam Manzanares 		/*
1482d9a08a9eSAdam Manzanares 		 * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then
1483d9a08a9eSAdam Manzanares 		 * aio_reqprio is interpreted as an I/O scheduling
1484d9a08a9eSAdam Manzanares 		 * class and priority.
1485d9a08a9eSAdam Manzanares 		 */
1486d9a08a9eSAdam Manzanares 		ret = ioprio_check_cap(iocb->aio_reqprio);
1487d9a08a9eSAdam Manzanares 		if (ret) {
14889a6d9a62SAdam Manzanares 			pr_debug("aio ioprio check cap error: %d\n", ret);
148984c4e1f8SLinus Torvalds 			return ret;
1490d9a08a9eSAdam Manzanares 		}
1491d9a08a9eSAdam Manzanares 
1492d9a08a9eSAdam Manzanares 		req->ki_ioprio = iocb->aio_reqprio;
1493d9a08a9eSAdam Manzanares 	} else
149476dc8913SDamien Le Moal 		req->ki_ioprio = get_current_ioprio();
1495d9a08a9eSAdam Manzanares 
149654843f87SChristoph Hellwig 	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
149754843f87SChristoph Hellwig 	if (unlikely(ret))
149884c4e1f8SLinus Torvalds 		return ret;
1499154989e4SChristoph Hellwig 
1500154989e4SChristoph Hellwig 	req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */
1501154989e4SChristoph Hellwig 	return 0;
150254843f87SChristoph Hellwig }
150354843f87SChristoph Hellwig 
aio_setup_rw(int rw,const struct iocb * iocb,struct iovec ** iovec,bool vectored,bool compat,struct iov_iter * iter)150487e5e6daSJens Axboe static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
150587e5e6daSJens Axboe 		struct iovec **iovec, bool vectored, bool compat,
150687e5e6daSJens Axboe 		struct iov_iter *iter)
1507eed4e51fSBadari Pulavarty {
150889319d31SChristoph Hellwig 	void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf;
150989319d31SChristoph Hellwig 	size_t len = iocb->aio_nbytes;
1510eed4e51fSBadari Pulavarty 
151189319d31SChristoph Hellwig 	if (!vectored) {
151289319d31SChristoph Hellwig 		ssize_t ret = import_single_range(rw, buf, len, *iovec, iter);
151389319d31SChristoph Hellwig 		*iovec = NULL;
151441ef4eb8SKent Overstreet 		return ret;
15158bc92afcSKent Overstreet 	}
151689cd35c5SChristoph Hellwig 
151789cd35c5SChristoph Hellwig 	return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat);
15181da177e4SLinus Torvalds }
15191da177e4SLinus Torvalds 
aio_rw_done(struct kiocb * req,ssize_t ret)15209061d14aSAl Viro static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
152189319d31SChristoph Hellwig {
152289319d31SChristoph Hellwig 	switch (ret) {
152389319d31SChristoph Hellwig 	case -EIOCBQUEUED:
15249061d14aSAl Viro 		break;
152589319d31SChristoph Hellwig 	case -ERESTARTSYS:
152689319d31SChristoph Hellwig 	case -ERESTARTNOINTR:
152789319d31SChristoph Hellwig 	case -ERESTARTNOHAND:
152889319d31SChristoph Hellwig 	case -ERESTART_RESTARTBLOCK:
152941ef4eb8SKent Overstreet 		/*
153041ef4eb8SKent Overstreet 		 * There's no easy way to restart the syscall since other AIO's
153141ef4eb8SKent Overstreet 		 * may be already running. Just fail this IO with EINTR.
153241ef4eb8SKent Overstreet 		 */
153341ef4eb8SKent Overstreet 		ret = -EINTR;
1534df561f66SGustavo A. R. Silva 		fallthrough;
153589319d31SChristoph Hellwig 	default:
15366b19b766SJens Axboe 		req->ki_complete(req, ret);
153789319d31SChristoph Hellwig 	}
153841ef4eb8SKent Overstreet }
15391da177e4SLinus Torvalds 
aio_read(struct kiocb * req,const struct iocb * iocb,bool vectored,bool compat)1540958c13ceSAl Viro static int aio_read(struct kiocb *req, const struct iocb *iocb,
154188a6f18bSJens Axboe 			bool vectored, bool compat)
154289319d31SChristoph Hellwig {
154389319d31SChristoph Hellwig 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
154489319d31SChristoph Hellwig 	struct iov_iter iter;
154554843f87SChristoph Hellwig 	struct file *file;
1546958c13ceSAl Viro 	int ret;
154789319d31SChristoph Hellwig 
154854843f87SChristoph Hellwig 	ret = aio_prep_rw(req, iocb);
154954843f87SChristoph Hellwig 	if (ret)
155054843f87SChristoph Hellwig 		return ret;
155154843f87SChristoph Hellwig 	file = req->ki_filp;
155289319d31SChristoph Hellwig 	if (unlikely(!(file->f_mode & FMODE_READ)))
155384c4e1f8SLinus Torvalds 		return -EBADF;
155489319d31SChristoph Hellwig 	if (unlikely(!file->f_op->read_iter))
155584c4e1f8SLinus Torvalds 		return -EINVAL;
155689319d31SChristoph Hellwig 
1557de4eda9dSAl Viro 	ret = aio_setup_rw(ITER_DEST, iocb, &iovec, vectored, compat, &iter);
155887e5e6daSJens Axboe 	if (ret < 0)
155984c4e1f8SLinus Torvalds 		return ret;
156089319d31SChristoph Hellwig 	ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
156189319d31SChristoph Hellwig 	if (!ret)
15629061d14aSAl Viro 		aio_rw_done(req, call_read_iter(file, req, &iter));
156389319d31SChristoph Hellwig 	kfree(iovec);
156489319d31SChristoph Hellwig 	return ret;
156589319d31SChristoph Hellwig }
156689319d31SChristoph Hellwig 
aio_write(struct kiocb * req,const struct iocb * iocb,bool vectored,bool compat)1567958c13ceSAl Viro static int aio_write(struct kiocb *req, const struct iocb *iocb,
156888a6f18bSJens Axboe 			 bool vectored, bool compat)
156989319d31SChristoph Hellwig {
157089319d31SChristoph Hellwig 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
157189319d31SChristoph Hellwig 	struct iov_iter iter;
157254843f87SChristoph Hellwig 	struct file *file;
1573958c13ceSAl Viro 	int ret;
157489319d31SChristoph Hellwig 
157554843f87SChristoph Hellwig 	ret = aio_prep_rw(req, iocb);
157654843f87SChristoph Hellwig 	if (ret)
157754843f87SChristoph Hellwig 		return ret;
157854843f87SChristoph Hellwig 	file = req->ki_filp;
157954843f87SChristoph Hellwig 
158089319d31SChristoph Hellwig 	if (unlikely(!(file->f_mode & FMODE_WRITE)))
158184c4e1f8SLinus Torvalds 		return -EBADF;
158289319d31SChristoph Hellwig 	if (unlikely(!file->f_op->write_iter))
158384c4e1f8SLinus Torvalds 		return -EINVAL;
158489319d31SChristoph Hellwig 
1585de4eda9dSAl Viro 	ret = aio_setup_rw(ITER_SOURCE, iocb, &iovec, vectored, compat, &iter);
158687e5e6daSJens Axboe 	if (ret < 0)
158784c4e1f8SLinus Torvalds 		return ret;
158889319d31SChristoph Hellwig 	ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
158989319d31SChristoph Hellwig 	if (!ret) {
15908c3cfa80SAmir Goldstein 		if (S_ISREG(file_inode(file)->i_mode))
15918c3cfa80SAmir Goldstein 			kiocb_start_write(req);
159292ce4728SChristoph Hellwig 		req->ki_flags |= IOCB_WRITE;
15939061d14aSAl Viro 		aio_rw_done(req, call_write_iter(file, req, &iter));
159492ce4728SChristoph Hellwig 	}
159589319d31SChristoph Hellwig 	kfree(iovec);
159689319d31SChristoph Hellwig 	return ret;
15971da177e4SLinus Torvalds }
15981da177e4SLinus Torvalds 
aio_fsync_work(struct work_struct * work)1599a3c0d439SChristoph Hellwig static void aio_fsync_work(struct work_struct *work)
1600a3c0d439SChristoph Hellwig {
16012bb874c0SAl Viro 	struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work);
1602530f32fcSMiklos Szeredi 	const struct cred *old_cred = override_creds(iocb->fsync.creds);
1603a3c0d439SChristoph Hellwig 
16042bb874c0SAl Viro 	iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync);
1605530f32fcSMiklos Szeredi 	revert_creds(old_cred);
1606530f32fcSMiklos Szeredi 	put_cred(iocb->fsync.creds);
16072bb874c0SAl Viro 	iocb_put(iocb);
1608a3c0d439SChristoph Hellwig }
1609a3c0d439SChristoph Hellwig 
aio_fsync(struct fsync_iocb * req,const struct iocb * iocb,bool datasync)161088a6f18bSJens Axboe static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
161188a6f18bSJens Axboe 		     bool datasync)
1612a3c0d439SChristoph Hellwig {
1613a3c0d439SChristoph Hellwig 	if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes ||
1614a3c0d439SChristoph Hellwig 			iocb->aio_rw_flags))
1615a3c0d439SChristoph Hellwig 		return -EINVAL;
1616a11e1d43SLinus Torvalds 
161784c4e1f8SLinus Torvalds 	if (unlikely(!req->file->f_op->fsync))
1618a3c0d439SChristoph Hellwig 		return -EINVAL;
1619a3c0d439SChristoph Hellwig 
1620530f32fcSMiklos Szeredi 	req->creds = prepare_creds();
1621530f32fcSMiklos Szeredi 	if (!req->creds)
1622530f32fcSMiklos Szeredi 		return -ENOMEM;
1623530f32fcSMiklos Szeredi 
1624a3c0d439SChristoph Hellwig 	req->datasync = datasync;
1625a3c0d439SChristoph Hellwig 	INIT_WORK(&req->work, aio_fsync_work);
1626a3c0d439SChristoph Hellwig 	schedule_work(&req->work);
16279061d14aSAl Viro 	return 0;
1628a3c0d439SChristoph Hellwig }
1629a3c0d439SChristoph Hellwig 
aio_poll_put_work(struct work_struct * work)163001d7a356SJens Axboe static void aio_poll_put_work(struct work_struct *work)
163101d7a356SJens Axboe {
163201d7a356SJens Axboe 	struct poll_iocb *req = container_of(work, struct poll_iocb, work);
163301d7a356SJens Axboe 	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
163401d7a356SJens Axboe 
163501d7a356SJens Axboe 	iocb_put(iocb);
163601d7a356SJens Axboe }
163701d7a356SJens Axboe 
163850252e4bSEric Biggers /*
163950252e4bSEric Biggers  * Safely lock the waitqueue which the request is on, synchronizing with the
164050252e4bSEric Biggers  * case where the ->poll() provider decides to free its waitqueue early.
164150252e4bSEric Biggers  *
164250252e4bSEric Biggers  * Returns true on success, meaning that req->head->lock was locked, req->wait
164350252e4bSEric Biggers  * is on req->head, and an RCU read lock was taken.  Returns false if the
164450252e4bSEric Biggers  * request was already removed from its waitqueue (which might no longer exist).
164550252e4bSEric Biggers  */
poll_iocb_lock_wq(struct poll_iocb * req)164650252e4bSEric Biggers static bool poll_iocb_lock_wq(struct poll_iocb *req)
164750252e4bSEric Biggers {
164850252e4bSEric Biggers 	wait_queue_head_t *head;
164950252e4bSEric Biggers 
165050252e4bSEric Biggers 	/*
165150252e4bSEric Biggers 	 * While we hold the waitqueue lock and the waitqueue is nonempty,
165250252e4bSEric Biggers 	 * wake_up_pollfree() will wait for us.  However, taking the waitqueue
165350252e4bSEric Biggers 	 * lock in the first place can race with the waitqueue being freed.
165450252e4bSEric Biggers 	 *
165550252e4bSEric Biggers 	 * We solve this as eventpoll does: by taking advantage of the fact that
165650252e4bSEric Biggers 	 * all users of wake_up_pollfree() will RCU-delay the actual free.  If
165750252e4bSEric Biggers 	 * we enter rcu_read_lock() and see that the pointer to the queue is
165850252e4bSEric Biggers 	 * non-NULL, we can then lock it without the memory being freed out from
165950252e4bSEric Biggers 	 * under us, then check whether the request is still on the queue.
166050252e4bSEric Biggers 	 *
166150252e4bSEric Biggers 	 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
166250252e4bSEric Biggers 	 * case the caller deletes the entry from the queue, leaving it empty.
166350252e4bSEric Biggers 	 * In that case, only RCU prevents the queue memory from being freed.
166450252e4bSEric Biggers 	 */
166550252e4bSEric Biggers 	rcu_read_lock();
166650252e4bSEric Biggers 	head = smp_load_acquire(&req->head);
166750252e4bSEric Biggers 	if (head) {
166850252e4bSEric Biggers 		spin_lock(&head->lock);
166950252e4bSEric Biggers 		if (!list_empty(&req->wait.entry))
167050252e4bSEric Biggers 			return true;
167150252e4bSEric Biggers 		spin_unlock(&head->lock);
167250252e4bSEric Biggers 	}
167350252e4bSEric Biggers 	rcu_read_unlock();
167450252e4bSEric Biggers 	return false;
167550252e4bSEric Biggers }
167650252e4bSEric Biggers 
poll_iocb_unlock_wq(struct poll_iocb * req)167750252e4bSEric Biggers static void poll_iocb_unlock_wq(struct poll_iocb *req)
167850252e4bSEric Biggers {
167950252e4bSEric Biggers 	spin_unlock(&req->head->lock);
168050252e4bSEric Biggers 	rcu_read_unlock();
168150252e4bSEric Biggers }
168250252e4bSEric Biggers 
aio_poll_complete_work(struct work_struct * work)1683bfe4037eSChristoph Hellwig static void aio_poll_complete_work(struct work_struct *work)
1684bfe4037eSChristoph Hellwig {
1685bfe4037eSChristoph Hellwig 	struct poll_iocb *req = container_of(work, struct poll_iocb, work);
1686bfe4037eSChristoph Hellwig 	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
1687bfe4037eSChristoph Hellwig 	struct poll_table_struct pt = { ._key = req->events };
1688bfe4037eSChristoph Hellwig 	struct kioctx *ctx = iocb->ki_ctx;
1689bfe4037eSChristoph Hellwig 	__poll_t mask = 0;
1690bfe4037eSChristoph Hellwig 
1691bfe4037eSChristoph Hellwig 	if (!READ_ONCE(req->cancelled))
1692bfe4037eSChristoph Hellwig 		mask = vfs_poll(req->file, &pt) & req->events;
1693bfe4037eSChristoph Hellwig 
1694bfe4037eSChristoph Hellwig 	/*
1695bfe4037eSChristoph Hellwig 	 * Note that ->ki_cancel callers also delete iocb from active_reqs after
1696bfe4037eSChristoph Hellwig 	 * calling ->ki_cancel.  We need the ctx_lock roundtrip here to
1697bfe4037eSChristoph Hellwig 	 * synchronize with them.  In the cancellation case the list_del_init
1698bfe4037eSChristoph Hellwig 	 * itself is not actually needed, but harmless so we keep it in to
1699bfe4037eSChristoph Hellwig 	 * avoid further branches in the fast path.
1700bfe4037eSChristoph Hellwig 	 */
1701bfe4037eSChristoph Hellwig 	spin_lock_irq(&ctx->ctx_lock);
170250252e4bSEric Biggers 	if (poll_iocb_lock_wq(req)) {
1703bfe4037eSChristoph Hellwig 		if (!mask && !READ_ONCE(req->cancelled)) {
1704363bee27SEric Biggers 			/*
1705363bee27SEric Biggers 			 * The request isn't actually ready to be completed yet.
1706363bee27SEric Biggers 			 * Reschedule completion if another wakeup came in.
1707363bee27SEric Biggers 			 */
1708363bee27SEric Biggers 			if (req->work_need_resched) {
1709363bee27SEric Biggers 				schedule_work(&req->work);
1710363bee27SEric Biggers 				req->work_need_resched = false;
1711363bee27SEric Biggers 			} else {
1712363bee27SEric Biggers 				req->work_scheduled = false;
1713363bee27SEric Biggers 			}
171450252e4bSEric Biggers 			poll_iocb_unlock_wq(req);
1715bfe4037eSChristoph Hellwig 			spin_unlock_irq(&ctx->ctx_lock);
1716bfe4037eSChristoph Hellwig 			return;
1717bfe4037eSChristoph Hellwig 		}
1718363bee27SEric Biggers 		list_del_init(&req->wait.entry);
171950252e4bSEric Biggers 		poll_iocb_unlock_wq(req);
172050252e4bSEric Biggers 	} /* else, POLLFREE has freed the waitqueue, so we must complete */
1721bfe4037eSChristoph Hellwig 	list_del_init(&iocb->ki_list);
1722af5c72b1SAl Viro 	iocb->ki_res.res = mangle_poll(mask);
1723bfe4037eSChristoph Hellwig 	spin_unlock_irq(&ctx->ctx_lock);
1724bfe4037eSChristoph Hellwig 
1725af5c72b1SAl Viro 	iocb_put(iocb);
1726bfe4037eSChristoph Hellwig }
1727bfe4037eSChristoph Hellwig 
1728bfe4037eSChristoph Hellwig /* assumes we are called with irqs disabled */
aio_poll_cancel(struct kiocb * iocb)1729bfe4037eSChristoph Hellwig static int aio_poll_cancel(struct kiocb *iocb)
1730bfe4037eSChristoph Hellwig {
1731bfe4037eSChristoph Hellwig 	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
1732bfe4037eSChristoph Hellwig 	struct poll_iocb *req = &aiocb->poll;
1733bfe4037eSChristoph Hellwig 
173450252e4bSEric Biggers 	if (poll_iocb_lock_wq(req)) {
1735bfe4037eSChristoph Hellwig 		WRITE_ONCE(req->cancelled, true);
1736363bee27SEric Biggers 		if (!req->work_scheduled) {
1737bfe4037eSChristoph Hellwig 			schedule_work(&aiocb->poll.work);
1738363bee27SEric Biggers 			req->work_scheduled = true;
1739bfe4037eSChristoph Hellwig 		}
174050252e4bSEric Biggers 		poll_iocb_unlock_wq(req);
174150252e4bSEric Biggers 	} /* else, the request was force-cancelled by POLLFREE already */
1742bfe4037eSChristoph Hellwig 
1743bfe4037eSChristoph Hellwig 	return 0;
1744bfe4037eSChristoph Hellwig }
1745bfe4037eSChristoph Hellwig 
aio_poll_wake(struct wait_queue_entry * wait,unsigned mode,int sync,void * key)1746bfe4037eSChristoph Hellwig static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
1747bfe4037eSChristoph Hellwig 		void *key)
1748bfe4037eSChristoph Hellwig {
1749bfe4037eSChristoph Hellwig 	struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
1750e8693bcfSChristoph Hellwig 	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
1751bfe4037eSChristoph Hellwig 	__poll_t mask = key_to_poll(key);
1752d3d6a18dSBart Van Assche 	unsigned long flags;
1753bfe4037eSChristoph Hellwig 
1754bfe4037eSChristoph Hellwig 	/* for instances that support it check for an event match first: */
1755af5c72b1SAl Viro 	if (mask && !(mask & req->events))
1756bfe4037eSChristoph Hellwig 		return 0;
1757bfe4037eSChristoph Hellwig 
1758363bee27SEric Biggers 	/*
1759363bee27SEric Biggers 	 * Complete the request inline if possible.  This requires that three
1760363bee27SEric Biggers 	 * conditions be met:
1761363bee27SEric Biggers 	 *   1. An event mask must have been passed.  If a plain wakeup was done
1762363bee27SEric Biggers 	 *	instead, then mask == 0 and we have to call vfs_poll() to get
1763363bee27SEric Biggers 	 *	the events, so inline completion isn't possible.
1764363bee27SEric Biggers 	 *   2. The completion work must not have already been scheduled.
1765363bee27SEric Biggers 	 *   3. ctx_lock must not be busy.  We have to use trylock because we
1766363bee27SEric Biggers 	 *	already hold the waitqueue lock, so this inverts the normal
1767363bee27SEric Biggers 	 *	locking order.  Use irqsave/irqrestore because not all
1768363bee27SEric Biggers 	 *	filesystems (e.g. fuse) call this function with IRQs disabled,
1769363bee27SEric Biggers 	 *	yet IRQs have to be disabled before ctx_lock is obtained.
1770363bee27SEric Biggers 	 */
1771363bee27SEric Biggers 	if (mask && !req->work_scheduled &&
1772363bee27SEric Biggers 	    spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
177301d7a356SJens Axboe 		struct kioctx *ctx = iocb->ki_ctx;
177401d7a356SJens Axboe 
1775363bee27SEric Biggers 		list_del_init(&req->wait.entry);
1776e8693bcfSChristoph Hellwig 		list_del(&iocb->ki_list);
1777af5c72b1SAl Viro 		iocb->ki_res.res = mangle_poll(mask);
17784b374986SXie Yongji 		if (iocb->ki_eventfd && !eventfd_signal_allowed()) {
177901d7a356SJens Axboe 			iocb = NULL;
178001d7a356SJens Axboe 			INIT_WORK(&req->work, aio_poll_put_work);
178101d7a356SJens Axboe 			schedule_work(&req->work);
178201d7a356SJens Axboe 		}
178301d7a356SJens Axboe 		spin_unlock_irqrestore(&ctx->ctx_lock, flags);
178401d7a356SJens Axboe 		if (iocb)
1785af5c72b1SAl Viro 			iocb_put(iocb);
1786af5c72b1SAl Viro 	} else {
1787363bee27SEric Biggers 		/*
1788363bee27SEric Biggers 		 * Schedule the completion work if needed.  If it was already
1789363bee27SEric Biggers 		 * scheduled, record that another wakeup came in.
1790363bee27SEric Biggers 		 *
1791363bee27SEric Biggers 		 * Don't remove the request from the waitqueue here, as it might
1792363bee27SEric Biggers 		 * not actually be complete yet (we won't know until vfs_poll()
179350252e4bSEric Biggers 		 * is called), and we must not miss any wakeups.  POLLFREE is an
179450252e4bSEric Biggers 		 * exception to this; see below.
1795363bee27SEric Biggers 		 */
1796363bee27SEric Biggers 		if (req->work_scheduled) {
1797363bee27SEric Biggers 			req->work_need_resched = true;
1798363bee27SEric Biggers 		} else {
1799bfe4037eSChristoph Hellwig 			schedule_work(&req->work);
1800363bee27SEric Biggers 			req->work_scheduled = true;
1801363bee27SEric Biggers 		}
180250252e4bSEric Biggers 
180350252e4bSEric Biggers 		/*
180450252e4bSEric Biggers 		 * If the waitqueue is being freed early but we can't complete
180550252e4bSEric Biggers 		 * the request inline, we have to tear down the request as best
180650252e4bSEric Biggers 		 * we can.  That means immediately removing the request from its
180750252e4bSEric Biggers 		 * waitqueue and preventing all further accesses to the
180850252e4bSEric Biggers 		 * waitqueue via the request.  We also need to schedule the
180950252e4bSEric Biggers 		 * completion work (done above).  Also mark the request as
181050252e4bSEric Biggers 		 * cancelled, to potentially skip an unneeded call to ->poll().
181150252e4bSEric Biggers 		 */
181250252e4bSEric Biggers 		if (mask & POLLFREE) {
181350252e4bSEric Biggers 			WRITE_ONCE(req->cancelled, true);
181450252e4bSEric Biggers 			list_del_init(&req->wait.entry);
181550252e4bSEric Biggers 
181650252e4bSEric Biggers 			/*
181750252e4bSEric Biggers 			 * Careful: this *must* be the last step, since as soon
181850252e4bSEric Biggers 			 * as req->head is NULL'ed out, the request can be
181950252e4bSEric Biggers 			 * completed and freed, since aio_poll_complete_work()
182050252e4bSEric Biggers 			 * will no longer need to take the waitqueue lock.
182150252e4bSEric Biggers 			 */
182250252e4bSEric Biggers 			smp_store_release(&req->head, NULL);
182350252e4bSEric Biggers 		}
1824af5c72b1SAl Viro 	}
1825bfe4037eSChristoph Hellwig 	return 1;
1826bfe4037eSChristoph Hellwig }
1827bfe4037eSChristoph Hellwig 
1828bfe4037eSChristoph Hellwig struct aio_poll_table {
1829bfe4037eSChristoph Hellwig 	struct poll_table_struct	pt;
1830bfe4037eSChristoph Hellwig 	struct aio_kiocb		*iocb;
183150252e4bSEric Biggers 	bool				queued;
1832bfe4037eSChristoph Hellwig 	int				error;
1833bfe4037eSChristoph Hellwig };
1834bfe4037eSChristoph Hellwig 
1835bfe4037eSChristoph Hellwig static void
aio_poll_queue_proc(struct file * file,struct wait_queue_head * head,struct poll_table_struct * p)1836bfe4037eSChristoph Hellwig aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
1837bfe4037eSChristoph Hellwig 		struct poll_table_struct *p)
1838bfe4037eSChristoph Hellwig {
1839bfe4037eSChristoph Hellwig 	struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);
1840bfe4037eSChristoph Hellwig 
1841bfe4037eSChristoph Hellwig 	/* multiple wait queues per file are not supported */
184250252e4bSEric Biggers 	if (unlikely(pt->queued)) {
1843bfe4037eSChristoph Hellwig 		pt->error = -EINVAL;
1844bfe4037eSChristoph Hellwig 		return;
1845bfe4037eSChristoph Hellwig 	}
1846bfe4037eSChristoph Hellwig 
184750252e4bSEric Biggers 	pt->queued = true;
1848bfe4037eSChristoph Hellwig 	pt->error = 0;
1849bfe4037eSChristoph Hellwig 	pt->iocb->poll.head = head;
1850bfe4037eSChristoph Hellwig 	add_wait_queue(head, &pt->iocb->poll.wait);
1851bfe4037eSChristoph Hellwig }
1852bfe4037eSChristoph Hellwig 
aio_poll(struct aio_kiocb * aiocb,const struct iocb * iocb)1853958c13ceSAl Viro static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
1854bfe4037eSChristoph Hellwig {
1855bfe4037eSChristoph Hellwig 	struct kioctx *ctx = aiocb->ki_ctx;
1856bfe4037eSChristoph Hellwig 	struct poll_iocb *req = &aiocb->poll;
1857bfe4037eSChristoph Hellwig 	struct aio_poll_table apt;
1858af5c72b1SAl Viro 	bool cancel = false;
1859bfe4037eSChristoph Hellwig 	__poll_t mask;
1860bfe4037eSChristoph Hellwig 
1861bfe4037eSChristoph Hellwig 	/* reject any unknown events outside the normal event mask. */
1862bfe4037eSChristoph Hellwig 	if ((u16)iocb->aio_buf != iocb->aio_buf)
1863bfe4037eSChristoph Hellwig 		return -EINVAL;
1864bfe4037eSChristoph Hellwig 	/* reject fields that are not defined for poll */
1865bfe4037eSChristoph Hellwig 	if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
1866bfe4037eSChristoph Hellwig 		return -EINVAL;
1867bfe4037eSChristoph Hellwig 
1868bfe4037eSChristoph Hellwig 	INIT_WORK(&req->work, aio_poll_complete_work);
1869bfe4037eSChristoph Hellwig 	req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
1870bfe4037eSChristoph Hellwig 
18712bc4ca9bSJens Axboe 	req->head = NULL;
18722bc4ca9bSJens Axboe 	req->cancelled = false;
1873363bee27SEric Biggers 	req->work_scheduled = false;
1874363bee27SEric Biggers 	req->work_need_resched = false;
18752bc4ca9bSJens Axboe 
1876bfe4037eSChristoph Hellwig 	apt.pt._qproc = aio_poll_queue_proc;
1877bfe4037eSChristoph Hellwig 	apt.pt._key = req->events;
1878bfe4037eSChristoph Hellwig 	apt.iocb = aiocb;
187950252e4bSEric Biggers 	apt.queued = false;
1880bfe4037eSChristoph Hellwig 	apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
1881bfe4037eSChristoph Hellwig 
1882bfe4037eSChristoph Hellwig 	/* initialized the list so that we can do list_empty checks */
1883bfe4037eSChristoph Hellwig 	INIT_LIST_HEAD(&req->wait.entry);
1884bfe4037eSChristoph Hellwig 	init_waitqueue_func_entry(&req->wait, aio_poll_wake);
1885bfe4037eSChristoph Hellwig 
1886bfe4037eSChristoph Hellwig 	mask = vfs_poll(req->file, &apt.pt) & req->events;
1887bfe4037eSChristoph Hellwig 	spin_lock_irq(&ctx->ctx_lock);
188850252e4bSEric Biggers 	if (likely(apt.queued)) {
188950252e4bSEric Biggers 		bool on_queue = poll_iocb_lock_wq(req);
189050252e4bSEric Biggers 
189150252e4bSEric Biggers 		if (!on_queue || req->work_scheduled) {
1892363bee27SEric Biggers 			/*
1893363bee27SEric Biggers 			 * aio_poll_wake() already either scheduled the async
1894363bee27SEric Biggers 			 * completion work, or completed the request inline.
1895363bee27SEric Biggers 			 */
1896363bee27SEric Biggers 			if (apt.error) /* unsupported case: multiple queues */
1897af5c72b1SAl Viro 				cancel = true;
1898bfe4037eSChristoph Hellwig 			apt.error = 0;
1899af5c72b1SAl Viro 			mask = 0;
1900af5c72b1SAl Viro 		}
1901af5c72b1SAl Viro 		if (mask || apt.error) {
1902363bee27SEric Biggers 			/* Steal to complete synchronously. */
1903bfe4037eSChristoph Hellwig 			list_del_init(&req->wait.entry);
1904af5c72b1SAl Viro 		} else if (cancel) {
1905363bee27SEric Biggers 			/* Cancel if possible (may be too late though). */
1906af5c72b1SAl Viro 			WRITE_ONCE(req->cancelled, true);
190750252e4bSEric Biggers 		} else if (on_queue) {
1908363bee27SEric Biggers 			/*
1909363bee27SEric Biggers 			 * Actually waiting for an event, so add the request to
1910363bee27SEric Biggers 			 * active_reqs so that it can be cancelled if needed.
1911363bee27SEric Biggers 			 */
1912bfe4037eSChristoph Hellwig 			list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
1913bfe4037eSChristoph Hellwig 			aiocb->ki_cancel = aio_poll_cancel;
1914bfe4037eSChristoph Hellwig 		}
191550252e4bSEric Biggers 		if (on_queue)
191650252e4bSEric Biggers 			poll_iocb_unlock_wq(req);
1917af5c72b1SAl Viro 	}
1918af5c72b1SAl Viro 	if (mask) { /* no async, we'd stolen it */
1919af5c72b1SAl Viro 		aiocb->ki_res.res = mangle_poll(mask);
1920af5c72b1SAl Viro 		apt.error = 0;
1921af5c72b1SAl Viro 	}
1922bfe4037eSChristoph Hellwig 	spin_unlock_irq(&ctx->ctx_lock);
1923bfe4037eSChristoph Hellwig 	if (mask)
1924af5c72b1SAl Viro 		iocb_put(aiocb);
1925af5c72b1SAl Viro 	return apt.error;
1926bfe4037eSChristoph Hellwig }
1927bfe4037eSChristoph Hellwig 
__io_submit_one(struct kioctx * ctx,const struct iocb * iocb,struct iocb __user * user_iocb,struct aio_kiocb * req,bool compat)192888a6f18bSJens Axboe static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
19297316b49cSAl Viro 			   struct iocb __user *user_iocb, struct aio_kiocb *req,
19307316b49cSAl Viro 			   bool compat)
19311da177e4SLinus Torvalds {
193284c4e1f8SLinus Torvalds 	req->ki_filp = fget(iocb->aio_fildes);
193384c4e1f8SLinus Torvalds 	if (unlikely(!req->ki_filp))
19347316b49cSAl Viro 		return -EBADF;
193584c4e1f8SLinus Torvalds 
193688a6f18bSJens Axboe 	if (iocb->aio_flags & IOCB_FLAG_RESFD) {
193774259703SAl Viro 		struct eventfd_ctx *eventfd;
19389c3060beSDavide Libenzi 		/*
19399c3060beSDavide Libenzi 		 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
19409c3060beSDavide Libenzi 		 * instance of the file* now. The file descriptor must be
19419c3060beSDavide Libenzi 		 * an eventfd() fd, and will be signaled for each completed
19429c3060beSDavide Libenzi 		 * event using the eventfd_signal() function.
19439c3060beSDavide Libenzi 		 */
194474259703SAl Viro 		eventfd = eventfd_ctx_fdget(iocb->aio_resfd);
19457316b49cSAl Viro 		if (IS_ERR(eventfd))
194618bfb9c6SDan Carpenter 			return PTR_ERR(eventfd);
19477316b49cSAl Viro 
194874259703SAl Viro 		req->ki_eventfd = eventfd;
19499830f4beSGoldwyn Rodrigues 	}
19509830f4beSGoldwyn Rodrigues 
19517316b49cSAl Viro 	if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) {
1952caf4167aSKent Overstreet 		pr_debug("EFAULT: aio_key\n");
19537316b49cSAl Viro 		return -EFAULT;
19541da177e4SLinus Torvalds 	}
19551da177e4SLinus Torvalds 
1956a9339b78SAl Viro 	req->ki_res.obj = (u64)(unsigned long)user_iocb;
1957a9339b78SAl Viro 	req->ki_res.data = iocb->aio_data;
1958a9339b78SAl Viro 	req->ki_res.res = 0;
1959a9339b78SAl Viro 	req->ki_res.res2 = 0;
19601da177e4SLinus Torvalds 
196188a6f18bSJens Axboe 	switch (iocb->aio_lio_opcode) {
196289319d31SChristoph Hellwig 	case IOCB_CMD_PREAD:
19637316b49cSAl Viro 		return aio_read(&req->rw, iocb, false, compat);
196489319d31SChristoph Hellwig 	case IOCB_CMD_PWRITE:
19657316b49cSAl Viro 		return aio_write(&req->rw, iocb, false, compat);
196689319d31SChristoph Hellwig 	case IOCB_CMD_PREADV:
19677316b49cSAl Viro 		return aio_read(&req->rw, iocb, true, compat);
196889319d31SChristoph Hellwig 	case IOCB_CMD_PWRITEV:
19697316b49cSAl Viro 		return aio_write(&req->rw, iocb, true, compat);
1970a3c0d439SChristoph Hellwig 	case IOCB_CMD_FSYNC:
19717316b49cSAl Viro 		return aio_fsync(&req->fsync, iocb, false);
1972a3c0d439SChristoph Hellwig 	case IOCB_CMD_FDSYNC:
19737316b49cSAl Viro 		return aio_fsync(&req->fsync, iocb, true);
1974bfe4037eSChristoph Hellwig 	case IOCB_CMD_POLL:
19757316b49cSAl Viro 		return aio_poll(req, iocb);
197689319d31SChristoph Hellwig 	default:
197788a6f18bSJens Axboe 		pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
19787316b49cSAl Viro 		return -EINVAL;
197989319d31SChristoph Hellwig 	}
19801da177e4SLinus Torvalds }
19811da177e4SLinus Torvalds 
io_submit_one(struct kioctx * ctx,struct iocb __user * user_iocb,bool compat)198288a6f18bSJens Axboe static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
198388a6f18bSJens Axboe 			 bool compat)
198488a6f18bSJens Axboe {
19857316b49cSAl Viro 	struct aio_kiocb *req;
198688a6f18bSJens Axboe 	struct iocb iocb;
19877316b49cSAl Viro 	int err;
198888a6f18bSJens Axboe 
198988a6f18bSJens Axboe 	if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
199088a6f18bSJens Axboe 		return -EFAULT;
199188a6f18bSJens Axboe 
19927316b49cSAl Viro 	/* enforce forwards compatibility on users */
19937316b49cSAl Viro 	if (unlikely(iocb.aio_reserved2)) {
19947316b49cSAl Viro 		pr_debug("EINVAL: reserve field set\n");
19957316b49cSAl Viro 		return -EINVAL;
19967316b49cSAl Viro 	}
19977316b49cSAl Viro 
19987316b49cSAl Viro 	/* prevent overflows */
19997316b49cSAl Viro 	if (unlikely(
20007316b49cSAl Viro 	    (iocb.aio_buf != (unsigned long)iocb.aio_buf) ||
20017316b49cSAl Viro 	    (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) ||
20027316b49cSAl Viro 	    ((ssize_t)iocb.aio_nbytes < 0)
20037316b49cSAl Viro 	   )) {
20047316b49cSAl Viro 		pr_debug("EINVAL: overflow check\n");
20057316b49cSAl Viro 		return -EINVAL;
20067316b49cSAl Viro 	}
20077316b49cSAl Viro 
20087316b49cSAl Viro 	req = aio_get_req(ctx);
20097316b49cSAl Viro 	if (unlikely(!req))
20107316b49cSAl Viro 		return -EAGAIN;
20117316b49cSAl Viro 
20127316b49cSAl Viro 	err = __io_submit_one(ctx, &iocb, user_iocb, req, compat);
20137316b49cSAl Viro 
20147316b49cSAl Viro 	/* Done with the synchronous reference */
20157316b49cSAl Viro 	iocb_put(req);
20167316b49cSAl Viro 
20177316b49cSAl Viro 	/*
20187316b49cSAl Viro 	 * If err is 0, we'd either done aio_complete() ourselves or have
20197316b49cSAl Viro 	 * arranged for that to be done asynchronously.  Anything non-zero
20207316b49cSAl Viro 	 * means that we need to destroy req ourselves.
20217316b49cSAl Viro 	 */
20227316b49cSAl Viro 	if (unlikely(err)) {
20237316b49cSAl Viro 		iocb_destroy(req);
20247316b49cSAl Viro 		put_reqs_available(ctx, 1);
20257316b49cSAl Viro 	}
20267316b49cSAl Viro 	return err;
202788a6f18bSJens Axboe }
202888a6f18bSJens Axboe 
20299d85cba7SJeff Moyer /* sys_io_submit:
20309d85cba7SJeff Moyer  *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns
20319d85cba7SJeff Moyer  *	the number of iocbs queued.  May return -EINVAL if the aio_context
20329d85cba7SJeff Moyer  *	specified by ctx_id is invalid, if nr is < 0, if the iocb at
20339d85cba7SJeff Moyer  *	*iocbpp[0] is not properly initialized, if the operation specified
20349d85cba7SJeff Moyer  *	is invalid for the file descriptor in the iocb.  May fail with
20359d85cba7SJeff Moyer  *	-EFAULT if any of the data structures point to invalid data.  May
20369d85cba7SJeff Moyer  *	fail with -EBADF if the file descriptor specified in the first
20379d85cba7SJeff Moyer  *	iocb is invalid.  May fail with -EAGAIN if insufficient resources
20389d85cba7SJeff Moyer  *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
20399d85cba7SJeff Moyer  *	fail with -ENOSYS if not implemented.
20409d85cba7SJeff Moyer  */
SYSCALL_DEFINE3(io_submit,aio_context_t,ctx_id,long,nr,struct iocb __user * __user *,iocbpp)20419d85cba7SJeff Moyer SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
20429d85cba7SJeff Moyer 		struct iocb __user * __user *, iocbpp)
20439d85cba7SJeff Moyer {
204467ba049fSAl Viro 	struct kioctx *ctx;
204567ba049fSAl Viro 	long ret = 0;
204667ba049fSAl Viro 	int i = 0;
204767ba049fSAl Viro 	struct blk_plug plug;
204867ba049fSAl Viro 
204967ba049fSAl Viro 	if (unlikely(nr < 0))
205067ba049fSAl Viro 		return -EINVAL;
205167ba049fSAl Viro 
205267ba049fSAl Viro 	ctx = lookup_ioctx(ctx_id);
205367ba049fSAl Viro 	if (unlikely(!ctx)) {
205467ba049fSAl Viro 		pr_debug("EINVAL: invalid context id\n");
205567ba049fSAl Viro 		return -EINVAL;
205667ba049fSAl Viro 	}
205767ba049fSAl Viro 
20581da92779SAl Viro 	if (nr > ctx->nr_events)
20591da92779SAl Viro 		nr = ctx->nr_events;
20601da92779SAl Viro 
2061a79d40e9SJens Axboe 	if (nr > AIO_PLUG_THRESHOLD)
206267ba049fSAl Viro 		blk_start_plug(&plug);
206367ba049fSAl Viro 	for (i = 0; i < nr; i++) {
206467ba049fSAl Viro 		struct iocb __user *user_iocb;
206567ba049fSAl Viro 
206667ba049fSAl Viro 		if (unlikely(get_user(user_iocb, iocbpp + i))) {
206767ba049fSAl Viro 			ret = -EFAULT;
206867ba049fSAl Viro 			break;
206967ba049fSAl Viro 		}
207067ba049fSAl Viro 
207167ba049fSAl Viro 		ret = io_submit_one(ctx, user_iocb, false);
207267ba049fSAl Viro 		if (ret)
207367ba049fSAl Viro 			break;
207467ba049fSAl Viro 	}
2075a79d40e9SJens Axboe 	if (nr > AIO_PLUG_THRESHOLD)
207667ba049fSAl Viro 		blk_finish_plug(&plug);
207767ba049fSAl Viro 
207867ba049fSAl Viro 	percpu_ref_put(&ctx->users);
207967ba049fSAl Viro 	return i ? i : ret;
20809d85cba7SJeff Moyer }
20819d85cba7SJeff Moyer 
2082c00d2c7eSAl Viro #ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE3(io_submit,compat_aio_context_t,ctx_id,int,nr,compat_uptr_t __user *,iocbpp)2083c00d2c7eSAl Viro COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
208467ba049fSAl Viro 		       int, nr, compat_uptr_t __user *, iocbpp)
2085c00d2c7eSAl Viro {
208667ba049fSAl Viro 	struct kioctx *ctx;
208767ba049fSAl Viro 	long ret = 0;
208867ba049fSAl Viro 	int i = 0;
208967ba049fSAl Viro 	struct blk_plug plug;
2090c00d2c7eSAl Viro 
2091c00d2c7eSAl Viro 	if (unlikely(nr < 0))
2092c00d2c7eSAl Viro 		return -EINVAL;
2093c00d2c7eSAl Viro 
209467ba049fSAl Viro 	ctx = lookup_ioctx(ctx_id);
209567ba049fSAl Viro 	if (unlikely(!ctx)) {
209667ba049fSAl Viro 		pr_debug("EINVAL: invalid context id\n");
209767ba049fSAl Viro 		return -EINVAL;
209867ba049fSAl Viro 	}
209967ba049fSAl Viro 
21001da92779SAl Viro 	if (nr > ctx->nr_events)
21011da92779SAl Viro 		nr = ctx->nr_events;
21021da92779SAl Viro 
2103a79d40e9SJens Axboe 	if (nr > AIO_PLUG_THRESHOLD)
210467ba049fSAl Viro 		blk_start_plug(&plug);
210567ba049fSAl Viro 	for (i = 0; i < nr; i++) {
210667ba049fSAl Viro 		compat_uptr_t user_iocb;
210767ba049fSAl Viro 
210867ba049fSAl Viro 		if (unlikely(get_user(user_iocb, iocbpp + i))) {
210967ba049fSAl Viro 			ret = -EFAULT;
211067ba049fSAl Viro 			break;
211167ba049fSAl Viro 		}
211267ba049fSAl Viro 
211367ba049fSAl Viro 		ret = io_submit_one(ctx, compat_ptr(user_iocb), true);
211467ba049fSAl Viro 		if (ret)
211567ba049fSAl Viro 			break;
211667ba049fSAl Viro 	}
2117a79d40e9SJens Axboe 	if (nr > AIO_PLUG_THRESHOLD)
211867ba049fSAl Viro 		blk_finish_plug(&plug);
211967ba049fSAl Viro 
212067ba049fSAl Viro 	percpu_ref_put(&ctx->users);
212167ba049fSAl Viro 	return i ? i : ret;
2122c00d2c7eSAl Viro }
2123c00d2c7eSAl Viro #endif
2124c00d2c7eSAl Viro 
21251da177e4SLinus Torvalds /* sys_io_cancel:
21261da177e4SLinus Torvalds  *	Attempts to cancel an iocb previously passed to io_submit.  If
21271da177e4SLinus Torvalds  *	the operation is successfully cancelled, the resulting event is
21281da177e4SLinus Torvalds  *	copied into the memory pointed to by result without being placed
21291da177e4SLinus Torvalds  *	into the completion queue and 0 is returned.  May fail with
21301da177e4SLinus Torvalds  *	-EFAULT if any of the data structures pointed to are invalid.
21311da177e4SLinus Torvalds  *	May fail with -EINVAL if aio_context specified by ctx_id is
21321da177e4SLinus Torvalds  *	invalid.  May fail with -EAGAIN if the iocb specified was not
21331da177e4SLinus Torvalds  *	cancelled.  Will fail with -ENOSYS if not implemented.
21341da177e4SLinus Torvalds  */
SYSCALL_DEFINE3(io_cancel,aio_context_t,ctx_id,struct iocb __user *,iocb,struct io_event __user *,result)2135002c8976SHeiko Carstens SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
2136002c8976SHeiko Carstens 		struct io_event __user *, result)
21371da177e4SLinus Torvalds {
21381da177e4SLinus Torvalds 	struct kioctx *ctx;
213904b2fa9fSChristoph Hellwig 	struct aio_kiocb *kiocb;
2140888933f8SChristoph Hellwig 	int ret = -EINVAL;
21411da177e4SLinus Torvalds 	u32 key;
2142a9339b78SAl Viro 	u64 obj = (u64)(unsigned long)iocb;
21431da177e4SLinus Torvalds 
2144f3a2752aSChristoph Hellwig 	if (unlikely(get_user(key, &iocb->aio_key)))
21451da177e4SLinus Torvalds 		return -EFAULT;
2146f3a2752aSChristoph Hellwig 	if (unlikely(key != KIOCB_KEY))
2147f3a2752aSChristoph Hellwig 		return -EINVAL;
21481da177e4SLinus Torvalds 
21491da177e4SLinus Torvalds 	ctx = lookup_ioctx(ctx_id);
21501da177e4SLinus Torvalds 	if (unlikely(!ctx))
21511da177e4SLinus Torvalds 		return -EINVAL;
21521da177e4SLinus Torvalds 
21531da177e4SLinus Torvalds 	spin_lock_irq(&ctx->ctx_lock);
2154833f4154SAl Viro 	/* TODO: use a hash or array, this sucks. */
2155833f4154SAl Viro 	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
2156a9339b78SAl Viro 		if (kiocb->ki_res.obj == obj) {
2157888933f8SChristoph Hellwig 			ret = kiocb->ki_cancel(&kiocb->rw);
2158888933f8SChristoph Hellwig 			list_del_init(&kiocb->ki_list);
2159833f4154SAl Viro 			break;
2160833f4154SAl Viro 		}
2161888933f8SChristoph Hellwig 	}
21621da177e4SLinus Torvalds 	spin_unlock_irq(&ctx->ctx_lock);
21631da177e4SLinus Torvalds 
21641da177e4SLinus Torvalds 	if (!ret) {
2165bec68faaSKent Overstreet 		/*
2166bec68faaSKent Overstreet 		 * The result argument is no longer used - the io_event is
2167bec68faaSKent Overstreet 		 * always delivered via the ring buffer. -EINPROGRESS indicates
2168bec68faaSKent Overstreet 		 * cancellation is progress:
21691da177e4SLinus Torvalds 		 */
2170bec68faaSKent Overstreet 		ret = -EINPROGRESS;
21711da177e4SLinus Torvalds 	}
21721da177e4SLinus Torvalds 
2173723be6e3SKent Overstreet 	percpu_ref_put(&ctx->users);
21741da177e4SLinus Torvalds 
21751da177e4SLinus Torvalds 	return ret;
21761da177e4SLinus Torvalds }
21771da177e4SLinus Torvalds 
do_io_getevents(aio_context_t ctx_id,long min_nr,long nr,struct io_event __user * events,struct timespec64 * ts)2178fa2e62a5SDeepa Dinamani static long do_io_getevents(aio_context_t ctx_id,
2179fa2e62a5SDeepa Dinamani 		long min_nr,
2180fa2e62a5SDeepa Dinamani 		long nr,
2181fa2e62a5SDeepa Dinamani 		struct io_event __user *events,
2182fa2e62a5SDeepa Dinamani 		struct timespec64 *ts)
2183fa2e62a5SDeepa Dinamani {
2184fa2e62a5SDeepa Dinamani 	ktime_t until = ts ? timespec64_to_ktime(*ts) : KTIME_MAX;
2185fa2e62a5SDeepa Dinamani 	struct kioctx *ioctx = lookup_ioctx(ctx_id);
2186fa2e62a5SDeepa Dinamani 	long ret = -EINVAL;
2187fa2e62a5SDeepa Dinamani 
2188fa2e62a5SDeepa Dinamani 	if (likely(ioctx)) {
2189fa2e62a5SDeepa Dinamani 		if (likely(min_nr <= nr && min_nr >= 0))
2190fa2e62a5SDeepa Dinamani 			ret = read_events(ioctx, min_nr, nr, events, until);
2191fa2e62a5SDeepa Dinamani 		percpu_ref_put(&ioctx->users);
2192fa2e62a5SDeepa Dinamani 	}
2193fa2e62a5SDeepa Dinamani 
2194fa2e62a5SDeepa Dinamani 	return ret;
2195fa2e62a5SDeepa Dinamani }
2196fa2e62a5SDeepa Dinamani 
21971da177e4SLinus Torvalds /* io_getevents:
21981da177e4SLinus Torvalds  *	Attempts to read at least min_nr events and up to nr events from
2199642b5123SSatoru Takeuchi  *	the completion queue for the aio_context specified by ctx_id. If
2200642b5123SSatoru Takeuchi  *	it succeeds, the number of read events is returned. May fail with
2201642b5123SSatoru Takeuchi  *	-EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is
2202642b5123SSatoru Takeuchi  *	out of range, if timeout is out of range.  May fail with -EFAULT
2203642b5123SSatoru Takeuchi  *	if any of the memory specified is invalid.  May return 0 or
2204642b5123SSatoru Takeuchi  *	< min_nr if the timeout specified by timeout has elapsed
2205642b5123SSatoru Takeuchi  *	before sufficient events are available, where timeout == NULL
2206642b5123SSatoru Takeuchi  *	specifies an infinite timeout. Note that the timeout pointed to by
22076900807cSJeff Moyer  *	timeout is relative.  Will fail with -ENOSYS if not implemented.
22081da177e4SLinus Torvalds  */
22093ca47e95SArnd Bergmann #ifdef CONFIG_64BIT
22107a35397fSDeepa Dinamani 
SYSCALL_DEFINE5(io_getevents,aio_context_t,ctx_id,long,min_nr,long,nr,struct io_event __user *,events,struct __kernel_timespec __user *,timeout)2211002c8976SHeiko Carstens SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
2212002c8976SHeiko Carstens 		long, min_nr,
2213002c8976SHeiko Carstens 		long, nr,
2214002c8976SHeiko Carstens 		struct io_event __user *, events,
22157a35397fSDeepa Dinamani 		struct __kernel_timespec __user *, timeout)
22161da177e4SLinus Torvalds {
2217fa2e62a5SDeepa Dinamani 	struct timespec64	ts;
22187a074e96SChristoph Hellwig 	int			ret;
22191da177e4SLinus Torvalds 
22207a074e96SChristoph Hellwig 	if (timeout && unlikely(get_timespec64(&ts, timeout)))
2221fa2e62a5SDeepa Dinamani 		return -EFAULT;
22227a074e96SChristoph Hellwig 
22237a074e96SChristoph Hellwig 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
22247a074e96SChristoph Hellwig 	if (!ret && signal_pending(current))
22257a074e96SChristoph Hellwig 		ret = -EINTR;
22267a074e96SChristoph Hellwig 	return ret;
22271da177e4SLinus Torvalds }
2228fa2e62a5SDeepa Dinamani 
22297a35397fSDeepa Dinamani #endif
22307a35397fSDeepa Dinamani 
22319ba546c0SChristoph Hellwig struct __aio_sigset {
22329ba546c0SChristoph Hellwig 	const sigset_t __user	*sigmask;
22339ba546c0SChristoph Hellwig 	size_t		sigsetsize;
22349ba546c0SChristoph Hellwig };
22359ba546c0SChristoph Hellwig 
SYSCALL_DEFINE6(io_pgetevents,aio_context_t,ctx_id,long,min_nr,long,nr,struct io_event __user *,events,struct __kernel_timespec __user *,timeout,const struct __aio_sigset __user *,usig)22367a074e96SChristoph Hellwig SYSCALL_DEFINE6(io_pgetevents,
22377a074e96SChristoph Hellwig 		aio_context_t, ctx_id,
22387a074e96SChristoph Hellwig 		long, min_nr,
22397a074e96SChristoph Hellwig 		long, nr,
22407a074e96SChristoph Hellwig 		struct io_event __user *, events,
22417a35397fSDeepa Dinamani 		struct __kernel_timespec __user *, timeout,
22427a074e96SChristoph Hellwig 		const struct __aio_sigset __user *, usig)
22437a074e96SChristoph Hellwig {
22447a074e96SChristoph Hellwig 	struct __aio_sigset	ksig = { NULL, };
22457a074e96SChristoph Hellwig 	struct timespec64	ts;
224697abc889SOleg Nesterov 	bool interrupted;
22477a074e96SChristoph Hellwig 	int ret;
22487a074e96SChristoph Hellwig 
22497a074e96SChristoph Hellwig 	if (timeout && unlikely(get_timespec64(&ts, timeout)))
22507a074e96SChristoph Hellwig 		return -EFAULT;
22517a074e96SChristoph Hellwig 
22527a074e96SChristoph Hellwig 	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
22537a074e96SChristoph Hellwig 		return -EFAULT;
22547a074e96SChristoph Hellwig 
2255b772434bSOleg Nesterov 	ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
22567a35397fSDeepa Dinamani 	if (ret)
22577a35397fSDeepa Dinamani 		return ret;
22587a074e96SChristoph Hellwig 
22597a074e96SChristoph Hellwig 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
226097abc889SOleg Nesterov 
226197abc889SOleg Nesterov 	interrupted = signal_pending(current);
2262b772434bSOleg Nesterov 	restore_saved_sigmask_unless(interrupted);
226397abc889SOleg Nesterov 	if (interrupted && !ret)
22647a074e96SChristoph Hellwig 		ret = -ERESTARTNOHAND;
22657a074e96SChristoph Hellwig 
22667a074e96SChristoph Hellwig 	return ret;
22671da177e4SLinus Torvalds }
2268c00d2c7eSAl Viro 
22697a35397fSDeepa Dinamani #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT)
22707a35397fSDeepa Dinamani 
SYSCALL_DEFINE6(io_pgetevents_time32,aio_context_t,ctx_id,long,min_nr,long,nr,struct io_event __user *,events,struct old_timespec32 __user *,timeout,const struct __aio_sigset __user *,usig)22717a35397fSDeepa Dinamani SYSCALL_DEFINE6(io_pgetevents_time32,
22727a35397fSDeepa Dinamani 		aio_context_t, ctx_id,
22737a35397fSDeepa Dinamani 		long, min_nr,
22747a35397fSDeepa Dinamani 		long, nr,
22757a35397fSDeepa Dinamani 		struct io_event __user *, events,
22767a35397fSDeepa Dinamani 		struct old_timespec32 __user *, timeout,
22777a35397fSDeepa Dinamani 		const struct __aio_sigset __user *, usig)
22787a35397fSDeepa Dinamani {
22797a35397fSDeepa Dinamani 	struct __aio_sigset	ksig = { NULL, };
22807a35397fSDeepa Dinamani 	struct timespec64	ts;
228197abc889SOleg Nesterov 	bool interrupted;
22827a35397fSDeepa Dinamani 	int ret;
22837a35397fSDeepa Dinamani 
22847a35397fSDeepa Dinamani 	if (timeout && unlikely(get_old_timespec32(&ts, timeout)))
22857a35397fSDeepa Dinamani 		return -EFAULT;
22867a35397fSDeepa Dinamani 
22877a35397fSDeepa Dinamani 	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
22887a35397fSDeepa Dinamani 		return -EFAULT;
22897a35397fSDeepa Dinamani 
2290ded653ccSDeepa Dinamani 
2291b772434bSOleg Nesterov 	ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
2292ded653ccSDeepa Dinamani 	if (ret)
2293ded653ccSDeepa Dinamani 		return ret;
22941da177e4SLinus Torvalds 
22951da177e4SLinus Torvalds 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
229697abc889SOleg Nesterov 
229797abc889SOleg Nesterov 	interrupted = signal_pending(current);
2298b772434bSOleg Nesterov 	restore_saved_sigmask_unless(interrupted);
229997abc889SOleg Nesterov 	if (interrupted && !ret)
23001da177e4SLinus Torvalds 		ret = -ERESTARTNOHAND;
23011da177e4SLinus Torvalds 
23021da177e4SLinus Torvalds 	return ret;
23031da177e4SLinus Torvalds }
2304c00d2c7eSAl Viro 
23057a35397fSDeepa Dinamani #endif
23067a35397fSDeepa Dinamani 
23077a35397fSDeepa Dinamani #if defined(CONFIG_COMPAT_32BIT_TIME)
23087a35397fSDeepa Dinamani 
SYSCALL_DEFINE5(io_getevents_time32,__u32,ctx_id,__s32,min_nr,__s32,nr,struct io_event __user *,events,struct old_timespec32 __user *,timeout)23098dabe724SArnd Bergmann SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id,
23108dabe724SArnd Bergmann 		__s32, min_nr,
23118dabe724SArnd Bergmann 		__s32, nr,
2312c00d2c7eSAl Viro 		struct io_event __user *, events,
23139afc5eeeSArnd Bergmann 		struct old_timespec32 __user *, timeout)
2314c00d2c7eSAl Viro {
2315fa2e62a5SDeepa Dinamani 	struct timespec64 t;
23167a074e96SChristoph Hellwig 	int ret;
2317c00d2c7eSAl Viro 
23189afc5eeeSArnd Bergmann 	if (timeout && get_old_timespec32(&t, timeout))
2319c00d2c7eSAl Viro 		return -EFAULT;
2320c00d2c7eSAl Viro 
23217a074e96SChristoph Hellwig 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
23227a074e96SChristoph Hellwig 	if (!ret && signal_pending(current))
23237a074e96SChristoph Hellwig 		ret = -EINTR;
23247a074e96SChristoph Hellwig 	return ret;
2325c00d2c7eSAl Viro }
2326fa2e62a5SDeepa Dinamani 
23277a35397fSDeepa Dinamani #endif
23287a35397fSDeepa Dinamani 
23297a35397fSDeepa Dinamani #ifdef CONFIG_COMPAT
23307a074e96SChristoph Hellwig 
23317a074e96SChristoph Hellwig struct __compat_aio_sigset {
233297eba80fSGuillem Jover 	compat_uptr_t		sigmask;
23337a074e96SChristoph Hellwig 	compat_size_t		sigsetsize;
23347a074e96SChristoph Hellwig };
23357a074e96SChristoph Hellwig 
23367a35397fSDeepa Dinamani #if defined(CONFIG_COMPAT_32BIT_TIME)
23377a35397fSDeepa Dinamani 
COMPAT_SYSCALL_DEFINE6(io_pgetevents,compat_aio_context_t,ctx_id,compat_long_t,min_nr,compat_long_t,nr,struct io_event __user *,events,struct old_timespec32 __user *,timeout,const struct __compat_aio_sigset __user *,usig)23387a074e96SChristoph Hellwig COMPAT_SYSCALL_DEFINE6(io_pgetevents,
23397a074e96SChristoph Hellwig 		compat_aio_context_t, ctx_id,
23407a074e96SChristoph Hellwig 		compat_long_t, min_nr,
23417a074e96SChristoph Hellwig 		compat_long_t, nr,
23427a074e96SChristoph Hellwig 		struct io_event __user *, events,
23439afc5eeeSArnd Bergmann 		struct old_timespec32 __user *, timeout,
23447a074e96SChristoph Hellwig 		const struct __compat_aio_sigset __user *, usig)
23457a074e96SChristoph Hellwig {
234697eba80fSGuillem Jover 	struct __compat_aio_sigset ksig = { 0, };
23477a074e96SChristoph Hellwig 	struct timespec64 t;
234897abc889SOleg Nesterov 	bool interrupted;
23497a074e96SChristoph Hellwig 	int ret;
23507a074e96SChristoph Hellwig 
23519afc5eeeSArnd Bergmann 	if (timeout && get_old_timespec32(&t, timeout))
23527a074e96SChristoph Hellwig 		return -EFAULT;
23537a074e96SChristoph Hellwig 
23547a074e96SChristoph Hellwig 	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
23557a074e96SChristoph Hellwig 		return -EFAULT;
23567a074e96SChristoph Hellwig 
235797eba80fSGuillem Jover 	ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize);
2358ded653ccSDeepa Dinamani 	if (ret)
2359ded653ccSDeepa Dinamani 		return ret;
23607a074e96SChristoph Hellwig 
23617a074e96SChristoph Hellwig 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
236297abc889SOleg Nesterov 
236397abc889SOleg Nesterov 	interrupted = signal_pending(current);
2364b772434bSOleg Nesterov 	restore_saved_sigmask_unless(interrupted);
236597abc889SOleg Nesterov 	if (interrupted && !ret)
23667a074e96SChristoph Hellwig 		ret = -ERESTARTNOHAND;
23677a074e96SChristoph Hellwig 
23687a074e96SChristoph Hellwig 	return ret;
23691da177e4SLinus Torvalds }
23701da177e4SLinus Torvalds 
23717a35397fSDeepa Dinamani #endif
23727a35397fSDeepa Dinamani 
COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,compat_aio_context_t,ctx_id,compat_long_t,min_nr,compat_long_t,nr,struct io_event __user *,events,struct __kernel_timespec __user *,timeout,const struct __compat_aio_sigset __user *,usig)23737a35397fSDeepa Dinamani COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
23747a35397fSDeepa Dinamani 		compat_aio_context_t, ctx_id,
23757a35397fSDeepa Dinamani 		compat_long_t, min_nr,
23767a35397fSDeepa Dinamani 		compat_long_t, nr,
23777a35397fSDeepa Dinamani 		struct io_event __user *, events,
23787a35397fSDeepa Dinamani 		struct __kernel_timespec __user *, timeout,
23797a35397fSDeepa Dinamani 		const struct __compat_aio_sigset __user *, usig)
23807a35397fSDeepa Dinamani {
238197eba80fSGuillem Jover 	struct __compat_aio_sigset ksig = { 0, };
23827a35397fSDeepa Dinamani 	struct timespec64 t;
238397abc889SOleg Nesterov 	bool interrupted;
23847a35397fSDeepa Dinamani 	int ret;
23857a35397fSDeepa Dinamani 
23867a35397fSDeepa Dinamani 	if (timeout && get_timespec64(&t, timeout))
23877a35397fSDeepa Dinamani 		return -EFAULT;
23887a35397fSDeepa Dinamani 
23897a35397fSDeepa Dinamani 	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
23907a35397fSDeepa Dinamani 		return -EFAULT;
23917a35397fSDeepa Dinamani 
239297eba80fSGuillem Jover 	ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize);
23937a35397fSDeepa Dinamani 	if (ret)
23947a35397fSDeepa Dinamani 		return ret;
23957a35397fSDeepa Dinamani 
23967a35397fSDeepa Dinamani 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
239797abc889SOleg Nesterov 
239897abc889SOleg Nesterov 	interrupted = signal_pending(current);
2399b772434bSOleg Nesterov 	restore_saved_sigmask_unless(interrupted);
240097abc889SOleg Nesterov 	if (interrupted && !ret)
24017a35397fSDeepa Dinamani 		ret = -ERESTARTNOHAND;
24027a35397fSDeepa Dinamani 
24031da177e4SLinus Torvalds 	return ret;
24041da177e4SLinus Torvalds }
24051da177e4SLinus Torvalds #endif
2406