1f3b44f92SJens Axboe // SPDX-License-Identifier: GPL-2.0
2f3b44f92SJens Axboe #include <linux/kernel.h>
3f3b44f92SJens Axboe #include <linux/errno.h>
4f3b44f92SJens Axboe #include <linux/fs.h>
5f3b44f92SJens Axboe #include <linux/file.h>
6f3b44f92SJens Axboe #include <linux/blk-mq.h>
7f3b44f92SJens Axboe #include <linux/mm.h>
8f3b44f92SJens Axboe #include <linux/slab.h>
9f3b44f92SJens Axboe #include <linux/fsnotify.h>
10f3b44f92SJens Axboe #include <linux/poll.h>
11f3b44f92SJens Axboe #include <linux/nospec.h>
12f3b44f92SJens Axboe #include <linux/compat.h>
13f3b44f92SJens Axboe #include <linux/io_uring.h>
14f3b44f92SJens Axboe
15f3b44f92SJens Axboe #include <uapi/linux/io_uring.h>
16f3b44f92SJens Axboe
17f3b44f92SJens Axboe #include "io_uring.h"
18f3b44f92SJens Axboe #include "opdef.h"
19f3b44f92SJens Axboe #include "kbuf.h"
20f3b44f92SJens Axboe #include "rsrc.h"
21f3b44f92SJens Axboe #include "rw.h"
22f3b44f92SJens Axboe
struct io_rw {
	/* NOTE: kiocb has the file as the first member, so don't do it here */
	struct kiocb kiocb;
	/* userspace address from the SQE: buffer, or iovec array for readv/writev */
	u64 addr;
	/* buffer length in bytes, or iovec count for vectored ops */
	u32 len;
	/* RWF_* flags copied from sqe->rw_flags */
	rwf_t flags;
};
30f3b44f92SJens Axboe
io_file_supports_nowait(struct io_kiocb * req)31f3b44f92SJens Axboe static inline bool io_file_supports_nowait(struct io_kiocb *req)
32f3b44f92SJens Axboe {
33f3b44f92SJens Axboe return req->flags & REQ_F_SUPPORT_NOWAIT;
34f3b44f92SJens Axboe }
35f3b44f92SJens Axboe
364ab9d465SDylan Yudaken #ifdef CONFIG_COMPAT
/*
 * Compat variant of the buffer-select iovec prep: read iov_len from the
 * single compat_iovec at rw->addr and stash it in rw->len. Only the
 * length is consumed here; iov_base is not read.
 */
static int io_iov_compat_buffer_select_prep(struct io_rw *rw)
{
	struct compat_iovec __user *uiov;
	compat_ssize_t clen;

	uiov = u64_to_user_ptr(rw->addr);
	if (!access_ok(uiov, sizeof(*uiov)))
		return -EFAULT;
	/* __get_user is safe here, access_ok was done above */
	if (__get_user(clen, &uiov->iov_len))
		return -EFAULT;
	/* reject negative lengths before the unsigned assignment below */
	if (clen < 0)
		return -EINVAL;

	rw->len = clen;
	return 0;
}
534ab9d465SDylan Yudaken #endif
544ab9d465SDylan Yudaken
/*
 * Prep for IORING_OP_READV with REQ_F_BUFFER_SELECT: the SQE points at a
 * single user iovec whose iov_len is copied into rw->len. Called from
 * io_prep_rw() so the length is validated before buffer selection can
 * change rw->len at issue time.
 */
static int io_iov_buffer_select_prep(struct io_kiocb *req)
{
	struct iovec __user *uiov;
	struct iovec iov;
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

	/* buffer selection only supports a single iovec */
	if (rw->len != 1)
		return -EINVAL;

#ifdef CONFIG_COMPAT
	/* compat tasks pass a compat_iovec, which has a different layout */
	if (req->ctx->compat)
		return io_iov_compat_buffer_select_prep(rw);
#endif

	uiov = u64_to_user_ptr(rw->addr);
	if (copy_from_user(&iov, uiov, sizeof(*uiov)))
		return -EFAULT;
	/* only the iov_len is used here */
	rw->len = iov.iov_len;
	return 0;
}
754ab9d465SDylan Yudaken
/*
 * Common prep for all read/write opcodes: pull position, buffer info,
 * ioprio and flags out of the SQE into the request. For fixed-buffer
 * opcodes, also resolve and pin the registered buffer.
 */
int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	unsigned ioprio;
	int ret;

	rw->kiocb.ki_pos = READ_ONCE(sqe->off);
	/* used for fixed read/write too - just read unconditionally */
	req->buf_index = READ_ONCE(sqe->buf_index);

	if (req->opcode == IORING_OP_READ_FIXED ||
	    req->opcode == IORING_OP_WRITE_FIXED) {
		struct io_ring_ctx *ctx = req->ctx;
		u16 index;

		if (unlikely(req->buf_index >= ctx->nr_user_bufs))
			return -EFAULT;
		/* buf_index is user-controlled, clamp it against speculation */
		index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
		req->imu = ctx->user_bufs[index];
		io_req_set_rsrc_node(req, ctx, 0);
	}

	ioprio = READ_ONCE(sqe->ioprio);
	if (ioprio) {
		/* explicit ioprio in the SQE needs a capability check */
		ret = ioprio_check_cap(ioprio);
		if (ret)
			return ret;

		rw->kiocb.ki_ioprio = ioprio;
	} else {
		rw->kiocb.ki_ioprio = get_current_ioprio();
	}
	rw->kiocb.dio_complete = NULL;

	rw->addr = READ_ONCE(sqe->addr);
	rw->len = READ_ONCE(sqe->len);
	rw->flags = READ_ONCE(sqe->rw_flags);

	/*
	 * Have to do this validation here, as by the time io_read() runs,
	 * rw->len might have changed due to buffer selection.
	 */
	if (req->opcode == IORING_OP_READV && req->flags & REQ_F_BUFFER_SELECT) {
		ret = io_iov_buffer_select_prep(req);
		if (ret)
			return ret;
	}

	return 0;
}
125f3b44f92SJens Axboe
/* Free the heap iovec (if any) saved in the request's async data */
void io_readv_writev_cleanup(struct io_kiocb *req)
{
	struct io_async_rw *io = req->async_data;

	kfree(io->free_iovec);
}
132f3b44f92SJens Axboe
/*
 * Hand the final result to ->ki_complete(), unless the IO was queued
 * asynchronously (-EIOCBQUEUED), in which case completion happens later.
 */
static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
{
	if (ret == -EIOCBQUEUED)
		return;

	/*
	 * We can't just restart the syscall, since previously submitted
	 * sqes may already be in progress. Just fail this IO with EINTR.
	 */
	if (ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
	    ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)
		ret = -EINTR;

	kiocb->ki_complete(kiocb, ret);
}
153f3b44f92SJens Axboe
/*
 * Resolve the file offset to use for this request. A ki_pos of -1 means
 * "use the file's current position": f_pos is copied in and REQ_F_CUR_POS
 * is set so completion writes the updated offset back. Returns NULL for
 * stream-like files, which have no position at all.
 */
static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

	if (rw->kiocb.ki_pos != -1)
		return &rw->kiocb.ki_pos;

	if (!(req->file->f_mode & FMODE_STREAM)) {
		req->flags |= REQ_F_CUR_POS;
		rw->kiocb.ki_pos = req->file->f_pos;
		return &rw->kiocb.ki_pos;
	}

	rw->kiocb.ki_pos = 0;
	return NULL;
}
170f3b44f92SJens Axboe
/* Queue the request for reissue via io-wq, through task_work */
static void io_req_task_queue_reissue(struct io_kiocb *req)
{
	req->io_task_work.func = io_queue_iowq;
	io_req_task_work_add(req);
}
176f3b44f92SJens Axboe
177f3b44f92SJens Axboe #ifdef CONFIG_BLOCK
/*
 * Make the request ready for reissue. Returns true on success. If async
 * data isn't set up yet, do the async prep now; otherwise rewind the
 * iterator to the state saved before the failed attempt.
 */
static bool io_resubmit_prep(struct io_kiocb *req)
{
	struct io_async_rw *io = req->async_data;

	if (!req_has_async_data(req))
		return !io_req_prep_async(req);
	iov_iter_restore(&io->s.iter, &io->s.iter_state);
	return true;
}
187f3b44f92SJens Axboe
/*
 * Decide whether a failed (e.g. -EAGAIN) block/regular-file IO may be
 * retried by resubmitting, rather than failing back to userspace.
 */
static bool io_rw_should_reissue(struct io_kiocb *req)
{
	umode_t mode = file_inode(req->file)->i_mode;
	struct io_ring_ctx *ctx = req->ctx;

	/* only regular files and block devices are reissue candidates */
	if (!S_ISBLK(mode) && !S_ISREG(mode))
		return false;
	if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
	    !(ctx->flags & IORING_SETUP_IOPOLL)))
		return false;
	/*
	 * If ref is dying, we might be running poll reap from the exit work.
	 * Don't attempt to reissue from that path, just let it fail with
	 * -EAGAIN.
	 */
	if (percpu_ref_is_dying(&ctx->refs))
		return false;
	/*
	 * Play it safe and assume not safe to re-import and reissue if we're
	 * not in the original thread group (or in task context).
	 */
	if (!same_thread_group(req->task, current) || !in_task())
		return false;
	return true;
}
213f3b44f92SJens Axboe #else
/* !CONFIG_BLOCK stub: reissue is never possible */
static bool io_resubmit_prep(struct io_kiocb *req)
{
	return false;
}
/* !CONFIG_BLOCK stub: never reissue */
static bool io_rw_should_reissue(struct io_kiocb *req)
{
	return false;
}
222f3b44f92SJens Axboe #endif
223f3b44f92SJens Axboe
io_req_end_write(struct io_kiocb * req)224a370167fSAmir Goldstein static void io_req_end_write(struct io_kiocb *req)
225f3b44f92SJens Axboe {
226f3b44f92SJens Axboe if (req->flags & REQ_F_ISREG) {
227e484fd73SAmir Goldstein struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
228f3b44f92SJens Axboe
229e484fd73SAmir Goldstein kiocb_end_write(&rw->kiocb);
230f3b44f92SJens Axboe }
231f3b44f92SJens Axboe }
232f3b44f92SJens Axboe
2332ec33a6cSJens Axboe /*
2342ec33a6cSJens Axboe * Trigger the notifications after having done some IO, and finish the write
2352ec33a6cSJens Axboe * accounting, if any.
2362ec33a6cSJens Axboe */
io_req_io_end(struct io_kiocb * req)2372ec33a6cSJens Axboe static void io_req_io_end(struct io_kiocb *req)
2382ec33a6cSJens Axboe {
2392ec33a6cSJens Axboe struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
2402ec33a6cSJens Axboe
2412ec33a6cSJens Axboe if (rw->kiocb.ki_flags & IOCB_WRITE) {
242a370167fSAmir Goldstein io_req_end_write(req);
2432ec33a6cSJens Axboe fsnotify_modify(req->file);
2442ec33a6cSJens Axboe } else {
2452ec33a6cSJens Axboe fsnotify_access(req->file);
2462ec33a6cSJens Axboe }
2472ec33a6cSJens Axboe }
2482ec33a6cSJens Axboe
/*
 * Common completion handling. Returns true if the request is being set up
 * for reissue, in which case the caller must NOT complete it now.
 */
static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
	/* res differing from cqe.res means a short or failed transfer */
	if (unlikely(res != req->cqe.res)) {
		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
		    io_rw_should_reissue(req)) {
			/*
			 * Reissue will start accounting again, finish the
			 * current cycle.
			 */
			io_req_io_end(req);
			req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
			return true;
		}
		req_set_fail(req);
		req->cqe.res = res;
	}
	return false;
}
267f3b44f92SJens Axboe
io_fixup_rw_res(struct io_kiocb * req,long res)26862bb0647SPavel Begunkov static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
2694d9cb92cSPavel Begunkov {
2704d9cb92cSPavel Begunkov struct io_async_rw *io = req->async_data;
2714d9cb92cSPavel Begunkov
2724d9cb92cSPavel Begunkov /* add previously done IO, if any */
2734d9cb92cSPavel Begunkov if (req_has_async_data(req) && io->bytes_done > 0) {
2744d9cb92cSPavel Begunkov if (res < 0)
2754d9cb92cSPavel Begunkov res = io->bytes_done;
2764d9cb92cSPavel Begunkov else
2774d9cb92cSPavel Begunkov res += io->bytes_done;
2784d9cb92cSPavel Begunkov }
2794d9cb92cSPavel Begunkov return res;
2804d9cb92cSPavel Begunkov }
2814d9cb92cSPavel Begunkov
/* task_work completion handler for read/write requests */
void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct kiocb *kiocb = &rw->kiocb;

	/*
	 * With IOCB_DIO_CALLER_COMP the lower layer deferred the final
	 * result to ->dio_complete(); fetch it now and fold in any bytes
	 * already done.
	 */
	if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) {
		long res = kiocb->dio_complete(rw->kiocb.private);

		io_req_set_res(req, io_fixup_rw_res(req, res), 0);
	}

	io_req_io_end(req);

	/* release any selected/provided buffer before posting the CQE */
	if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
		unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED;

		req->cqe.flags |= io_put_kbuf(req, issue_flags);
	}
	io_req_task_complete(req, ts);
}
302b000145eSJens Axboe
/*
 * ->ki_complete callback for non-polled IO. Records the result (unless
 * it is deferred via dio_complete) and punts final completion to task
 * context through task_work.
 */
static void io_complete_rw(struct kiocb *kiocb, long res)
{
	struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
	struct io_kiocb *req = cmd_to_io_kiocb(rw);

	/* if dio_complete will provide the result later, skip it here */
	if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) {
		if (__io_complete_rw_common(req, res))
			return;
		io_req_set_res(req, io_fixup_rw_res(req, res), 0);
	}
	req->io_task_work.func = io_req_rw_complete;
	__io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
}
316f3b44f92SJens Axboe
/*
 * ->ki_complete callback for IOPOLL. No CQE is posted here; the result is
 * recorded and ->iopoll_completed is flagged for the poll loop to reap.
 */
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
{
	struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
	struct io_kiocb *req = cmd_to_io_kiocb(rw);

	if (kiocb->ki_flags & IOCB_WRITE)
		io_req_end_write(req);
	if (unlikely(res != req->cqe.res)) {
		/* -EAGAIN may be retried instead of completed with an error */
		if (res == -EAGAIN && io_rw_should_reissue(req)) {
			req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
			return;
		}
		req->cqe.res = res;
	}

	/* order with io_iopoll_complete() checking ->iopoll_completed */
	smp_store_release(&req->iopoll_completed, 1);
}
335f3b44f92SJens Axboe
/*
 * Finish a read/write issued inline from the submission path. Returns
 * IOU_OK when the completion was posted here, or IOU_ISSUE_SKIP_COMPLETE
 * when completion happens elsewhere (async ->ki_complete or reissue).
 */
static int kiocb_done(struct io_kiocb *req, ssize_t ret,
		      unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	unsigned final_ret = io_fixup_rw_res(req, ret);

	/* write back the updated offset for "use current position" requests */
	if (ret >= 0 && req->flags & REQ_F_CUR_POS)
		req->file->f_pos = rw->kiocb.ki_pos;
	if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) {
		if (!__io_complete_rw_common(req, ret)) {
			/*
			 * Safe to call io_end from here as we're inline
			 * from the submission path.
			 */
			io_req_io_end(req);
			io_req_set_res(req, final_ret,
				       io_put_kbuf(req, issue_flags));
			return IOU_OK;
		}
	} else {
		io_rw_done(&rw->kiocb, ret);
	}

	/* __io_complete_rw_common() may have flagged the request for reissue */
	if (req->flags & REQ_F_REISSUE) {
		req->flags &= ~REQ_F_REISSUE;
		if (io_resubmit_prep(req))
			io_req_task_queue_reissue(req);
		else
			io_req_task_queue_fail(req, final_ret);
	}
	return IOU_ISSUE_SKIP_COMPLETE;
}
368f3b44f92SJens Axboe
/*
 * Build the iov_iter for this request. Returns a heap-allocated iovec
 * (which the caller must eventually free) when one was needed, NULL when
 * the data was mapped without one, or an ERR_PTR() on failure.
 */
static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
				       struct io_rw_state *s,
				       unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct iov_iter *iter = &s->iter;
	u8 opcode = req->opcode;
	struct iovec *iovec;
	void __user *buf;
	size_t sqe_len;
	ssize_t ret;

	/* fixed opcodes map the registered buffer directly, no iovec needed */
	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
		ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	buf = u64_to_user_ptr(rw->addr);
	sqe_len = rw->len;

	/* non-vectored IO, or any IO using buffer selection: single range */
	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE ||
	    (req->flags & REQ_F_BUFFER_SELECT)) {
		if (io_do_buffer_select(req)) {
			buf = io_buffer_select(req, &sqe_len, issue_flags);
			if (!buf)
				return ERR_PTR(-ENOBUFS);
			/* remember the selected buffer for retries */
			rw->addr = (unsigned long) buf;
			rw->len = sqe_len;
		}

		ret = import_ubuf(ddir, buf, sqe_len, iter);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	/* vectored path: import the user iovec array, fast_iov if it fits */
	iovec = s->fast_iov;
	ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
			     req->ctx->compat);
	if (unlikely(ret < 0))
		return ERR_PTR(ret);
	return iovec;
}
414f3b44f92SJens Axboe
io_import_iovec(int rw,struct io_kiocb * req,struct iovec ** iovec,struct io_rw_state * s,unsigned int issue_flags)415f3b44f92SJens Axboe static inline int io_import_iovec(int rw, struct io_kiocb *req,
416f3b44f92SJens Axboe struct iovec **iovec, struct io_rw_state *s,
417f3b44f92SJens Axboe unsigned int issue_flags)
418f3b44f92SJens Axboe {
419f3b44f92SJens Axboe *iovec = __io_import_iovec(rw, req, s, issue_flags);
42081594e7eSDmitrii Bundin if (IS_ERR(*iovec))
421f3b44f92SJens Axboe return PTR_ERR(*iovec);
422f3b44f92SJens Axboe
423f3b44f92SJens Axboe iov_iter_save_state(&s->iter, &s->iter_state);
424f3b44f92SJens Axboe return 0;
425f3b44f92SJens Axboe }
426f3b44f92SJens Axboe
io_kiocb_ppos(struct kiocb * kiocb)427f3b44f92SJens Axboe static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
428f3b44f92SJens Axboe {
429f3b44f92SJens Axboe return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
430f3b44f92SJens Axboe }
431f3b44f92SJens Axboe
432f3b44f92SJens Axboe /*
433f3b44f92SJens Axboe * For files that don't have ->read_iter() and ->write_iter(), handle them
434f3b44f92SJens Axboe * by looping over ->read() or ->write() manually.
435f3b44f92SJens Axboe */
/*
 * For files that don't have ->read_iter() and ->write_iter(), handle them
 * by looping over ->read() or ->write() manually.
 */
static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
{
	struct kiocb *kiocb = &rw->kiocb;
	struct file *file = kiocb->ki_filp;
	ssize_t ret = 0;
	loff_t *ppos;

	/*
	 * Don't support polled IO through this interface, and we can't
	 * support non-blocking either. For the latter, this just causes
	 * the kiocb to be handled from an async context.
	 */
	if (kiocb->ki_flags & IOCB_HIPRI)
		return -EOPNOTSUPP;
	if ((kiocb->ki_flags & IOCB_NOWAIT) &&
	    !(kiocb->ki_filp->f_flags & O_NONBLOCK))
		return -EAGAIN;

	ppos = io_kiocb_ppos(kiocb);

	while (iov_iter_count(iter)) {
		void __user *addr;
		size_t len;
		ssize_t nr;

		if (iter_is_ubuf(iter)) {
			/* single user buffer: offset directly into it */
			addr = iter->ubuf + iter->iov_offset;
			len = iov_iter_count(iter);
		} else if (!iov_iter_is_bvec(iter)) {
			/* iovec-backed: use the current segment */
			addr = iter_iov_addr(iter);
			len = iter_iov_len(iter);
		} else {
			/*
			 * bvec-backed: progress is tracked in rw->addr and
			 * rw->len (updated below), use those.
			 */
			addr = u64_to_user_ptr(rw->addr);
			len = rw->len;
		}

		if (ddir == READ)
			nr = file->f_op->read(file, addr, len, ppos);
		else
			nr = file->f_op->write(file, addr, len, ppos);

		if (nr < 0) {
			/* report an error only if nothing was transferred */
			if (!ret)
				ret = nr;
			break;
		}
		ret += nr;
		if (!iov_iter_is_bvec(iter)) {
			iov_iter_advance(iter, nr);
		} else {
			rw->addr += nr;
			rw->len -= nr;
			if (!rw->len)
				break;
		}
		/* short transfer: stop and return what we have so far */
		if (nr != len)
			break;
	}

	return ret;
}
497f3b44f92SJens Axboe
/*
 * Snapshot the iterator (and its backing iovec, if any) into the
 * request's async data so the IO can be retried from another context.
 * A heap @iovec is handed over to async data; on-stack fast_iov contents
 * are copied so the saved iter stays valid after the caller returns.
 */
static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
			  const struct iovec *fast_iov, struct iov_iter *iter)
{
	struct io_async_rw *io = req->async_data;

	memcpy(&io->s.iter, iter, sizeof(*iter));
	io->free_iovec = iovec;
	io->bytes_done = 0;
	/* can only be fixed buffers, no need to do anything */
	if (iov_iter_is_bvec(iter) || iter_is_ubuf(iter))
		return;
	if (!iovec) {
		unsigned iov_off = 0;

		/* repoint the copied iter at our own fast_iov storage */
		io->s.iter.__iov = io->s.fast_iov;
		if (iter->__iov != fast_iov) {
			/* preserve the iter's progress within the array */
			iov_off = iter_iov(iter) - fast_iov;
			io->s.iter.__iov += iov_off;
		}
		if (io->s.fast_iov != fast_iov)
			memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off,
			       sizeof(struct iovec) * iter->nr_segs);
	} else {
		/* heap iovec now owned by async data, free it at cleanup */
		req->flags |= REQ_F_NEED_CLEANUP;
	}
}
524f3b44f92SJens Axboe
/*
 * Allocate async data (if not already present) and stash the iovec and
 * iterator state for an async retry. Unless @force is set, this is only
 * done for opcodes that have a prep_async handler. Takes ownership of
 * @iovec in all paths, including failure.
 */
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
			     struct io_rw_state *s, bool force)
{
	if (!force && !io_cold_defs[req->opcode].prep_async)
		return 0;
	if (!req_has_async_data(req)) {
		struct io_async_rw *iorw;

		if (io_alloc_async_data(req)) {
			/* allocation failed, we still own the iovec */
			kfree(iovec);
			return -ENOMEM;
		}

		io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
		iorw = req->async_data;
		/* we've copied and mapped the iter, ensure state is saved */
		iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
	}
	return 0;
}
545f3b44f92SJens Axboe
/* Async prep for readv/writev: import the iovec into async data up front */
static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
{
	struct io_async_rw *iorw = req->async_data;
	struct iovec *iov;
	int ret;

	/* initialize so cleanup/completion always see consistent state */
	iorw->bytes_done = 0;
	iorw->free_iovec = NULL;

	/* submission path, ->uring_lock should already be taken */
	ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
	if (unlikely(ret < 0))
		return ret;

	if (iov) {
		/* heap-allocated iovec, free it at cleanup time */
		iorw->free_iovec = iov;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	return 0;
}
567f3b44f92SJens Axboe
/* Async prep for IORING_OP_READV (data flows to userspace) */
int io_readv_prep_async(struct io_kiocb *req)
{
	return io_rw_prep_async(req, ITER_DEST);
}
572f3b44f92SJens Axboe
/* Async prep for IORING_OP_WRITEV (data flows from userspace) */
int io_writev_prep_async(struct io_kiocb *req)
{
	return io_rw_prep_async(req, ITER_SOURCE);
}
577f3b44f92SJens Axboe
578f3b44f92SJens Axboe /*
579f3b44f92SJens Axboe * This is our waitqueue callback handler, registered through __folio_lock_async()
580f3b44f92SJens Axboe * when we initially tried to do the IO with the iocb armed our waitqueue.
581f3b44f92SJens Axboe * This gets called when the page is unlocked, and we generally expect that to
582f3b44f92SJens Axboe * happen when the page IO is completed and the page is now uptodate. This will
583f3b44f92SJens Axboe * queue a task_work based retry of the operation, attempting to copy the data
584f3b44f92SJens Axboe * again. If the latter fails because the page was NOT uptodate, then we will
585f3b44f92SJens Axboe * do a thread based blocking retry of the operation. That's the unexpected
586f3b44f92SJens Axboe * slow path.
587f3b44f92SJens Axboe */
static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
			     int sync, void *arg)
{
	struct wait_page_queue *wpq;
	struct io_kiocb *req = wait->private;
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct wait_page_key *key = arg;

	wpq = container_of(wait, struct wait_page_queue, wait);

	/* only react to the page this request is actually waiting on */
	if (!wake_page_match(wpq, key))
		return 0;

	/* disarm the waitqueue retry and queue a task_work based retry */
	rw->kiocb.ki_flags &= ~IOCB_WAITQ;
	list_del_init(&wait->entry);
	io_req_task_queue(req);
	return 1;
}
606f3b44f92SJens Axboe
607f3b44f92SJens Axboe /*
608f3b44f92SJens Axboe * This controls whether a given IO request should be armed for async page
609f3b44f92SJens Axboe * based retry. If we return false here, the request is handed to the async
610f3b44f92SJens Axboe * worker threads for retry. If we're doing buffered reads on a regular file,
611f3b44f92SJens Axboe * we prepare a private wait_page_queue entry and retry the operation. This
612f3b44f92SJens Axboe * will either succeed because the page is now uptodate and unlocked, or it
613f3b44f92SJens Axboe * will register a callback when the page is unlocked at IO completion. Through
614f3b44f92SJens Axboe * that callback, io_uring uses task_work to setup a retry of the operation.
615f3b44f92SJens Axboe * That retry will attempt the buffered read again. The retry will generally
616f3b44f92SJens Axboe * succeed, or in rare cases where it fails, we then fall back to using the
617f3b44f92SJens Axboe * async worker threads for a blocking retry.
618f3b44f92SJens Axboe */
static bool io_rw_should_retry(struct io_kiocb *req)
{
	struct io_async_rw *io = req->async_data;
	struct wait_page_queue *wait = &io->wpq;
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct kiocb *kiocb = &rw->kiocb;

	/* never retry for NOWAIT, we just complete with -EAGAIN */
	if (req->flags & REQ_F_NOWAIT)
		return false;

	/* Only for buffered IO */
	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
		return false;

	/*
	 * just use poll if we can, and don't attempt if the fs doesn't
	 * support callback based unlocks
	 */
	if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
		return false;

	/* arm the page waitqueue entry that io_async_buf_func() will match */
	wait->wait.func = io_async_buf_func;
	wait->wait.private = req;
	wait->wait.flags = 0;
	INIT_LIST_HEAD(&wait->wait.entry);
	kiocb->ki_flags |= IOCB_WAITQ;
	/* WAITQ implies a blocking unlock path, so NOWAIT must go */
	kiocb->ki_flags &= ~IOCB_NOWAIT;
	kiocb->ki_waitq = wait;
	return true;
}
650f3b44f92SJens Axboe
io_iter_do_read(struct io_rw * rw,struct iov_iter * iter)651f3b44f92SJens Axboe static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter)
652f3b44f92SJens Axboe {
653f3b44f92SJens Axboe struct file *file = rw->kiocb.ki_filp;
654f3b44f92SJens Axboe
655f3b44f92SJens Axboe if (likely(file->f_op->read_iter))
656f3b44f92SJens Axboe return call_read_iter(file, &rw->kiocb, iter);
657f3b44f92SJens Axboe else if (file->f_op->read)
658f3b44f92SJens Axboe return loop_rw_iter(READ, rw, iter);
659f3b44f92SJens Axboe else
660f3b44f92SJens Axboe return -EINVAL;
661f3b44f92SJens Axboe }
662f3b44f92SJens Axboe
need_complete_io(struct io_kiocb * req)6634e17aaabSStefan Roesch static bool need_complete_io(struct io_kiocb *req)
664f3b44f92SJens Axboe {
665f3b44f92SJens Axboe return req->flags & REQ_F_ISREG ||
666f3b44f92SJens Axboe S_ISBLK(file_inode(req->file)->i_mode);
667f3b44f92SJens Axboe }
668f3b44f92SJens Axboe
/*
 * Validate the target file against the requested access mode and set up
 * the kiocb flags and completion handler for this request. Returns 0 on
 * success or a negative error code.
 */
static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct kiocb *kiocb = &rw->kiocb;
	struct io_ring_ctx *ctx = req->ctx;
	struct file *file = req->file;
	int ret;

	/* no file assigned, or it wasn't opened for this direction */
	if (unlikely(!file || !(file->f_mode & mode)))
		return -EBADF;

	if (!(req->flags & REQ_F_FIXED_FILE))
		req->flags |= io_file_get_flags(file);

	/* start from the file's own iocb flags, then apply per-request RWF_* */
	kiocb->ki_flags = file->f_iocb_flags;
	ret = kiocb_set_rw_flags(kiocb, rw->flags);
	if (unlikely(ret))
		return ret;
	kiocb->ki_flags |= IOCB_ALLOC_CACHE;

	/*
	 * If the file is marked O_NONBLOCK, still allow retry for it if it
	 * supports async. Otherwise it's impossible to use O_NONBLOCK files
	 * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
	 */
	if ((kiocb->ki_flags & IOCB_NOWAIT) ||
	    ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
		req->flags |= REQ_F_NOWAIT;

	if (ctx->flags & IORING_SETUP_IOPOLL) {
		/* IOPOLL rings require O_DIRECT files with an ->iopoll method */
		if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
			return -EOPNOTSUPP;

		kiocb->private = NULL;
		kiocb->ki_flags |= IOCB_HIPRI;
		kiocb->ki_complete = io_complete_rw_iopoll;
		req->iopoll_completed = 0;
	} else {
		/* RWF_HIPRI is only meaningful on an IOPOLL ring */
		if (kiocb->ki_flags & IOCB_HIPRI)
			return -EINVAL;
		kiocb->ki_complete = io_complete_rw;
	}

	return 0;
}
714f3b44f92SJens Axboe
/*
 * Issue a read request. May return -EAGAIN to punt to io-wq, or
 * IOU_ISSUE_SKIP_COMPLETE when completion will arrive asynchronously.
 * Short buffered reads are transparently retried inline using the
 * IOCB_WAITQ page-unlock callback machinery (see io_rw_should_retry()).
 */
int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct io_rw_state __s, *s = &__s;
	struct iovec *iovec;
	struct kiocb *kiocb = &rw->kiocb;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	struct io_async_rw *io;
	ssize_t ret, ret2;
	loff_t *ppos;

	if (!req_has_async_data(req)) {
		/* first attempt: map the user buffer(s) into our local state */
		ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags);
		if (unlikely(ret < 0))
			return ret;
	} else {
		io = req->async_data;
		s = &io->s;

		/*
		 * Safe and required to re-import if we're using provided
		 * buffers, as we dropped the selected one before retry.
		 */
		if (io_do_buffer_select(req)) {
			ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags);
			if (unlikely(ret < 0))
				return ret;
		}

		/*
		 * We come here from an earlier attempt, restore our state to
		 * match in case it doesn't. It's cheap enough that we don't
		 * need to make this conditional.
		 */
		iov_iter_restore(&s->iter, &s->iter_state);
		iovec = NULL;
	}
	ret = io_rw_init_file(req, FMODE_READ);
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}
	req->cqe.res = iov_iter_count(&s->iter);

	if (force_nonblock) {
		/* If the file doesn't support async, just async punt */
		if (unlikely(!io_file_supports_nowait(req))) {
			ret = io_setup_async_rw(req, iovec, s, true);
			return ret ?: -EAGAIN;
		}
		kiocb->ki_flags |= IOCB_NOWAIT;
	} else {
		/* Ensure we clear previously set non-block flag */
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	}

	ppos = io_kiocb_update_pos(req);

	ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}

	ret = io_iter_do_read(rw, &s->iter);

	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
		req->flags &= ~REQ_F_REISSUE;
		/* if we can poll, just do that */
		if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
			return -EAGAIN;
		/* IOPOLL retry should happen for io-wq threads */
		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
			goto done;
		/* no retry on NONBLOCK nor RWF_NOWAIT */
		if (req->flags & REQ_F_NOWAIT)
			goto done;
		ret = 0;
	} else if (ret == -EIOCBQUEUED) {
		/* async completion pending; ->ki_complete will finish it */
		if (iovec)
			kfree(iovec);
		return IOU_ISSUE_SKIP_COMPLETE;
	} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
		   (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) {
		/* read all, failed, already did sync or don't want to retry */
		goto done;
	}

	/*
	 * Don't depend on the iter state matching what was consumed, or being
	 * untouched in case of error. Restore it and we'll advance it
	 * manually if we need to.
	 */
	iov_iter_restore(&s->iter, &s->iter_state);

	ret2 = io_setup_async_rw(req, iovec, s, true);
	iovec = NULL;
	if (ret2) {
		/* keep the partial byte count if we had one */
		ret = ret > 0 ? ret : ret2;
		goto done;
	}

	io = req->async_data;
	s = &io->s;
	/*
	 * Now use our persistent iterator and state, if we aren't already.
	 * We've restored and mapped the iter to match.
	 */

	do {
		/*
		 * We end up here because of a partial read, either from
		 * above or inside this loop. Advance the iter by the bytes
		 * that were consumed.
		 */
		iov_iter_advance(&s->iter, ret);
		if (!iov_iter_count(&s->iter))
			break;
		io->bytes_done += ret;
		iov_iter_save_state(&s->iter, &s->iter_state);

		/* if we can retry, do so with the callbacks armed */
		if (!io_rw_should_retry(req)) {
			kiocb->ki_flags &= ~IOCB_WAITQ;
			return -EAGAIN;
		}

		req->cqe.res = iov_iter_count(&s->iter);
		/*
		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
		 * we get -EIOCBQUEUED, then we'll get a notification when the
		 * desired page gets unlocked. We can also get a partial read
		 * here, and if we do, then just retry at the new offset.
		 */
		ret = io_iter_do_read(rw, &s->iter);
		if (ret == -EIOCBQUEUED)
			return IOU_ISSUE_SKIP_COMPLETE;
		/* we got some bytes, but not all. retry. */
		kiocb->ki_flags &= ~IOCB_WAITQ;
		iov_iter_restore(&s->iter, &s->iter_state);
	} while (ret > 0);
done:
	/* it's faster to check here than delegate to kfree */
	if (iovec)
		kfree(iovec);
	return kiocb_done(req, ret, issue_flags);
}
862f3b44f92SJens Axboe
/*
 * Issue a write request. May return -EAGAIN to punt to io-wq. Partial
 * writes to regular files / block devices are handed to the worker with
 * the file position already advanced (see need_complete_io()).
 */
int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct io_rw_state __s, *s = &__s;
	struct iovec *iovec;
	struct kiocb *kiocb = &rw->kiocb;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	ssize_t ret, ret2;
	loff_t *ppos;

	if (!req_has_async_data(req)) {
		/* first attempt: map the user buffer(s) into our local state */
		ret = io_import_iovec(ITER_SOURCE, req, &iovec, s, issue_flags);
		if (unlikely(ret < 0))
			return ret;
	} else {
		struct io_async_rw *io = req->async_data;

		/* retry: reuse the persistent iter, reset to saved state */
		s = &io->s;
		iov_iter_restore(&s->iter, &s->iter_state);
		iovec = NULL;
	}
	ret = io_rw_init_file(req, FMODE_WRITE);
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}
	req->cqe.res = iov_iter_count(&s->iter);

	if (force_nonblock) {
		/* If the file doesn't support async, just async punt */
		if (unlikely(!io_file_supports_nowait(req)))
			goto copy_iov;

		/* File path supports NOWAIT for non-direct_IO only for block devices. */
		if (!(kiocb->ki_flags & IOCB_DIRECT) &&
		    !(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) &&
		    (req->flags & REQ_F_ISREG))
			goto copy_iov;

		kiocb->ki_flags |= IOCB_NOWAIT;
	} else {
		/* Ensure we clear previously set non-block flag */
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	}

	ppos = io_kiocb_update_pos(req);

	ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}

	/* take the sb writer count for regular files before dirtying them */
	if (req->flags & REQ_F_ISREG)
		kiocb_start_write(kiocb);
	kiocb->ki_flags |= IOCB_WRITE;

	if (likely(req->file->f_op->write_iter))
		ret2 = call_write_iter(req->file, kiocb, &s->iter);
	else if (req->file->f_op->write)
		ret2 = loop_rw_iter(WRITE, rw, &s->iter);
	else
		ret2 = -EINVAL;

	if (req->flags & REQ_F_REISSUE) {
		req->flags &= ~REQ_F_REISSUE;
		ret2 = -EAGAIN;
	}

	/*
	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
	 * retry them without IOCB_NOWAIT.
	 */
	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
		ret2 = -EAGAIN;
	/* no retry on NONBLOCK nor RWF_NOWAIT */
	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
		goto done;
	if (!force_nonblock || ret2 != -EAGAIN) {
		/* IOPOLL retry should happen for io-wq threads */
		if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
			goto copy_iov;

		if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
			struct io_async_rw *io;

			trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2,
						req->cqe.res, ret2);

			/* This is a partial write. The file pos has already been
			 * updated, setup the async struct to complete the request
			 * in the worker. Also update bytes_done to account for
			 * the bytes already written.
			 */
			iov_iter_save_state(&s->iter, &s->iter_state);
			ret = io_setup_async_rw(req, iovec, s, true);

			io = req->async_data;
			if (io)
				io->bytes_done += ret2;

			/* drop the sb writer count before punting to the worker */
			if (kiocb->ki_flags & IOCB_WRITE)
				io_req_end_write(req);
			return ret ? ret : -EAGAIN;
		}
done:
		ret = kiocb_done(req, ret2, issue_flags);
	} else {
copy_iov:
		/* -EAGAIN under nonblock: save state and punt to io-wq */
		iov_iter_restore(&s->iter, &s->iter_state);
		ret = io_setup_async_rw(req, iovec, s, false);
		if (!ret) {
			if (kiocb->ki_flags & IOCB_WRITE)
				io_req_end_write(req);
			return -EAGAIN;
		}
		return ret;
	}
	/* it's reportedly faster than delegating the null check to kfree() */
	if (iovec)
		kfree(iovec);
	return ret;
}
986f3b44f92SJens Axboe
io_rw_fail(struct io_kiocb * req)98747b4c686SPavel Begunkov void io_rw_fail(struct io_kiocb *req)
98847b4c686SPavel Begunkov {
98947b4c686SPavel Begunkov int res;
99047b4c686SPavel Begunkov
99147b4c686SPavel Begunkov res = io_fixup_rw_res(req, req->cqe.res);
99247b4c686SPavel Begunkov io_req_set_res(req, res, req->cqe.flags);
99347b4c686SPavel Begunkov }
99447b4c686SPavel Begunkov
/*
 * Reap completions for requests on the ring's IOPOLL list by actively
 * polling the underlying files. Returns the number of events completed,
 * 0 if nothing finished, or a negative error from the poll callback.
 */
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
	struct io_wq_work_node *pos, *start, *prev;
	unsigned int poll_flags = 0;
	DEFINE_IO_COMP_BATCH(iob);
	int nr_events = 0;

	/*
	 * Only spin for completions if we don't have multiple devices hanging
	 * off our complete list.
	 */
	if (ctx->poll_multi_queue || force_nonspin)
		poll_flags |= BLK_POLL_ONESHOT;

	wq_list_for_each(pos, start, &ctx->iopoll_list) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
		struct file *file = req->file;
		int ret;

		/*
		 * Move completed and retryable entries to our local lists.
		 * If we find a request that requires polling, break out
		 * and complete those lists first, if we have entries there.
		 */
		if (READ_ONCE(req->iopoll_completed))
			break;

		/* uring_cmd requests poll via their own file op */
		if (req->opcode == IORING_OP_URING_CMD) {
			struct io_uring_cmd *ioucmd;

			ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
			ret = file->f_op->uring_cmd_iopoll(ioucmd, &iob,
							   poll_flags);
		} else {
			struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

			ret = file->f_op->iopoll(&rw->kiocb, &iob, poll_flags);
		}
		if (unlikely(ret < 0))
			return ret;
		else if (ret)
			poll_flags |= BLK_POLL_ONESHOT;

		/* iopoll may have completed current req */
		if (!rq_list_empty(iob.req_list) ||
		    READ_ONCE(req->iopoll_completed))
			break;
	}

	/* flush any batched block-layer completions gathered above */
	if (!rq_list_empty(iob.req_list))
		iob.complete(&iob);
	else if (!pos)
		return 0;

	prev = start;
	wq_list_for_each_resume(pos, prev) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);

		/* order with io_complete_rw_iopoll(), e.g. ->result updates */
		if (!smp_load_acquire(&req->iopoll_completed))
			break;
		nr_events++;
		req->cqe.flags = io_put_kbuf(req, 0);
	}
	if (unlikely(!nr_events))
		return 0;

	/* detach the completed prefix from the iopoll list and flush CQEs */
	pos = start ? start->next : ctx->iopoll_list.first;
	wq_list_cut(&ctx->iopoll_list, prev, start);

	if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs)))
		return 0;
	ctx->submit_state.compl_reqs.first = pos;
	__io_submit_flush_completions(ctx);
	return nr_events;
}
1071