1 #ifndef IOU_CORE_H 2 #define IOU_CORE_H 3 4 #include <linux/errno.h> 5 #include <linux/lockdep.h> 6 #include <linux/resume_user_mode.h> 7 #include <linux/kasan.h> 8 #include <linux/io_uring_types.h> 9 #include <uapi/linux/eventpoll.h> 10 #include "io-wq.h" 11 #include "slist.h" 12 #include "filetable.h" 13 14 #ifndef CREATE_TRACE_POINTS 15 #include <trace/events/io_uring.h> 16 #endif 17 18 enum { 19 /* 20 * A hint to not wake right away but delay until there are enough of 21 * tw's queued to match the number of CQEs the task is waiting for. 22 * 23 * Must not be used wirh requests generating more than one CQE. 24 * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set. 25 */ 26 IOU_F_TWQ_LAZY_WAKE = 1, 27 }; 28 29 enum { 30 IOU_OK = 0, 31 IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, 32 33 /* 34 * Requeue the task_work to restart operations on this request. The 35 * actual value isn't important, should just be not an otherwise 36 * valid error code, yet less than -MAX_ERRNO and valid internally. 37 */ 38 IOU_REQUEUE = -3072, 39 40 /* 41 * Intended only when both IO_URING_F_MULTISHOT is passed 42 * to indicate to the poll runner that multishot should be 43 * removed and the result is set on req->cqe.res. 44 */ 45 IOU_STOP_MULTISHOT = -ECANCELED, 46 }; 47 48 bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); 49 void io_req_cqe_overflow(struct io_kiocb *req); 50 int io_run_task_work_sig(struct io_ring_ctx *ctx); 51 void io_req_defer_failed(struct io_kiocb *req, s32 res); 52 void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); 53 bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); 54 bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags); 55 void __io_commit_cqring_flush(struct io_ring_ctx *ctx); 56 57 struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); 58 59 struct file *io_file_get_normal(struct io_kiocb *req, int fd); 60 struct file *io_file_get_fixed(struct io_kiocb *req, int fd, 61 unsigned issue_flags); 62 63 void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); 64 bool io_alloc_async_data(struct io_kiocb *req); 65 void io_req_task_queue(struct io_kiocb *req); 66 void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use); 67 void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); 68 void io_req_task_queue_fail(struct io_kiocb *req, int ret); 69 void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); 70 void tctx_task_work(struct callback_head *cb); 71 __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); 72 int io_uring_alloc_task_context(struct task_struct *task, 73 struct io_ring_ctx *ctx); 74 75 int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, 76 int start, int end); 77 78 int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); 79 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); 80 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); 81 void __io_submit_flush_completions(struct io_ring_ctx *ctx); 82 int io_req_prep_async(struct io_kiocb *req); 83 84 struct io_wq_work *io_wq_free_work(struct io_wq_work *work); 85 void io_wq_submit_work(struct io_wq_work *work); 86 87 void io_free_req(struct io_kiocb *req); 88 void io_queue_next(struct io_kiocb *req); 89 void io_task_refs_refill(struct io_uring_task *tctx); 90 bool __io_alloc_req_refill(struct io_ring_ctx *ctx); 91 92 bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, 93 bool cancel_all); 94 95 void *io_mem_alloc(size_t size); 96 void io_mem_free(void *ptr); 97 98 #if defined(CONFIG_PROVE_LOCKING) 99 static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) 100 { 101 lockdep_assert(in_task()); 102 103 if (ctx->flags & IORING_SETUP_IOPOLL) { 104 lockdep_assert_held(&ctx->uring_lock); 105 } else if (!ctx->task_complete) { 106 lockdep_assert_held(&ctx->completion_lock); 107 } else if (ctx->submitter_task) { 108 /* 109 * ->submitter_task may be NULL and we can still post a CQE, 110 * if the ring has been setup with IORING_SETUP_R_DISABLED. 111 * Not from an SQE, as those cannot be submitted, but via 112 * updating tagged resources. 113 */ 114 if (ctx->submitter_task->flags & PF_EXITING) 115 lockdep_assert(current_work()); 116 else 117 lockdep_assert(current == ctx->submitter_task); 118 } 119 } 120 #else 121 static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) 122 { 123 } 124 #endif 125 126 static inline void io_req_task_work_add(struct io_kiocb *req) 127 { 128 __io_req_task_work_add(req, 0); 129 } 130 131 #define io_for_each_link(pos, head) \ 132 for (pos = (head); pos; pos = pos->link) 133 134 static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx, 135 struct io_uring_cqe **ret, 136 bool overflow) 137 { 138 io_lockdep_assert_cq_locked(ctx); 139 140 if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) { 141 if (unlikely(!io_cqe_cache_refill(ctx, overflow))) 142 return false; 143 } 144 *ret = ctx->cqe_cached; 145 ctx->cached_cq_tail++; 146 ctx->cqe_cached++; 147 if (ctx->flags & IORING_SETUP_CQE32) 148 ctx->cqe_cached++; 149 return true; 150 } 151 152 static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret) 153 { 154 return io_get_cqe_overflow(ctx, ret, false); 155 } 156 157 static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, 158 struct io_kiocb *req) 159 { 160 struct io_uring_cqe *cqe; 161 162 /* 163 * If we can't get a cq entry, userspace overflowed the 164 * submission (by quite a lot). Increment the overflow count in 165 * the ring. 166 */ 167 if (unlikely(!io_get_cqe(ctx, &cqe))) 168 return false; 169 170 if (trace_io_uring_complete_enabled()) 171 trace_io_uring_complete(req->ctx, req, req->cqe.user_data, 172 req->cqe.res, req->cqe.flags, 173 req->big_cqe.extra1, req->big_cqe.extra2); 174 175 memcpy(cqe, &req->cqe, sizeof(*cqe)); 176 if (ctx->flags & IORING_SETUP_CQE32) { 177 memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); 178 memset(&req->big_cqe, 0, sizeof(req->big_cqe)); 179 } 180 return true; 181 } 182 183 static inline void req_set_fail(struct io_kiocb *req) 184 { 185 req->flags |= REQ_F_FAIL; 186 if (req->flags & REQ_F_CQE_SKIP) { 187 req->flags &= ~REQ_F_CQE_SKIP; 188 req->flags |= REQ_F_SKIP_LINK_CQES; 189 } 190 } 191 192 static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) 193 { 194 req->cqe.res = res; 195 req->cqe.flags = cflags; 196 } 197 198 static inline bool req_has_async_data(struct io_kiocb *req) 199 { 200 return req->flags & REQ_F_ASYNC_DATA; 201 } 202 203 static inline void io_put_file(struct io_kiocb *req) 204 { 205 if (!(req->flags & REQ_F_FIXED_FILE) && req->file) 206 fput(req->file); 207 } 208 209 static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx, 210 unsigned issue_flags) 211 { 212 lockdep_assert_held(&ctx->uring_lock); 213 if (issue_flags & IO_URING_F_UNLOCKED) 214 mutex_unlock(&ctx->uring_lock); 215 } 216 217 static inline void io_ring_submit_lock(struct io_ring_ctx *ctx, 218 unsigned issue_flags) 219 { 220 /* 221 * "Normal" inline submissions always hold the uring_lock, since we 222 * grab it from the system call. Same is true for the SQPOLL offload. 223 * The only exception is when we've detached the request and issue it 224 * from an async worker thread, grab the lock for that case. 225 */ 226 if (issue_flags & IO_URING_F_UNLOCKED) 227 mutex_lock(&ctx->uring_lock); 228 lockdep_assert_held(&ctx->uring_lock); 229 } 230 231 static inline void io_commit_cqring(struct io_ring_ctx *ctx) 232 { 233 /* order cqe stores with ring update */ 234 smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); 235 } 236 237 static inline void io_poll_wq_wake(struct io_ring_ctx *ctx) 238 { 239 if (wq_has_sleeper(&ctx->poll_wq)) 240 __wake_up(&ctx->poll_wq, TASK_NORMAL, 0, 241 poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); 242 } 243 244 static inline void io_cqring_wake(struct io_ring_ctx *ctx) 245 { 246 /* 247 * Trigger waitqueue handler on all waiters on our waitqueue. This 248 * won't necessarily wake up all the tasks, io_should_wake() will make 249 * that decision. 250 * 251 * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. The latter 252 * set in the mask so that if we recurse back into our own poll 253 * waitqueue handlers, we know we have a dependency between eventfd or 254 * epoll and should terminate multishot poll at that point. 255 */ 256 if (wq_has_sleeper(&ctx->cq_wait)) 257 __wake_up(&ctx->cq_wait, TASK_NORMAL, 0, 258 poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); 259 } 260 261 static inline bool io_sqring_full(struct io_ring_ctx *ctx) 262 { 263 struct io_rings *r = ctx->rings; 264 265 /* 266 * SQPOLL must use the actual sqring head, as using the cached_sq_head 267 * is race prone if the SQPOLL thread has grabbed entries but not yet 268 * committed them to the ring. For !SQPOLL, this doesn't matter, but 269 * since this helper is just used for SQPOLL sqring waits (or POLLOUT), 270 * just read the actual sqring head unconditionally. 271 */ 272 return READ_ONCE(r->sq.tail) - READ_ONCE(r->sq.head) == ctx->sq_entries; 273 } 274 275 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) 276 { 277 struct io_rings *rings = ctx->rings; 278 unsigned int entries; 279 280 /* make sure SQ entry isn't read before tail */ 281 entries = smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; 282 return min(entries, ctx->sq_entries); 283 } 284 285 static inline int io_run_task_work(void) 286 { 287 /* 288 * Always check-and-clear the task_work notification signal. With how 289 * signaling works for task_work, we can find it set with nothing to 290 * run. We need to clear it for that case, like get_signal() does. 291 */ 292 if (test_thread_flag(TIF_NOTIFY_SIGNAL)) 293 clear_notify_signal(); 294 /* 295 * PF_IO_WORKER never returns to userspace, so check here if we have 296 * notify work that needs processing. 297 */ 298 if (current->flags & PF_IO_WORKER && 299 test_thread_flag(TIF_NOTIFY_RESUME)) { 300 __set_current_state(TASK_RUNNING); 301 resume_user_mode_work(NULL); 302 } 303 if (task_work_pending(current)) { 304 __set_current_state(TASK_RUNNING); 305 task_work_run(); 306 return 1; 307 } 308 309 return 0; 310 } 311 312 static inline bool io_task_work_pending(struct io_ring_ctx *ctx) 313 { 314 return task_work_pending(current) || !llist_empty(&ctx->work_llist); 315 } 316 317 static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) 318 { 319 if (!ts->locked) { 320 mutex_lock(&ctx->uring_lock); 321 ts->locked = true; 322 } 323 } 324 325 /* 326 * Don't complete immediately but use deferred completion infrastructure. 327 * Protected by ->uring_lock and can only be used either with 328 * IO_URING_F_COMPLETE_DEFER or inside a tw handler holding the mutex. 329 */ 330 static inline void io_req_complete_defer(struct io_kiocb *req) 331 __must_hold(&req->ctx->uring_lock) 332 { 333 struct io_submit_state *state = &req->ctx->submit_state; 334 335 lockdep_assert_held(&req->ctx->uring_lock); 336 337 wq_list_add_tail(&req->comp_list, &state->compl_reqs); 338 } 339 340 static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) 341 { 342 if (unlikely(ctx->off_timeout_used || ctx->drain_active || 343 ctx->has_evfd || ctx->poll_activated)) 344 __io_commit_cqring_flush(ctx); 345 } 346 347 static inline void io_get_task_refs(int nr) 348 { 349 struct io_uring_task *tctx = current->io_uring; 350 351 tctx->cached_refs -= nr; 352 if (unlikely(tctx->cached_refs < 0)) 353 io_task_refs_refill(tctx); 354 } 355 356 static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) 357 { 358 return !ctx->submit_state.free_list.next; 359 } 360 361 extern struct kmem_cache *req_cachep; 362 363 static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx) 364 { 365 struct io_kiocb *req; 366 367 req = container_of(ctx->submit_state.free_list.next, struct io_kiocb, comp_list); 368 wq_stack_extract(&ctx->submit_state.free_list); 369 return req; 370 } 371 372 static inline bool io_alloc_req(struct io_ring_ctx *ctx, struct io_kiocb **req) 373 { 374 if (unlikely(io_req_cache_empty(ctx))) { 375 if (!__io_alloc_req_refill(ctx)) 376 return false; 377 } 378 *req = io_extract_req(ctx); 379 return true; 380 } 381 382 static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx) 383 { 384 return likely(ctx->submitter_task == current); 385 } 386 387 static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) 388 { 389 return likely(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN) || 390 ctx->submitter_task == current); 391 } 392 393 static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) 394 { 395 io_req_set_res(req, res, 0); 396 req->io_task_work.func = io_req_task_complete; 397 io_req_task_work_add(req); 398 } 399 400 /* 401 * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each 402 * slot. 403 */ 404 static inline size_t uring_sqe_size(struct io_ring_ctx *ctx) 405 { 406 if (ctx->flags & IORING_SETUP_SQE128) 407 return 2 * sizeof(struct io_uring_sqe); 408 return sizeof(struct io_uring_sqe); 409 } 410 #endif 411