#ifndef IO_URING_TYPES_H
#define IO_URING_TYPES_H

#include <linux/blkdev.h>
#include <linux/task_work.h>
#include <linux/bitmap.h>
#include <linux/llist.h>
#include <uapi/linux/io_uring.h>

struct io_wq_work_node {
	struct io_wq_work_node *next;
};

struct io_wq_work_list {
	struct io_wq_work_node *first;
	struct io_wq_work_node *last;
};

struct io_wq_work {
	struct io_wq_work_node list;
	unsigned flags;
	/* place it here instead of io_kiocb as it fills padding and saves 4B */
	int cancel_seq;
};

struct io_fixed_file {
	/* file * with additional FFS_* flags */
	unsigned long file_ptr;
};

struct io_file_table {
	struct io_fixed_file *files;
	unsigned long *bitmap;
	unsigned int alloc_hint;
};

struct io_hash_bucket {
	spinlock_t lock;
	struct hlist_head list;
} ____cacheline_aligned_in_smp;

struct io_hash_table {
	struct io_hash_bucket *hbs;
	unsigned hash_bits;
};

/*
 * Arbitrary limit, can be raised if need be
 */
#define IO_RINGFD_REG_MAX 16

struct io_uring_task {
	/* submission side */
	int cached_refs;
	const struct io_ring_ctx *last;
	struct io_wq *io_wq;
	struct file *registered_rings[IO_RINGFD_REG_MAX];

	struct xarray xa;
	struct wait_queue_head wait;
	atomic_t in_cancel;
	atomic_t inflight_tracked;
	struct percpu_counter inflight;

	struct { /* task_work */
		struct llist_head task_list;
		struct callback_head task_work;
	} ____cacheline_aligned_in_smp;
};

struct io_uring {
	u32 head;
	u32 tail;
};

/*
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 */
struct io_rings {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The kernel controls head of the sq ring and the tail of the cq ring,
	 * and the application controls tail of the sq ring and the head of the
	 * cq ring.
	 */
	struct io_uring sq, cq;
	/*
	 * Bitmasks to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32 sq_ring_mask, cq_ring_mask;
	/* Ring sizes (constant, power of 2) */
	u32 sq_ring_entries, cq_ring_entries;
	/*
	 * Number of invalid entries dropped by the kernel due to
	 * invalid index stored in array
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * After a new SQ head value was read by the application this
	 * counter includes all submissions that were dropped reaching
	 * the new SQ head (and possibly more).
	 */
	u32 sq_dropped;
	/*
	 * Runtime SQ flags
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application.
	 *
	 * The application needs a full memory barrier before checking
	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
	 */
	atomic_t sq_flags;
	/*
	 * Runtime CQ flags
	 *
	 * Written by the application, shouldn't be modified by the
	 * kernel.
	 */
	u32 cq_flags;
	/*
	 * Number of completion events lost because the queue was full;
	 * this should be avoided by the application by making sure
	 * there are not more requests pending than there is space in
	 * the completion queue.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * As completion events come in out of order this counter is not
	 * ordered with any other data.
	 */
	u32 cq_overflow;
	/*
	 * Ring buffer of completion events.
	 *
	 * The kernel writes completion events fresh every time they are
	 * produced, so the application is allowed to modify pending
	 * entries.
	 */
	struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
};
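
/*
 * Illustrative sketch (userspace side, not part of this header or the ABI):
 * the application consumes CQEs by walking cq.head towards cq.tail and
 * masking the index with cq_ring_mask. Barriers and the helper names
 * (load_acquire, store_release, handle_completion) are schematic
 * placeholders; the real field locations come from the offsets published at
 * io_uring_setup() time.
 *
 *	unsigned head = *cq_head;
 *	while (head != load_acquire(cq_tail)) {
 *		struct io_uring_cqe *cqe = &cqes[head & *cq_ring_mask];
 *		handle_completion(cqe);
 *		head++;
 *	}
 *	store_release(cq_head, head);
 */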

struct io_restriction {
	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
	u8 sqe_flags_allowed;
	u8 sqe_flags_required;
	bool registered;
};
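
/*
 * Illustrative sketch of how a registered restriction is consulted on the
 * submission path (roughly what the in-kernel check does; not a definition
 * from this header): an SQE is rejected if its opcode bit is not set in
 * ->sqe_op, if it misses ->sqe_flags_required, or if it sets anything
 * outside the allowed/required flag masks.
 *
 *	if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
 *		return false;
 *	if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
 *	    ctx->restrictions.sqe_flags_required)
 *		return false;
 *	if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
 *			  ctx->restrictions.sqe_flags_required))
 *		return false;
 */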

struct io_submit_link {
	struct io_kiocb *head;
	struct io_kiocb *last;
};

struct io_submit_state {
	/* inline/task_work completion list, under ->uring_lock */
	struct io_wq_work_node free_list;
	/* batch completion logic */
	struct io_wq_work_list compl_reqs;
	struct io_submit_link link;

	bool plug_started;
	bool need_plug;
	unsigned short submit_nr;
	unsigned int cqes_count;
	struct blk_plug plug;
};

struct io_ev_fd {
	struct eventfd_ctx *cq_ev_fd;
	unsigned int eventfd_async: 1;
	struct rcu_head rcu;
	atomic_t refs;
	atomic_t ops;
};

struct io_alloc_cache {
	struct io_wq_work_node list;
	unsigned int nr_cached;
	unsigned int max_cached;
	size_t elem_size;
};

struct io_ring_ctx {
	/* const or read-mostly hot data */
	struct {
		unsigned int flags;
		unsigned int drain_next: 1;
		unsigned int restricted: 1;
		unsigned int off_timeout_used: 1;
		unsigned int drain_active: 1;
		unsigned int has_evfd: 1;
		/* all CQEs should be posted only by the submitter task */
		unsigned int task_complete: 1;
		unsigned int lockless_cq: 1;
		unsigned int syscall_iopoll: 1;
		unsigned int poll_activated: 1;
		unsigned int drain_disabled: 1;
		unsigned int compat: 1;

		struct task_struct *submitter_task;
		struct io_rings *rings;
		struct percpu_ref refs;

		enum task_work_notify_mode notify_method;
	} ____cacheline_aligned_in_smp;

	/* submission data */
	struct {
		struct mutex uring_lock;

		/*
		 * Ring buffer of indices into array of io_uring_sqe, which is
		 * mmapped by the application using the IORING_OFF_SQES offset.
		 *
		 * This indirection could e.g. be used to assign fixed
		 * io_uring_sqe entries to operations and only submit them to
		 * the queue when needed.
		 *
		 * The kernel modifies neither the indices array nor the entries
		 * array.
		 */
		u32 *sq_array;
		struct io_uring_sqe *sq_sqes;
		unsigned cached_sq_head;
		unsigned sq_entries;

		/*
		 * Fixed resources fast path, should be accessed only under
		 * uring_lock, and updated through io_uring_register(2)
		 */
		struct io_rsrc_node *rsrc_node;
		atomic_t cancel_seq;
		struct io_file_table file_table;
		unsigned nr_user_files;
		unsigned nr_user_bufs;
		struct io_mapped_ubuf **user_bufs;

		struct io_submit_state submit_state;

		struct xarray io_bl_xa;

		struct io_hash_table cancel_table_locked;
		struct io_alloc_cache apoll_cache;
		struct io_alloc_cache netmsg_cache;

		/*
		 * ->iopoll_list is protected by the ctx->uring_lock for
		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
		 * For SQPOLL, only the single threaded io_sq_thread() will
		 * manipulate the list, hence no extra locking is needed there.
		 */
		struct io_wq_work_list iopoll_list;
		bool poll_multi_queue;
	} ____cacheline_aligned_in_smp;

	struct {
		/*
		 * We cache a range of free CQEs we can use, once exhausted it
		 * should go through a slower range setup, see __io_get_cqe()
		 */
		struct io_uring_cqe *cqe_cached;
		struct io_uring_cqe *cqe_sentinel;

		unsigned cached_cq_tail;
		unsigned cq_entries;
		struct io_ev_fd __rcu *io_ev_fd;
		unsigned cq_extra;
	} ____cacheline_aligned_in_smp;

	/*
	 * task_work and async notification delivery cacheline. Expected to
	 * regularly bounce b/w CPUs.
	 */
	struct {
		struct llist_head work_llist;
		unsigned long check_cq;
		atomic_t cq_wait_nr;
		atomic_t cq_timeouts;
		struct wait_queue_head cq_wait;
	} ____cacheline_aligned_in_smp;

	/* timeouts */
	struct {
		spinlock_t timeout_lock;
		struct list_head timeout_list;
		struct list_head ltimeout_list;
		unsigned cq_last_tm_flush;
	} ____cacheline_aligned_in_smp;

	struct io_uring_cqe completion_cqes[16];

	spinlock_t completion_lock;

	/* IRQ completion list, under ->completion_lock */
	struct io_wq_work_list locked_free_list;
	unsigned int locked_free_nr;

	struct list_head io_buffers_comp;
	struct list_head cq_overflow_list;
	struct io_hash_table cancel_table;

	const struct cred *sq_creds;	/* cred used for __io_sq_thread() */
	struct io_sq_data *sq_data;	/* if using sq thread polling */

	struct wait_queue_head sqo_sq_wait;
	struct list_head sqd_list;

	unsigned int file_alloc_start;
	unsigned int file_alloc_end;

	struct xarray personalities;
	u32 pers_next;

	struct list_head io_buffers_cache;

	/* deferred free list, protected by ->uring_lock */
	struct hlist_head io_buf_list;

	/* Keep this last, we don't need it for the fast path */
	struct wait_queue_head poll_wq;
	struct io_restriction restrictions;

	/* slow path rsrc auxiliary data, used by update/register */
	struct io_mapped_ubuf *dummy_ubuf;
	struct io_rsrc_data *file_data;
	struct io_rsrc_data *buf_data;

	/* protected by ->uring_lock */
	struct list_head rsrc_ref_list;
	struct io_alloc_cache rsrc_node_cache;
	struct wait_queue_head rsrc_quiesce_wq;
	unsigned rsrc_quiesce;

	struct list_head io_buffers_pages;

	/* hashed buffered write serialization */
	struct io_wq_hash *hash_map;

	/* Only used for accounting purposes */
	struct user_struct *user;
	struct mm_struct *mm_account;

	/* ctx exit and cancelation */
	struct llist_head fallback_llist;
	struct delayed_work fallback_work;
	struct work_struct exit_work;
	struct list_head tctx_list;
	struct completion ref_comp;

	/* io-wq management, e.g. thread count */
	u32 iowq_limits[2];
	bool iowq_limits_set;

	struct callback_head poll_wq_task_work;
	struct list_head defer_list;
	unsigned sq_thread_idle;
	/* protected by ->completion_lock */
	unsigned evfd_last_cq_tail;

	/*
	 * If IORING_SETUP_NO_MMAP is used, then the below holds
	 * the gup'ed pages for the two rings, and the sqes.
	 */
	unsigned short n_ring_pages;
	unsigned short n_sqe_pages;
	struct page **ring_pages;
	struct page **sqe_pages;
};

struct io_tw_state {
	/* ->uring_lock is taken, callbacks can use io_tw_lock to lock it */
	bool locked;
};
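
/*
 * Illustrative sketch of a task_work callback (the function name is
 * hypothetical): callbacks have the io_req_tw_func_t signature declared
 * below and, when ts->locked is false, may take ->uring_lock via
 * io_tw_lock() as noted in the comment above.
 *
 *	static void example_tw_func(struct io_kiocb *req, struct io_tw_state *ts)
 *	{
 *		io_tw_lock(req->ctx, ts);
 *		... complete or retry the request under ->uring_lock ...
 *	}
 */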

enum {
	REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
	REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
	REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
	REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
	REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
	REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
	REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT,

	/* first byte is taken by user flags, shift it to not overlap */
	REQ_F_FAIL_BIT = 8,
	REQ_F_INFLIGHT_BIT,
	REQ_F_CUR_POS_BIT,
	REQ_F_NOWAIT_BIT,
	REQ_F_LINK_TIMEOUT_BIT,
	REQ_F_NEED_CLEANUP_BIT,
	REQ_F_POLLED_BIT,
	REQ_F_BUFFER_SELECTED_BIT,
	REQ_F_BUFFER_RING_BIT,
	REQ_F_REISSUE_BIT,
	REQ_F_CREDS_BIT,
	REQ_F_REFCOUNT_BIT,
	REQ_F_ARM_LTIMEOUT_BIT,
	REQ_F_ASYNC_DATA_BIT,
	REQ_F_SKIP_LINK_CQES_BIT,
	REQ_F_SINGLE_POLL_BIT,
	REQ_F_DOUBLE_POLL_BIT,
	REQ_F_PARTIAL_IO_BIT,
	REQ_F_APOLL_MULTISHOT_BIT,
	REQ_F_CLEAR_POLLIN_BIT,
	REQ_F_HASH_LOCKED_BIT,
	/* keep async read/write and isreg together and in order */
	REQ_F_SUPPORT_NOWAIT_BIT,
	REQ_F_ISREG_BIT,

	/* not a real bit, just to check we're not overflowing the space */
	__REQ_F_LAST_BIT,
};

enum {
	/* ctx owns file */
	REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
	/* drain existing IO first */
	REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
	/* linked sqes */
	REQ_F_LINK = BIT(REQ_F_LINK_BIT),
	/* doesn't sever on completion < 0 */
	REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
	/* IOSQE_ASYNC */
	REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
	/* IOSQE_BUFFER_SELECT */
	REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
	/* IOSQE_CQE_SKIP_SUCCESS */
	REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT),

	/* fail rest of links */
	REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),
	/* on inflight list, should be cancelled and waited on exit reliably */
	REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
	/* read/write uses file position */
	REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
	/* must not punt to workers */
	REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
	/* has or had linked timeout */
	REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
	/* needs cleanup */
	REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
	/* already went through poll handler */
	REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
	/* buffer already selected */
	REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
	/* buffer selected from ring, needs commit */
	REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT),
	/* caller should reissue async */
	REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
	/* supports async reads/writes */
	REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
	/* regular file */
	REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
	/* has creds assigned */
	REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
	/* skip refcounting if not set */
	REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
	/* there is a linked timeout that has to be armed */
	REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
	/* ->async_data allocated */
	REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
	/* don't post CQEs while failing linked requests */
	REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),
	/* single poll may be active */
	REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),
	/* double poll may be active */
	REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
	/* request has already done partial IO */
	REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
	/* fast poll multishot mode */
	REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),
	/* recvmsg special flag, clear EPOLLIN */
	REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT),
	/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
	REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT),
};
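
/*
 * The first enum above defines bit numbers, the second the corresponding
 * masks. The low byte mirrors the IOSQE_* flags set by the application in
 * the SQE, so e.g. a request submitted with IOSQE_IO_LINK can be tested as
 * (illustrative):
 *
 *	if (req->flags & REQ_F_LINK)
 *		...
 */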

typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);

struct io_task_work {
	struct llist_node node;
	io_req_tw_func_t func;
};

struct io_cqe {
	__u64 user_data;
	__s32 res;
	/* fd initially, then cflags for completion */
	union {
		__u32 flags;
		int fd;
	};
};

/*
 * Each request type overlays its private data structure on top of this one.
 * They must not exceed this one in size.
 */
struct io_cmd_data {
	struct file *file;
	/* each command gets 56 bytes of data */
	__u8 data[56];
};

static inline void io_kiocb_cmd_sz_check(size_t cmd_sz)
{
	BUILD_BUG_ON(cmd_sz > sizeof(struct io_cmd_data));
}
#define io_kiocb_to_cmd(req, cmd_type) ( \
	io_kiocb_cmd_sz_check(sizeof(cmd_type)) , \
	((cmd_type *)&(req)->cmd) \
)
#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr)
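
/*
 * Typical (illustrative) use in an opcode handler: the per-opcode struct is
 * overlaid on req->cmd and starts with 'struct file *file', so handlers
 * fetch it via io_kiocb_to_cmd() and can map back with cmd_to_io_kiocb().
 * 'struct io_foo' below is hypothetical:
 *
 *	struct io_foo *foo = io_kiocb_to_cmd(req, struct io_foo);
 *	struct io_kiocb *owner = cmd_to_io_kiocb(foo);
 */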

struct io_kiocb {
	union {
		/*
		 * NOTE! Each of the io_kiocb union members has the file pointer
		 * as the first entry in their struct definition. So you can
		 * access the file pointer through any of the sub-structs,
		 * or directly as just 'file' in this struct.
		 */
		struct file *file;
		struct io_cmd_data cmd;
	};

	u8 opcode;
	/* polled IO has completed */
	u8 iopoll_completed;
	/*
	 * Can be either a fixed buffer index, or used with provided buffers.
	 * For the latter, before issue it points to the buffer group ID,
	 * and after selection it points to the buffer ID itself.
	 */
	u16 buf_index;
	unsigned int flags;

	struct io_cqe cqe;

	struct io_ring_ctx *ctx;
	struct task_struct *task;

	struct io_rsrc_node *rsrc_node;

	union {
		/* store used ubuf, so we can prevent reloading */
		struct io_mapped_ubuf *imu;

		/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
		struct io_buffer *kbuf;

		/*
		 * stores buffer ID for ring provided buffers, valid IFF
		 * REQ_F_BUFFER_RING is set.
		 */
		struct io_buffer_list *buf_list;
	};

	union {
		/* used by request caches, completion batching and iopoll */
		struct io_wq_work_node comp_list;
		/* cache ->apoll->events */
		__poll_t apoll_events;
	};
	atomic_t refs;
	atomic_t poll_refs;
	struct io_task_work io_task_work;
	unsigned nr_tw;
	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
	struct hlist_node hash_node;
	/* internal polling, see IORING_FEAT_FAST_POLL */
	struct async_poll *apoll;
	/* opcode allocated if it needs to store data for async defer */
	void *async_data;
	/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
	struct io_kiocb *link;
	/* custom credentials, valid IFF REQ_F_CREDS is set */
	const struct cred *creds;
	struct io_wq_work work;

	struct {
		u64 extra1;
		u64 extra2;
	} big_cqe;
};

struct io_overflow_cqe {
	struct list_head list;
	struct io_uring_cqe cqe;
};

#endif