/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
/*
 * Header file for the io_uring interface.
 *
 * Copyright (C) 2019 Jens Axboe
 * Copyright (C) 2019 Christoph Hellwig
 */
#ifndef LINUX_IO_URING_H
#define LINUX_IO_URING_H

#include <linux/fs.h>
#include <linux/types.h>

/*
 * IO submission data structure (Submission Queue Entry)
 *
 * Several fields are anonymous unions: the members of each union share
 * the same storage, so only one of them is meaningful for a given sqe.
 */
struct io_uring_sqe {
	__u8	opcode;		/* type of operation for this sqe */
	__u8	flags;		/* IOSQE_ flags */
	__u16	ioprio;		/* ioprio for the request */
	__s32	fd;		/* file descriptor to do IO on */
	union {
		__u64	off;	/* offset into file */
		__u64	addr2;
		__u32	cmd_op;
	};
	union {
		__u64	addr;	/* pointer to buffer or iovecs */
		__u64	splice_off_in;
	};
	__u32	len;		/* buffer size or number of iovecs */
	union {
		__kernel_rwf_t	rw_flags;
		__u32		fsync_flags;
		__u16		poll_events;	/* compatibility */
		__u32		poll32_events;	/* word-reversed for BE */
		__u32		sync_range_flags;
		__u32		msg_flags;
		__u32		timeout_flags;
		__u32		accept_flags;
		__u32		cancel_flags;
		__u32		open_flags;
		__u32		statx_flags;
		__u32		fadvise_advice;
		__u32		splice_flags;
		__u32		rename_flags;
		__u32		unlink_flags;
		__u32		hardlink_flags;
		__u32		xattr_flags;
	};
	__u64	user_data;	/* data to be passed back at completion time */
	/* pack this to avoid bogus arm OABI complaints */
	union {
		/* index into fixed buffers, if used */
		__u16	buf_index;
		/* for grouped buffer selection */
		__u16	buf_group;
	} __attribute__((packed));
	/* personality to use, if used */
	__u16	personality;
	union {
		__s32	splice_fd_in;
		__u32	file_index;
	};
	union {
		struct {
			__u64	addr3;
			__u64	__pad2[1];
		};
		/*
		 * If the ring is initialized with IORING_SETUP_SQE128, then
		 * this field is used for 80 bytes of arbitrary command data
		 */
		__u8	cmd[0];
	};
};

/*
 * If sqe->file_index is set to this for opcodes that instantiate a new
 * direct descriptor (like openat/openat2/accept), then io_uring will allocate
 * an available direct descriptor instead of having the application pass one
 * in. The picked direct descriptor will be returned in cqe->res, or -ENFILE
 * if the space is full.
 */
#define IORING_FILE_INDEX_ALLOC		(~0U)

/*
 * Bit positions within sqe->flags. The IOSQE_* masks below are derived
 * from these by shifting 1U left by the bit number.
 */
enum {
	IOSQE_FIXED_FILE_BIT,
	IOSQE_IO_DRAIN_BIT,
	IOSQE_IO_LINK_BIT,
	IOSQE_IO_HARDLINK_BIT,
	IOSQE_ASYNC_BIT,
	IOSQE_BUFFER_SELECT_BIT,
	IOSQE_CQE_SKIP_SUCCESS_BIT,
};

/*
 * sqe->flags
 */
/* use fixed fileset */
#define IOSQE_FIXED_FILE	(1U << IOSQE_FIXED_FILE_BIT)
/* issue after inflight IO */
#define IOSQE_IO_DRAIN		(1U << IOSQE_IO_DRAIN_BIT)
/* links next sqe */
#define IOSQE_IO_LINK		(1U << IOSQE_IO_LINK_BIT)
/* like LINK, but stronger */
#define IOSQE_IO_HARDLINK	(1U << IOSQE_IO_HARDLINK_BIT)
/* always go async */
#define IOSQE_ASYNC		(1U << IOSQE_ASYNC_BIT)
/* select buffer from sqe->buf_group */
#define IOSQE_BUFFER_SELECT	(1U << IOSQE_BUFFER_SELECT_BIT)
/* don't post CQE if request succeeded */
#define IOSQE_CQE_SKIP_SUCCESS	(1U << IOSQE_CQE_SKIP_SUCCESS_BIT)

/*
 * io_uring_setup() flags
 */
#define IORING_SETUP_IOPOLL	(1U << 0)	/* io_context is polled */
#define IORING_SETUP_SQPOLL	(1U << 1)	/* SQ poll thread */
#define IORING_SETUP_SQ_AFF	(1U << 2)	/* sq_thread_cpu is valid */
#define IORING_SETUP_CQSIZE	(1U << 3)	/* app defines CQ size */
#define IORING_SETUP_CLAMP	(1U << 4)	/* clamp SQ/CQ ring sizes */
#define IORING_SETUP_ATTACH_WQ	(1U << 5)	/* attach to existing wq */
#define IORING_SETUP_R_DISABLED	(1U << 6)	/* start with ring disabled */
#define IORING_SETUP_SUBMIT_ALL	(1U << 7)	/* continue submit on error */
/*
 * Cooperative task running. When requests complete, they often require
 * forcing the submitter to transition to the kernel to complete. If this
 * flag is set, work will be done when the task transitions anyway, rather
 * than force an inter-processor interrupt reschedule. This avoids interrupting
 * a task running in userspace, and saves an IPI.
 */
#define IORING_SETUP_COOP_TASKRUN	(1U << 8)
/*
 * If COOP_TASKRUN is set, get notified if task work is available for
 * running and a kernel transition would be needed to run it. This sets
 * IORING_SQ_TASKRUN in the sq ring flags. Not valid without COOP_TASKRUN.
 */
#define IORING_SETUP_TASKRUN_FLAG	(1U << 9)

#define IORING_SETUP_SQE128		(1U << 10) /* SQEs are 128 byte */
#define IORING_SETUP_CQE32		(1U << 11) /* CQEs are 32 byte */

/*
 * sqe->opcode values. The enumerators are implicitly numbered from 0, so
 * their order is part of the ABI: new opcodes go just before
 * IORING_OP_LAST.
 */
enum io_uring_op {
	IORING_OP_NOP,
	IORING_OP_READV,
	IORING_OP_WRITEV,
	IORING_OP_FSYNC,
	IORING_OP_READ_FIXED,
	IORING_OP_WRITE_FIXED,
	IORING_OP_POLL_ADD,
	IORING_OP_POLL_REMOVE,
	IORING_OP_SYNC_FILE_RANGE,
	IORING_OP_SENDMSG,
	IORING_OP_RECVMSG,
	IORING_OP_TIMEOUT,
	IORING_OP_TIMEOUT_REMOVE,
	IORING_OP_ACCEPT,
	IORING_OP_ASYNC_CANCEL,
	IORING_OP_LINK_TIMEOUT,
	IORING_OP_CONNECT,
	IORING_OP_FALLOCATE,
	IORING_OP_OPENAT,
	IORING_OP_CLOSE,
	IORING_OP_FILES_UPDATE,
	IORING_OP_STATX,
	IORING_OP_READ,
	IORING_OP_WRITE,
	IORING_OP_FADVISE,
	IORING_OP_MADVISE,
	IORING_OP_SEND,
	IORING_OP_RECV,
	IORING_OP_OPENAT2,
	IORING_OP_EPOLL_CTL,
	IORING_OP_SPLICE,
	IORING_OP_PROVIDE_BUFFERS,
	IORING_OP_REMOVE_BUFFERS,
	IORING_OP_TEE,
	IORING_OP_SHUTDOWN,
	IORING_OP_RENAMEAT,
	IORING_OP_UNLINKAT,
	IORING_OP_MKDIRAT,
	IORING_OP_SYMLINKAT,
	IORING_OP_LINKAT,
	IORING_OP_MSG_RING,
	IORING_OP_FSETXATTR,
	IORING_OP_SETXATTR,
	IORING_OP_FGETXATTR,
	IORING_OP_GETXATTR,
	IORING_OP_SOCKET,
	IORING_OP_URING_CMD,

	/* this goes last, obviously */
	IORING_OP_LAST,
};

/*
 * sqe->fsync_flags
 */
#define IORING_FSYNC_DATASYNC	(1U << 0)

/*
 * sqe->timeout_flags
 */
#define IORING_TIMEOUT_ABS		(1U << 0)
#define IORING_TIMEOUT_UPDATE		(1U << 1)
#define IORING_TIMEOUT_BOOTTIME		(1U << 2)
#define IORING_TIMEOUT_REALTIME		(1U << 3)
#define IORING_LINK_TIMEOUT_UPDATE	(1U << 4)
#define IORING_TIMEOUT_ETIME_SUCCESS	(1U << 5)
#define IORING_TIMEOUT_CLOCK_MASK	(IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
#define IORING_TIMEOUT_UPDATE_MASK	(IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
/*
 * sqe->splice_flags
 * extends splice(2) flags
 */
#define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */

/*
 * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
 * command flags for POLL_ADD are stored in sqe->len.
 *
 * IORING_POLL_ADD_MULTI	Multishot poll. Sets IORING_CQE_F_MORE if
 *				the poll handler will continue to report
 *				CQEs on behalf of the same SQE.
 *
 * IORING_POLL_UPDATE_*		Update existing poll request, matching
 *				sqe->addr as the old user_data field.
 */
#define IORING_POLL_ADD_MULTI		(1U << 0)
#define IORING_POLL_UPDATE_EVENTS	(1U << 1)
#define IORING_POLL_UPDATE_USER_DATA	(1U << 2)

/*
 * ASYNC_CANCEL flags.
 *
 * IORING_ASYNC_CANCEL_ALL	Cancel all requests that match the given key
 * IORING_ASYNC_CANCEL_FD	Key off 'fd' for cancelation rather than the
 *				request 'user_data'
 * IORING_ASYNC_CANCEL_ANY	Match any request
 */
#define IORING_ASYNC_CANCEL_ALL	(1U << 0)
#define IORING_ASYNC_CANCEL_FD	(1U << 1)
#define IORING_ASYNC_CANCEL_ANY	(1U << 2)

/*
 * send/sendmsg and recv/recvmsg flags (sqe->addr2)
 *
 * IORING_RECVSEND_POLL_FIRST	If set, instead of first attempting to send
 *				or receive and arm poll if that yields an
 *				-EAGAIN result, arm poll upfront and skip
 *				the initial transfer attempt.
 */
#define IORING_RECVSEND_POLL_FIRST	(1U << 0)

/*
 * accept flags stored in sqe->ioprio
 */
#define IORING_ACCEPT_MULTISHOT	(1U << 0)

/*
 * IO completion data structure (Completion Queue Entry)
 */
struct io_uring_cqe {
	__u64	user_data;	/* sqe->user_data value passed back */
	__s32	res;		/* result code for this event */
	__u32	flags;

	/*
	 * If the ring is initialized with IORING_SETUP_CQE32, then this field
	 * contains 16-bytes of padding, doubling the size of the CQE.
	 */
	__u64 big_cqe[];
};

/*
 * cqe->flags
 *
 * IORING_CQE_F_BUFFER	If set, the upper 16 bits are the buffer ID
 * IORING_CQE_F_MORE	If set, parent SQE will generate more CQE entries
 * IORING_CQE_F_SOCK_NONEMPTY	If set, more data to read after socket recv
 */
#define IORING_CQE_F_BUFFER		(1U << 0)
#define IORING_CQE_F_MORE		(1U << 1)
#define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)

/* Shift that moves the buffer ID out of the upper 16 bits of cqe->flags */
enum {
	IORING_CQE_BUFFER_SHIFT		= 16,
};

/*
 * Magic offsets for the application to mmap the data it needs
 */
#define IORING_OFF_SQ_RING		0ULL
#define IORING_OFF_CQ_RING		0x8000000ULL
#define IORING_OFF_SQES			0x10000000ULL

/*
 * Filled with the offset for mmap(2)
 */
struct io_sqring_offsets {
	__u32 head;
	__u32 tail;
	__u32 ring_mask;
	__u32 ring_entries;
	__u32 flags;
	__u32 dropped;
	__u32 array;
	__u32 resv1;
	__u64 resv2;
};

/*
 * sq_ring->flags
 */
#define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
#define IORING_SQ_CQ_OVERFLOW	(1U << 1) /* CQ ring is overflown */
#define IORING_SQ_TASKRUN	(1U << 2) /* task should enter the kernel */

/*
 * CQ ring counterpart of struct io_sqring_offsets; embedded in
 * struct io_uring_params as cq_off.
 */
struct io_cqring_offsets {
	__u32 head;
	__u32 tail;
	__u32 ring_mask;
	__u32 ring_entries;
	__u32 overflow;
	__u32 cqes;
	__u32 flags;
	__u32 resv1;
	__u64 resv2;
};

/*
 * cq_ring->flags
 */

/* disable eventfd notifications */
#define IORING_CQ_EVENTFD_DISABLED	(1U << 0)

/*
 * io_uring_enter(2) flags
 */
#define IORING_ENTER_GETEVENTS		(1U << 0)
#define IORING_ENTER_SQ_WAKEUP		(1U << 1)
#define IORING_ENTER_SQ_WAIT		(1U << 2)
#define IORING_ENTER_EXT_ARG		(1U << 3)
#define IORING_ENTER_REGISTERED_RING	(1U << 4)

/*
 * Passed in for io_uring_setup(2). Copied back with updated info on success
 */
struct io_uring_params {
	__u32 sq_entries;
	__u32 cq_entries;
	__u32 flags;
	__u32 sq_thread_cpu;
	__u32 sq_thread_idle;
	__u32 features;
	__u32 wq_fd;
	__u32 resv[3];
	struct io_sqring_offsets sq_off;
	struct io_cqring_offsets cq_off;
};

/*
 * io_uring_params->features flags
 */
#define IORING_FEAT_SINGLE_MMAP		(1U << 0)
#define IORING_FEAT_NODROP		(1U << 1)
#define IORING_FEAT_SUBMIT_STABLE	(1U << 2)
#define IORING_FEAT_RW_CUR_POS		(1U << 3)
#define IORING_FEAT_CUR_PERSONALITY	(1U << 4)
#define IORING_FEAT_FAST_POLL		(1U << 5)
#define IORING_FEAT_POLL_32BITS		(1U << 6)
#define IORING_FEAT_SQPOLL_NONFIXED	(1U << 7)
#define IORING_FEAT_EXT_ARG		(1U << 8)
#define IORING_FEAT_NATIVE_WORKERS	(1U << 9)
#define IORING_FEAT_RSRC_TAGS		(1U << 10)
#define IORING_FEAT_CQE_SKIP		(1U << 11)
#define IORING_FEAT_LINKED_FILE		(1U << 12)

/*
 * io_uring_register(2) opcodes and arguments
 */
enum {
	IORING_REGISTER_BUFFERS			= 0,
	IORING_UNREGISTER_BUFFERS		= 1,
	IORING_REGISTER_FILES			= 2,
	IORING_UNREGISTER_FILES			= 3,
	IORING_REGISTER_EVENTFD			= 4,
	IORING_UNREGISTER_EVENTFD		= 5,
	IORING_REGISTER_FILES_UPDATE		= 6,
	IORING_REGISTER_EVENTFD_ASYNC		= 7,
	IORING_REGISTER_PROBE			= 8,
	IORING_REGISTER_PERSONALITY		= 9,
	IORING_UNREGISTER_PERSONALITY		= 10,
	IORING_REGISTER_RESTRICTIONS		= 11,
	IORING_REGISTER_ENABLE_RINGS		= 12,

	/* extended with tagging */
	IORING_REGISTER_FILES2			= 13,
	IORING_REGISTER_FILES_UPDATE2		= 14,
	IORING_REGISTER_BUFFERS2		= 15,
	IORING_REGISTER_BUFFERS_UPDATE		= 16,

	/* set/clear io-wq thread affinities */
	IORING_REGISTER_IOWQ_AFF		= 17,
	IORING_UNREGISTER_IOWQ_AFF		= 18,

	/* set/get max number of io-wq workers */
	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,

	/* register/unregister io_uring fd with the ring */
	IORING_REGISTER_RING_FDS		= 20,
	IORING_UNREGISTER_RING_FDS		= 21,

	/* register ring based provide buffer group */
	IORING_REGISTER_PBUF_RING		= 22,
	IORING_UNREGISTER_PBUF_RING		= 23,

	/* this goes last */
	IORING_REGISTER_LAST
};

/* io-wq worker categories */
enum {
	IO_WQ_BOUND,
	IO_WQ_UNBOUND,
};

/* deprecated, see struct io_uring_rsrc_update */
struct io_uring_files_update {
	__u32 offset;
	__u32 resv;
	__aligned_u64 /* __s32 * */ fds;
};

/*
 * Register a fully sparse file space, rather than pass in an array of all
 * -1 file descriptors.
 */
#define IORING_RSRC_REGISTER_SPARSE	(1U << 0)

struct io_uring_rsrc_register {
	__u32 nr;
	__u32 flags;
	__u64 resv2;
	__aligned_u64 data;
	__aligned_u64 tags;
};

struct io_uring_rsrc_update {
	__u32 offset;
	__u32 resv;
	__aligned_u64 data;
};

struct io_uring_rsrc_update2 {
	__u32 offset;
	__u32 resv;
	__aligned_u64 data;
	__aligned_u64 tags;
	__u32 nr;
	__u32 resv2;
};

/* Skip updating fd indexes set to this value in the fd table */
#define IORING_REGISTER_FILES_SKIP	(-2)

#define IO_URING_OP_SUPPORTED	(1U << 0)

struct io_uring_probe_op {
	__u8 op;
	__u8 resv;
	__u16 flags;	/* IO_URING_OP_* flags */
	__u32 resv2;
};

struct io_uring_probe {
	__u8 last_op;	/* last opcode supported */
	__u8 ops_len;	/* length of ops[] array below */
	__u16 resv;
	__u32 resv2[3];
	struct io_uring_probe_op ops[0];
};

struct io_uring_restriction {
	__u16 opcode;
	union {
		__u8 register_op; /* IORING_RESTRICTION_REGISTER_OP */
		__u8 sqe_op;      /* IORING_RESTRICTION_SQE_OP */
		__u8 sqe_flags;   /* IORING_RESTRICTION_SQE_FLAGS_* */
	};
	__u8 resv;
	__u32 resv2[3];
};

struct io_uring_buf {
	__u64	addr;
	__u32	len;
	__u16	bid;
	__u16	resv;
};

struct io_uring_buf_ring {
	union {
		/*
		 * To avoid spilling into more pages than we need to, the
		 * ring tail is overlaid with the io_uring_buf->resv field.
		 */
		struct {
			__u64	resv1;
			__u32	resv2;
			__u16	resv3;
			__u16	tail;
		};
		struct io_uring_buf	bufs[0];
	};
};

/* argument for IORING_(UN)REGISTER_PBUF_RING */
struct io_uring_buf_reg {
	__u64	ring_addr;
	__u32	ring_entries;
	__u16	bgid;
	__u16	pad;
	__u64	resv[3];
};

/*
 * io_uring_restriction->opcode values
 */
enum {
	/* Allow an io_uring_register(2) opcode */
	IORING_RESTRICTION_REGISTER_OP		= 0,

	/* Allow an sqe opcode */
	IORING_RESTRICTION_SQE_OP		= 1,

	/* Allow sqe flags */
	IORING_RESTRICTION_SQE_FLAGS_ALLOWED	= 2,

	/* Require sqe flags (these flags must be set on each submission) */
	IORING_RESTRICTION_SQE_FLAGS_REQUIRED	= 3,

	IORING_RESTRICTION_LAST
};

struct io_uring_getevents_arg {
	__u64	sigmask;
	__u32	sigmask_sz;
	__u32	pad;
	__u64	ts;
};

#endif /* LINUX_IO_URING_H */