// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"

#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))

#define BGID_ARRAY	64

struct io_provide_buf {
        struct file                     *file;
        __u64                           addr;
        __u32                           len;
        __u32                           bgid;
        __u16                           nbufs;
        __u16                           bid;
};

static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
                                                        unsigned int bgid)
{
        if (ctx->io_bl && bgid < BGID_ARRAY)
                return &ctx->io_bl[bgid];

        return xa_load(&ctx->io_bl_xa, bgid);
}

static int io_buffer_add_list(struct io_ring_ctx *ctx,
                              struct io_buffer_list *bl, unsigned int bgid)
{
        bl->bgid = bgid;
        if (bgid < BGID_ARRAY)
                return 0;

        return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}

void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer_list *bl;
        struct io_buffer *buf;

        /*
         * For legacy provided buffer mode, don't recycle if we already did
         * IO to this buffer. For ring-mapped provided buffer mode, we should
         * increment ring->head to explicitly monopolize the buffer to avoid
         * multiple use.
         */
        if (req->flags & REQ_F_PARTIAL_IO)
                return;

        io_ring_submit_lock(ctx, issue_flags);

        buf = req->kbuf;
        bl = io_buffer_get_list(ctx, buf->bgid);
        list_add(&buf->list, &bl->buf_list);
        req->flags &= ~REQ_F_BUFFER_SELECTED;
        req->buf_index = buf->bgid;

        io_ring_submit_unlock(ctx, issue_flags);
        return;
}

unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
{
        unsigned int cflags;

        /*
         * We can add this buffer back to two lists:
         *
         * 1) The io_buffers_cache list. This one is protected by the
         *    ctx->uring_lock. If we already hold this lock, add back to this
         *    list as we can grab it from issue as well.
         * 2) The io_buffers_comp list. This one is protected by the
         *    ctx->completion_lock.
         *
         * We migrate buffers from the comp_list to the issue cache list
         * when we need one.
         */
        if (req->flags & REQ_F_BUFFER_RING) {
                /* no buffers to recycle for this case */
                cflags = __io_put_kbuf_list(req, NULL);
        } else if (issue_flags & IO_URING_F_UNLOCKED) {
                struct io_ring_ctx *ctx = req->ctx;

                spin_lock(&ctx->completion_lock);
                cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp);
                spin_unlock(&ctx->completion_lock);
        } else {
                lockdep_assert_held(&req->ctx->uring_lock);

                cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
        }
        return cflags;
}
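
/*
 * Usage sketch (illustrative only, not part of this file's logic): the cflags
 * value built by __io_put_kbuf() is posted in the CQE, which is how userspace
 * learns which provided buffer a request consumed. Assuming the uapi
 * IORING_CQE_F_BUFFER flag and IORING_CQE_BUFFER_SHIFT constant, a consumer
 * would typically do something like:
 *
 *      if (cqe->flags & IORING_CQE_F_BUFFER) {
 *              unsigned int bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
 *
 *              // 'bid' names the buffer the kernel picked; return the
 *              // matching buffer to the application's pool (or republish
 *              // it on the buffer ring) once the data has been consumed.
 *      }
 */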

static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
                                              struct io_buffer_list *bl)
{
        if (!list_empty(&bl->buf_list)) {
                struct io_buffer *kbuf;

                kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
                list_del(&kbuf->list);
                if (*len == 0 || *len > kbuf->len)
                        *len = kbuf->len;
                req->flags |= REQ_F_BUFFER_SELECTED;
                req->kbuf = kbuf;
                req->buf_index = kbuf->bid;
                return u64_to_user_ptr(kbuf->addr);
        }
        return NULL;
}

static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
                                          struct io_buffer_list *bl,
                                          unsigned int issue_flags)
{
        struct io_uring_buf_ring *br = bl->buf_ring;
        struct io_uring_buf *buf;
        __u16 head = bl->head;

        if (unlikely(smp_load_acquire(&br->tail) == head))
                return NULL;

        head &= bl->mask;
        /* mmaped buffers are always contig */
        if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
                buf = &br->bufs[head];
        } else {
                int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
                int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
                buf = page_address(bl->buf_pages[index]);
                buf += off;
        }
        if (*len == 0 || *len > buf->len)
                *len = buf->len;
        req->flags |= REQ_F_BUFFER_RING;
        req->buf_list = bl;
        req->buf_index = buf->bid;

        if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
                /*
                 * If we came in unlocked, we have no choice but to consume the
                 * buffer here, otherwise nothing ensures that the buffer won't
                 * get used by others. This does mean it'll be pinned until the
                 * IO completes, coming in unlocked means we're being called from
                 * io-wq context and there may be further retries in async hybrid
                 * mode. For the locked case, the caller must call commit when
                 * the transfer completes (or if we get -EAGAIN and must poll or
                 * retry).
                 */
                req->buf_list = NULL;
                bl->head++;
        }
        return u64_to_user_ptr(buf->addr);
}

void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
                              unsigned int issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer_list *bl;
        void __user *ret = NULL;

        io_ring_submit_lock(req->ctx, issue_flags);

        bl = io_buffer_get_list(ctx, req->buf_index);
        if (likely(bl)) {
                if (bl->is_mapped)
                        ret = io_ring_buffer_select(req, len, bl, issue_flags);
                else
                        ret = io_provided_buffer_select(req, len, bl);
        }
        io_ring_submit_unlock(req->ctx, issue_flags);
        return ret;
}
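
/*
 * Usage sketch (illustrative only): a request opts into io_buffer_select()
 * by setting IOSQE_BUFFER_SELECT and naming the group in the SQE; the group
 * ID arrives here via req->buf_index. Assuming a raw SQE for a receive, the
 * userspace side looks roughly like:
 *
 *      sqe->opcode = IORING_OP_RECV;
 *      sqe->fd = sockfd;
 *      sqe->addr = 0;                  // kernel selects the buffer
 *      sqe->len = 0;                   // 0: use the selected buffer's length
 *      sqe->flags |= IOSQE_BUFFER_SELECT;
 *      sqe->buf_group = bgid;          // group provided/registered earlier
 */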

static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
{
        int i;

        ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
                             GFP_KERNEL);
        if (!ctx->io_bl)
                return -ENOMEM;

        for (i = 0; i < BGID_ARRAY; i++) {
                INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
                ctx->io_bl[i].bgid = i;
        }

        return 0;
}

static int __io_remove_buffers(struct io_ring_ctx *ctx,
                               struct io_buffer_list *bl, unsigned nbufs)
{
        unsigned i = 0;

        /* shouldn't happen */
        if (!nbufs)
                return 0;

        if (bl->is_mapped) {
                i = bl->buf_ring->tail - bl->head;
                if (bl->is_mmap) {
                        struct page *page;

                        page = virt_to_head_page(bl->buf_ring);
                        if (put_page_testzero(page))
                                free_compound_page(page);
                        bl->buf_ring = NULL;
                        bl->is_mmap = 0;
                } else if (bl->buf_nr_pages) {
                        int j;

                        for (j = 0; j < bl->buf_nr_pages; j++)
                                unpin_user_page(bl->buf_pages[j]);
                        kvfree(bl->buf_pages);
                        bl->buf_pages = NULL;
                        bl->buf_nr_pages = 0;
                }
                /* make sure it's seen as empty */
                INIT_LIST_HEAD(&bl->buf_list);
                bl->is_mapped = 0;
                return i;
        }

        /* protects io_buffers_cache */
        lockdep_assert_held(&ctx->uring_lock);

        while (!list_empty(&bl->buf_list)) {
                struct io_buffer *nxt;

                nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
                list_move(&nxt->list, &ctx->io_buffers_cache);
                if (++i == nbufs)
                        return i;
                cond_resched();
        }

        return i;
}

void io_destroy_buffers(struct io_ring_ctx *ctx)
{
        struct io_buffer_list *bl;
        unsigned long index;
        int i;

        for (i = 0; i < BGID_ARRAY; i++) {
                if (!ctx->io_bl)
                        break;
                __io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
        }

        xa_for_each(&ctx->io_bl_xa, index, bl) {
                xa_erase(&ctx->io_bl_xa, bl->bgid);
                __io_remove_buffers(ctx, bl, -1U);
                kfree(bl);
        }

        while (!list_empty(&ctx->io_buffers_pages)) {
                struct page *page;

                page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
                list_del_init(&page->lru);
                __free_page(page);
        }
}

int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
        u64 tmp;

        if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
            sqe->splice_fd_in)
                return -EINVAL;

        tmp = READ_ONCE(sqe->fd);
        if (!tmp || tmp > USHRT_MAX)
                return -EINVAL;

        memset(p, 0, sizeof(*p));
        p->nbufs = tmp;
        p->bgid = READ_ONCE(sqe->buf_group);
        return 0;
}

int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer_list *bl;
        int ret = 0;

        io_ring_submit_lock(ctx, issue_flags);

        ret = -ENOENT;
        bl = io_buffer_get_list(ctx, p->bgid);
        if (bl) {
                ret = -EINVAL;
                /* can't use provide/remove buffers command on mapped buffers */
                if (!bl->is_mapped)
                        ret = __io_remove_buffers(ctx, bl, p->nbufs);
        }
        io_ring_submit_unlock(ctx, issue_flags);
        if (ret < 0)
                req_set_fail(req);
        io_req_set_res(req, ret, 0);
        return IOU_OK;
}
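
/*
 * Usage sketch (illustrative only), matching io_remove_buffers_prep() above:
 * an IORING_OP_REMOVE_BUFFERS SQE carries the number of buffers to remove in
 * ->fd and the group in ->buf_group; all other fields must be zero.
 *
 *      sqe->opcode = IORING_OP_REMOVE_BUFFERS;
 *      sqe->fd = nr_to_remove;         // 1..USHRT_MAX
 *      sqe->buf_group = bgid;
 *
 * The CQE result is the number of buffers removed, or a negative error such
 * as -ENOENT if the group does not exist.
 */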

int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        unsigned long size, tmp_check;
        struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
        u64 tmp;

        if (sqe->rw_flags || sqe->splice_fd_in)
                return -EINVAL;

        tmp = READ_ONCE(sqe->fd);
        if (!tmp || tmp > USHRT_MAX)
                return -E2BIG;
        p->nbufs = tmp;
        p->addr = READ_ONCE(sqe->addr);
        p->len = READ_ONCE(sqe->len);

        if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
                               &size))
                return -EOVERFLOW;
        if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
                return -EOVERFLOW;

        size = (unsigned long)p->len * p->nbufs;
        if (!access_ok(u64_to_user_ptr(p->addr), size))
                return -EFAULT;

        p->bgid = READ_ONCE(sqe->buf_group);
        tmp = READ_ONCE(sqe->off);
        if (tmp > USHRT_MAX)
                return -E2BIG;
        if (tmp + p->nbufs >= USHRT_MAX)
                return -EINVAL;
        p->bid = tmp;
        return 0;
}

static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
        struct io_buffer *buf;
        struct page *page;
        int bufs_in_page;

        /*
         * Completions that don't happen inline (eg not under uring_lock) will
         * add to ->io_buffers_comp. If we don't have any free buffers, check
         * the completion list and splice those entries first.
         */
        if (!list_empty_careful(&ctx->io_buffers_comp)) {
                spin_lock(&ctx->completion_lock);
                if (!list_empty(&ctx->io_buffers_comp)) {
                        list_splice_init(&ctx->io_buffers_comp,
                                         &ctx->io_buffers_cache);
                        spin_unlock(&ctx->completion_lock);
                        return 0;
                }
                spin_unlock(&ctx->completion_lock);
        }

        /*
         * No free buffers and no completion entries either. Allocate a new
         * page worth of buffer entries and add those to our freelist.
         */
        page = alloc_page(GFP_KERNEL_ACCOUNT);
        if (!page)
                return -ENOMEM;

        list_add(&page->lru, &ctx->io_buffers_pages);

        buf = page_address(page);
        bufs_in_page = PAGE_SIZE / sizeof(*buf);
        while (bufs_in_page) {
                list_add_tail(&buf->list, &ctx->io_buffers_cache);
                buf++;
                bufs_in_page--;
        }

        return 0;
}

static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
                          struct io_buffer_list *bl)
{
        struct io_buffer *buf;
        u64 addr = pbuf->addr;
        int i, bid = pbuf->bid;

        for (i = 0; i < pbuf->nbufs; i++) {
                if (list_empty(&ctx->io_buffers_cache) &&
                    io_refill_buffer_cache(ctx))
                        break;
                buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
                                       list);
                list_move_tail(&buf->list, &bl->buf_list);
                buf->addr = addr;
                buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
                buf->bid = bid;
                buf->bgid = pbuf->bgid;
                addr += pbuf->len;
                bid++;
                cond_resched();
        }

        return i ? 0 : -ENOMEM;
}

int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer_list *bl;
        int ret = 0;

        io_ring_submit_lock(ctx, issue_flags);

        if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
                ret = io_init_bl_list(ctx);
                if (ret)
                        goto err;
        }

        bl = io_buffer_get_list(ctx, p->bgid);
        if (unlikely(!bl)) {
                bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
                if (!bl) {
                        ret = -ENOMEM;
                        goto err;
                }
                INIT_LIST_HEAD(&bl->buf_list);
                ret = io_buffer_add_list(ctx, bl, p->bgid);
                if (ret) {
                        kfree(bl);
                        goto err;
                }
        }
        /* can't add buffers via this command for a mapped buffer ring */
        if (bl->is_mapped) {
                ret = -EINVAL;
                goto err;
        }

        ret = io_add_buffers(ctx, p, bl);
err:
        io_ring_submit_unlock(ctx, issue_flags);

        if (ret < 0)
                req_set_fail(req);
        io_req_set_res(req, ret, 0);
        return IOU_OK;
}
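
/*
 * Usage sketch (illustrative only), matching io_provide_buffers_prep() above:
 * one IORING_OP_PROVIDE_BUFFERS SQE describes a contiguous region carved into
 * 'nbufs' buffers of 'buf_len' bytes each, with IDs starting at 'bid'.
 *
 *      sqe->opcode = IORING_OP_PROVIDE_BUFFERS;
 *      sqe->fd = nbufs;                        // number of buffers
 *      sqe->addr = (unsigned long) base;       // start of the region
 *      sqe->len = buf_len;                     // size of each buffer
 *      sqe->buf_group = bgid;                  // group to add them to
 *      sqe->off = bid;                         // first buffer ID
 *
 * Per io_add_buffers(), the CQE result is 0 on success; -ENOMEM is only
 * returned if no buffers at all could be added.
 */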

static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
                            struct io_buffer_list *bl)
{
        struct io_uring_buf_ring *br;
        struct page **pages;
        int nr_pages;

        pages = io_pin_pages(reg->ring_addr,
                             flex_array_size(br, bufs, reg->ring_entries),
                             &nr_pages);
        if (IS_ERR(pages))
                return PTR_ERR(pages);

        br = page_address(pages[0]);
#ifdef SHM_COLOUR
        /*
         * On platforms that have specific aliasing requirements, SHM_COLOUR
         * is set and we must guarantee that the kernel and user side align
         * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
         * the application mmap's the provided ring buffer. Fail the request
         * if we, by chance, don't end up with aligned addresses. The app
         * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
         * this transparently.
         */
        if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
                int i;

                for (i = 0; i < nr_pages; i++)
                        unpin_user_page(pages[i]);
                return -EINVAL;
        }
#endif
        bl->buf_pages = pages;
        bl->buf_nr_pages = nr_pages;
        bl->buf_ring = br;
        bl->is_mapped = 1;
        bl->is_mmap = 0;
        return 0;
}

static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
                              struct io_buffer_list *bl)
{
        gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
        size_t ring_size;
        void *ptr;

        ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
        ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
        if (!ptr)
                return -ENOMEM;

        bl->buf_ring = ptr;
        bl->is_mapped = 1;
        bl->is_mmap = 1;
        return 0;
}
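
/*
 * Note on the IOU_PBUF_RING_MMAP case handled above (illustrative only,
 * assuming the uapi IORING_OFF_PBUF_RING and IORING_OFF_PBUF_SHIFT
 * definitions): the application passes no memory in and instead mmap()s the
 * kernel-allocated ring, with the mmap offset encoding the buffer group ID:
 *
 *      off_t off = IORING_OFF_PBUF_RING |
 *                  ((off_t) bgid << IORING_OFF_PBUF_SHIFT);
 *      br = mmap(NULL, ring_entries * sizeof(struct io_uring_buf),
 *                PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *                ring_fd, off);
 */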

int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
        struct io_uring_buf_reg reg;
        struct io_buffer_list *bl, *free_bl = NULL;
        int ret;

        if (copy_from_user(&reg, arg, sizeof(reg)))
                return -EFAULT;

        if (reg.resv[0] || reg.resv[1] || reg.resv[2])
                return -EINVAL;
        if (reg.flags & ~IOU_PBUF_RING_MMAP)
                return -EINVAL;
        if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
                if (!reg.ring_addr)
                        return -EFAULT;
                if (reg.ring_addr & ~PAGE_MASK)
                        return -EINVAL;
        } else {
                if (reg.ring_addr)
                        return -EINVAL;
        }

        if (!is_power_of_2(reg.ring_entries))
                return -EINVAL;

        /* cannot disambiguate full vs empty due to head/tail size */
        if (reg.ring_entries >= 65536)
                return -EINVAL;

        if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
                int ret = io_init_bl_list(ctx);
                if (ret)
                        return ret;
        }

        bl = io_buffer_get_list(ctx, reg.bgid);
        if (bl) {
                /* if mapped buffer ring OR classic exists, don't allow */
                if (bl->is_mapped || !list_empty(&bl->buf_list))
                        return -EEXIST;
        } else {
                free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
                if (!bl)
                        return -ENOMEM;
        }

        if (!(reg.flags & IOU_PBUF_RING_MMAP))
                ret = io_pin_pbuf_ring(&reg, bl);
        else
                ret = io_alloc_pbuf_ring(&reg, bl);

        if (!ret) {
                bl->nr_entries = reg.ring_entries;
                bl->mask = reg.ring_entries - 1;

                io_buffer_add_list(ctx, bl, reg.bgid);
                return 0;
        }

        kfree(free_bl);
        return ret;
}

int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
        struct io_uring_buf_reg reg;
        struct io_buffer_list *bl;

        if (copy_from_user(&reg, arg, sizeof(reg)))
                return -EFAULT;
        if (reg.resv[0] || reg.resv[1] || reg.resv[2])
                return -EINVAL;
        if (reg.flags)
                return -EINVAL;

        bl = io_buffer_get_list(ctx, reg.bgid);
        if (!bl)
                return -ENOENT;
        if (!bl->is_mapped)
                return -EINVAL;

        __io_remove_buffers(ctx, bl, -1U);
        if (bl->bgid >= BGID_ARRAY) {
                xa_erase(&ctx->io_bl_xa, bl->bgid);
                kfree(bl);
        }
        return 0;
}

void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{
        struct io_buffer_list *bl;

        bl = io_buffer_get_list(ctx, bgid);
        if (!bl || !bl->is_mmap)
                return NULL;

        return bl->buf_ring;
}
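
/*
 * Usage sketch (illustrative only, using liburing-style helpers; a raw
 * io_uring_register(2) call with IORING_REGISTER_PBUF_RING and a
 * struct io_uring_buf_reg is equivalent):
 *
 *      struct io_uring_buf_reg reg = {
 *              .ring_addr      = (unsigned long) br,  // page aligned, app allocated
 *              .ring_entries   = 8,                   // power of 2, < 65536
 *              .bgid           = bgid,
 *      };
 *      io_uring_register_buf_ring(&ring, &reg, 0);
 *
 *      // Publish buffers by filling ring slots and advancing the tail with
 *      // a release store, which pairs with the smp_load_acquire(&br->tail)
 *      // in io_ring_buffer_select() above.
 *      io_uring_buf_ring_add(br, buf, buf_len, bid,
 *                            io_uring_buf_ring_mask(reg.ring_entries), 0);
 *      io_uring_buf_ring_advance(br, 1);
 */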