// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}
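/*
 * Illustrative example (editor's note, not part of the original source):
 * with 4 KiB pages and RLIMIT_MEMLOCK set to 64 MiB, page_limit above is
 * 16384. If locked_vm currently reads 16000 and a registration tries to
 * account 1024 more pages, new_pages would be 17024 > 16384 and we fail
 * with -ENOMEM. The try_cmpxchg loop simply retries the addition if some
 * other task changed locked_vm between the read and the update, so the
 * check-and-add is atomic without taking a lock.
 */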
static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}

static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
		       void __user *arg, unsigned index)
{
	struct iovec __user *src;

#ifdef CONFIG_COMPAT
	if (ctx->compat) {
		struct compat_iovec __user *ciovs;
		struct compat_iovec ciov;

		ciovs = (struct compat_iovec __user *) arg;
		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
			return -EFAULT;

		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
		dst->iov_len = ciov.iov_len;
		return 0;
	}
#endif
	src = (struct iovec __user *) arg;
	if (copy_from_user(dst, &src[index], sizeof(*dst)))
		return -EFAULT;
	return 0;
}

static int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
	struct io_mapped_ubuf *imu = *slot;
	unsigned int i;

	if (imu != ctx->dummy_ubuf) {
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
	*slot = NULL;
}

static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data,
				 struct io_rsrc_put *prsrc)
{
	struct io_ring_ctx *ctx = rsrc_data->ctx;

	if (prsrc->tag)
		io_post_aux_cqe(ctx, prsrc->tag, 0, 0);
	rsrc_data->do_put(ctx, prsrc);
}

static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
{
	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
	struct io_rsrc_put *prsrc, *tmp;

	if (ref_node->inline_items)
		io_rsrc_put_work_one(rsrc_data, &ref_node->item);

	list_for_each_entry_safe(prsrc, tmp, &ref_node->item_list, list) {
		list_del(&prsrc->list);
		io_rsrc_put_work_one(rsrc_data, prsrc);
		kfree(prsrc);
	}

	io_rsrc_node_destroy(rsrc_data->ctx, ref_node);
	if (atomic_dec_and_test(&rsrc_data->refs))
		complete(&rsrc_data->done);
}

void io_wait_rsrc_data(struct io_rsrc_data *data)
{
	if (data && !atomic_dec_and_test(&data->refs))
		wait_for_completion(&data->done);
}

void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache))
		kfree(node);
}

void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
	__must_hold(&node->rsrc_data->ctx->uring_lock)
{
	struct io_ring_ctx *ctx = node->rsrc_data->ctx;

	node->done = true;
	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (!node->done)
			break;

		list_del(&node->node);
		__io_rsrc_put_work(node);
	}
}

static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
{
	struct io_rsrc_node *ref_node;
	struct io_cache_entry *entry;

	entry = io_alloc_cache_get(&ctx->rsrc_node_cache);
	if (entry) {
		ref_node = container_of(entry, struct io_rsrc_node, cache);
	} else {
		ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
		if (!ref_node)
			return NULL;
	}

	ref_node->refs = 1;
	INIT_LIST_HEAD(&ref_node->node);
	INIT_LIST_HEAD(&ref_node->item_list);
	ref_node->done = false;
	ref_node->inline_items = 0;
	return ref_node;
}

void io_rsrc_node_switch(struct io_ring_ctx *ctx,
			 struct io_rsrc_data *data_to_kill)
	__must_hold(&ctx->uring_lock)
{
	WARN_ON_ONCE(!ctx->rsrc_backup_node);
	WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);

	if (data_to_kill) {
		struct io_rsrc_node *rsrc_node = ctx->rsrc_node;

		rsrc_node->rsrc_data = data_to_kill;
		list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);

		atomic_inc(&data_to_kill->refs);
		/* put master ref */
		io_put_rsrc_node(ctx, rsrc_node);
		ctx->rsrc_node = NULL;
	}

	if (!ctx->rsrc_node) {
		ctx->rsrc_node = ctx->rsrc_backup_node;
		ctx->rsrc_backup_node = NULL;
	}
}

int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
{
	if (ctx->rsrc_backup_node)
		return 0;
	ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
}

__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	int ret;

	/* As we may drop ->uring_lock, another task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	io_rsrc_node_switch(ctx, data);

	/* kill initial ref, already quiesced if zero */
	if (atomic_dec_and_test(&data->refs))
		return 0;

	data->quiesce = true;
	mutex_unlock(&ctx->uring_lock);
	do {
		ret = io_run_task_work_sig(ctx);
		if (ret < 0) {
			atomic_inc(&data->refs);
			/* wait for all works potentially completing data->done */
			reinit_completion(&data->done);
			mutex_lock(&ctx->uring_lock);
			break;
		}

		ret = wait_for_completion_interruptible(&data->done);
		if (!ret) {
			mutex_lock(&ctx->uring_lock);
			if (atomic_read(&data->refs) <= 0)
				break;
			/*
			 * it has been revived by another thread while
			 * we were unlocked
			 */
			mutex_unlock(&ctx->uring_lock);
		}
	} while (1);
	data->quiesce = false;

	return ret;
}

static void io_free_page_table(void **table, size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < nr_tables; i++)
		kfree(table[i]);
	kfree(table);
}

static void io_rsrc_data_free(struct io_rsrc_data *data)
{
	size_t size = data->nr * sizeof(data->tags[0][0]);

	if (data->tags)
		io_free_page_table((void **)data->tags, size);
	kfree(data);
}

static __cold void **io_alloc_page_table(size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t init_size = size;
	void **table;

	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
	if (!table)
		return NULL;

	for (i = 0; i < nr_tables; i++) {
		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);

		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
		if (!table[i]) {
			io_free_page_table(table, init_size);
			return NULL;
		}
		size -= this_size;
	}
	return table;
}
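/*
 * Worked example (editor's illustration): for 10000 registered files the
 * tag table needs 10000 * sizeof(u64) = 80000 bytes. With 4 KiB pages
 * that is DIV_ROUND_UP(80000, 4096) = 20 chunks: nineteen full 4096-byte
 * allocations and a final one of 80000 - 19 * 4096 = 2176 bytes. Capping
 * each chunk at a page avoids one large allocation for big resource
 * tables.
 */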
__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
				     rsrc_put_fn *do_put, u64 __user *utags,
				     unsigned nr, struct io_rsrc_data **pdata)
{
	struct io_rsrc_data *data;
	int ret = 0;
	unsigned i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
	if (!data->tags) {
		kfree(data);
		return -ENOMEM;
	}

	data->nr = nr;
	data->ctx = ctx;
	data->do_put = do_put;
	if (utags) {
		ret = -EFAULT;
		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
					   sizeof(*tag_slot)))
				goto fail;
		}
	}

	atomic_set(&data->refs, 1);
	init_completion(&data->done);
	*pdata = data;
	return 0;
fail:
	io_rsrc_data_free(data);
	return ret;
}

static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	struct file *file;
	int fd, i, err = 0;
	unsigned int done;
	bool needs_switch = false;

	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
			err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
			if (err)
				break;
			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);
			needs_switch = true;
		}
		if (fd != -1) {
			file = fget(fd);
			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered. If
			 * UNIX isn't enabled, then this causes a reference
			 * cycle and this instance can never get freed. If UNIX
			 * is enabled we'll handle it just fine, but there's
			 * still no point in allowing a ring fd as it doesn't
			 * support regular read/write anyway.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			err = io_scm_file_account(ctx, file);
			if (err) {
				fput(file);
				break;
			}
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, data);
	return done ? done : err;
}
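/*
 * Note on the return convention above (editor's illustration): like the
 * other update helpers in this file, the function reports partial
 * progress. If, say, 8 of 16 requested slot updates were applied before
 * an fget() failure, it returns 8 and the caller sees a short count; the
 * error code is only returned when not a single slot was updated.
 */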
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
	struct page *last_hpage = NULL;
	bool needs_switch = false;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		int offset = up->offset + done;
		u64 tag = 0;

		err = io_copy_iov(ctx, &iov, iovs, done);
		if (err)
			break;
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(&iov);
		if (err)
			break;
		if (!iov.iov_base && tag) {
			err = -EINVAL;
			break;
		}
		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
		if (err)
			break;

		i = array_index_nospec(offset, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, i,
						    ctx->rsrc_node, ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
				break;
			}
			ctx->user_bufs[i] = ctx->dummy_ubuf;
			needs_switch = true;
		}

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, offset) = tag;
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, ctx->buf_data);
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;
	int err;

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;
	err = io_rsrc_node_switch_start(ctx);
	if (err)
		return err;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}
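/*
 * Usage sketch (editor's illustration, pseudo-code for the
 * io_uring_register(2) syscall, not part of the original source): a
 * sparse table is registered with IORING_RSRC_REGISTER_SPARSE and
 * data == 0, then individual slots are filled in later via the update
 * path above, e.g.:
 *
 *	struct io_uring_rsrc_register rr = {
 *		.nr = 1024,
 *		.flags = IORING_RSRC_REGISTER_SPARSE,
 *	};
 *	io_uring_register(ring_fd, IORING_REGISTER_FILES2, &rr, sizeof(rr));
 *
 * followed by IORING_REGISTER_FILES_UPDATE2 with an fd array for the
 * slots that should actually hold files.
 */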
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_data)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}
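/*
 * Example flow (editor's illustration): an IORING_OP_FILES_UPDATE SQE
 * with off == IORING_FILE_INDEX_ALLOC and addr pointing at an array
 * { 5, 9, 12 } asks the kernel to pick free fixed-file slots for those
 * three fds. io_files_update_with_index_alloc() writes each allocated
 * slot index back over the corresponding fd in the user array, so on
 * completion the array might read { 0, 1, 2 }. If copying an index back
 * fails, the just-installed file is closed again before the short count
 * (or -EFAULT) is reported.
 */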
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
			  struct io_rsrc_node *node, void *rsrc)
{
	u64 *tag_slot = io_get_tag_slot(data, idx);
	struct io_rsrc_put *prsrc;
	bool inline_item = true;

	if (!node->inline_items) {
		prsrc = &node->item;
		node->inline_items++;
	} else {
		prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
		if (!prsrc)
			return -ENOMEM;
		inline_item = false;
	}

	prsrc->tag = *tag_slot;
	*tag_slot = 0;
	prsrc->rsrc = rsrc;
	if (!inline_item)
		list_add(&prsrc->list, &node->item_list);
	return 0;
}

void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		/* skip scm accounted files, they'll be freed by ->ring_sock */
		if (!file || io_file_need_scm(file))
			continue;
		io_file_bitmap_clear(&ctx->file_table, i);
		fput(file);
	}

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		struct sock *sock = ctx->ring_sock->sk;
		struct sk_buff *skb;

		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
			kfree_skb(skb);
	}
#endif
	io_free_file_tables(&ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_files;
	int ret;

	if (!ctx->file_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;
	if (!ret)
		__io_sqe_files_unregister(ctx);
	return ret;
}
/*
 * Ensure the UNIX gc is aware of our file set, so we are certain that
 * the io_uring can be safely unregistered on process exit, even if we have
 * loops in the file referencing. We account only files that can hold other
 * files because otherwise they can't form a loop and so are not interesting
 * for GC.
 */
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sk = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sk->sk_receive_queue;
	struct scm_fp_list *fpl;
	struct sk_buff *skb;

	if (likely(!io_file_need_scm(file)))
		return 0;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
		__skb_unlink(skb, head);
	else
		skb = NULL;
	spin_unlock_irq(&head->lock);

	if (!skb) {
		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
		if (!fpl)
			return -ENOMEM;

		skb = alloc_skb(0, GFP_KERNEL);
		if (!skb) {
			kfree(fpl);
			return -ENOMEM;
		}

		fpl->user = get_uid(current_user());
		fpl->max = SCM_MAX_FD;
		fpl->count = 0;

		UNIXCB(skb).fp = fpl;
		skb->sk = sk;
		skb->scm_io_uring = 1;
		skb->destructor = unix_destruct_scm;
		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
	}

	fpl = UNIXCB(skb).fp;
	fpl->fp[fpl->count++] = get_file(file);
	unix_inflight(fpl->user, file);
	skb_queue_head(head, skb);
	fput(file);
#endif
	return 0;
}
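/*
 * Illustration (editor's note, not part of the original source):
 * SCM_MAX_FD is 253, so if 300 files that need SCM accounting (e.g. unix
 * sockets) are registered into the fixed file table, the first 253 fill
 * one SCM_RIGHTS skb on ->ring_sock's receive queue and the remaining 47
 * go into a second skb. Each such file also gains a reference via
 * get_file() and is marked in-flight for the UNIX garbage collector,
 * which is what lets the GC break reference cycles through the ring.
 */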
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	if (!io_file_need_scm(file)) {
		fput(file);
		return;
	}

	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
					left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			file = NULL;
			break;
		}

		if (!file)
			break;

		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#else
	fput(file);
#endif
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto fail;
		}
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = io_scm_file_account(ctx, file);
		if (ret) {
			fput(file);
			goto fail;
		}
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
	io_rsrc_node_switch(ctx, NULL);
	return 0;
fail:
	__io_sqe_files_unregister(ctx);
	return ret;
}
static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	io_buffer_unmap(ctx, &prsrc->buf);
	prsrc->buf = NULL;
}

void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned int i;

	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_bufs;
	int ret;

	if (!ctx->buf_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
	if (!ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

/*
 * Not super efficient, but this is just registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we
 * don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}
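/*
 * Worked example (editor's illustration): pinning 16 KiB that sits inside
 * a 2 MiB huge page yields four constituent pages that all share one
 * compound head. The first one charges the full huge page,
 * page_size(hpage) >> PAGE_SHIFT = 512 pages with 4 KiB base pages; the
 * other three match *last_hpage (or headpage_already_acct()) and are
 * skipped, so the buffer is accounted once at the huge page's real
 * memory cost rather than four times.
 */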
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
{
	unsigned long start, end, nr_pages;
	struct vm_area_struct **vmas = NULL;
	struct page **pages = NULL;
	int i, pret, ret = -ENOMEM;

	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	start = ubuf >> PAGE_SHIFT;
	nr_pages = end - start;

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto done;

	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
			      GFP_KERNEL);
	if (!vmas)
		goto done;

	ret = 0;
	mmap_read_lock(current->mm);
	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			      pages, vmas);
	if (pret == nr_pages) {
		struct file *file = vmas[0]->vm_file;

		/* don't support file backed memory */
		for (i = 0; i < nr_pages; i++) {
			if (vmas[i]->vm_file != file) {
				ret = -EINVAL;
				break;
			}
			if (!file)
				continue;
			if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) {
				ret = -EOPNOTSUPP;
				break;
			}
		}
		*npages = nr_pages;
	} else {
		ret = pret < 0 ? pret : -EFAULT;
	}
	mmap_read_unlock(current->mm);
	if (ret) {
		/*
		 * if we did partial map, or found file backed vmas,
		 * release any pages we did get
		 */
		if (pret > 0)
			unpin_user_pages(pages, pret);
		goto done;
	}
	ret = 0;
done:
	kvfree(vmas);
	if (ret < 0) {
		kvfree(pages);
		pages = ERR_PTR(ret);
	}
	return pages;
}
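/*
 * Arithmetic example (editor's illustration): with 4 KiB pages,
 * ubuf = 0x1fff and len = 2 spans two pages even though only two bytes
 * are requested: start = 0x1fff >> 12 = 1, and
 * end = (0x1fff + 2 + 0xfff) >> 12 = 3, so nr_pages = 2. Rounding the
 * unaligned head and tail out to page boundaries like this is what the
 * bvec setup in io_sqe_buffer_register() later compensates for via the
 * 'off' offset.
 */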
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct folio *folio = NULL;

	*pimu = ctx->dummy_ubuf;
	if (!iov->iov_base)
		return 0;

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's a huge page, try to coalesce them into a single bvec entry */
	if (nr_pages > 1) {
		folio = page_folio(pages[0]);
		for (i = 1; i < nr_pages; i++) {
			if (page_folio(pages[i]) != folio) {
				folio = NULL;
				break;
			}
		}
		if (folio) {
			/*
			 * The pages are bound to the folio, it doesn't
			 * actually unpin them but drops all but one reference,
			 * which is usually put down by io_buffer_unmap().
			 * Note, needs a better helper.
			 */
			unpin_user_pages(&pages[1], nr_pages - 1);
			nr_pages = 1;
		}
	}

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->ubuf_end = imu->ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;
	*pimu = imu;
	ret = 0;

	if (folio) {
		bvec_set_page(&imu->bvec[0], pages[0], size, off);
		goto done;
	}
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret)
		kvfree(imu);
	kvfree(pages);
	return ret;
}

static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	int i, ret;
	struct iovec iov;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
	if (ret)
		return ret;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		if (arg) {
			ret = io_copy_iov(ctx, &iov, arg, i);
			if (ret)
				break;
			ret = io_buffer_validate(&iov);
			if (ret)
				break;
		} else {
			memset(&iov, 0, sizeof(iov));
		}

		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
	}

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	else
		io_rsrc_node_switch(ctx, NULL);
	return ret;
}

int io_import_fixed(int ddir, struct iov_iter *iter,
		    struct io_mapped_ubuf *imu,
		    u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * Might not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec), just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be PAGE_SIZE aligned.
		 */
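		/*
		 * Worked example (editor's illustration, assuming 4 KiB
		 * pages and a page-aligned, multi-bvec buffer): for
		 * offset = 13312 the first bvec covers bytes 0..4095, so
		 * the else branch below runs with offset -= 4096 = 9216
		 * and seg_skip = 1 + (9216 >> 12) = 3. The iterator then
		 * starts at bvec[3] (bytes 12288..16383) with
		 * iov_offset = 9216 & 4095 = 1024, i.e. byte
		 * 12288 + 1024 = 13312 of the registered buffer.
		 */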
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			/*
			 * Note, huge page buffers consist of one large
			 * bvec entry and should always go this way. The other
			 * branch doesn't expect non PAGE_SIZE'd chunks.
			 */
			iter->bvec = bvec;
			iter->nr_segs = bvec->bv_len;
			iter->count -= offset;
			iter->iov_offset = offset;
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}