// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

static inline bool io_put_rsrc_data_ref(struct io_rsrc_data *rsrc_data)
{
	return !--rsrc_data->refs;
}

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}

static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
		       void __user *arg, unsigned index)
{
	struct iovec __user *src;

#ifdef CONFIG_COMPAT
	if (ctx->compat) {
		struct compat_iovec __user *ciovs;
		struct compat_iovec ciov;

		ciovs = (struct compat_iovec __user *) arg;
		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
			return -EFAULT;

		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
		dst->iov_len = ciov.iov_len;
		return 0;
	}
#endif
	src = (struct iovec __user *) arg;
	if (copy_from_user(dst, &src[index], sizeof(*dst)))
		return -EFAULT;
	return 0;
}

static int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
	struct io_mapped_ubuf *imu = *slot;
	unsigned int i;

	if (imu != ctx->dummy_ubuf) {
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
	*slot = NULL;
}

static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data,
				 struct io_rsrc_put *prsrc)
{
	struct io_ring_ctx *ctx = rsrc_data->ctx;

	if (prsrc->tag)
		io_post_aux_cqe(ctx, prsrc->tag, 0, 0);
	rsrc_data->do_put(ctx, prsrc);
}

static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
{
	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
	struct io_ring_ctx *ctx = rsrc_data->ctx;
	struct io_rsrc_put *prsrc, *tmp;

	if (ref_node->inline_items)
		io_rsrc_put_work_one(rsrc_data, &ref_node->item);

	list_for_each_entry_safe(prsrc, tmp, &ref_node->item_list, list) {
		list_del(&prsrc->list);
		io_rsrc_put_work_one(rsrc_data, prsrc);
		kfree(prsrc);
	}

	io_rsrc_node_destroy(rsrc_data->ctx, ref_node);
	if (io_put_rsrc_data_ref(rsrc_data))
		wake_up_all(&ctx->rsrc_quiesce_wq);
}

void io_wait_rsrc_data(struct io_rsrc_data *data)
{
	if (data)
		WARN_ON_ONCE(!io_put_rsrc_data_ref(data));
}

void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache))
		kfree(node);
}

void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
	__must_hold(&node->rsrc_data->ctx->uring_lock)
{
	struct io_ring_ctx *ctx = node->rsrc_data->ctx;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (node->refs)
			break;
		list_del(&node->node);
		__io_rsrc_put_work(node);
	}
}

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
{
	struct io_rsrc_node *ref_node;
	struct io_cache_entry *entry;

	entry = io_alloc_cache_get(&ctx->rsrc_node_cache);
	if (entry) {
		ref_node = container_of(entry, struct io_rsrc_node, cache);
	} else {
		ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
		if (!ref_node)
			return NULL;
	}

	ref_node->rsrc_data = NULL;
	ref_node->refs = 1;
	INIT_LIST_HEAD(&ref_node->node);
	INIT_LIST_HEAD(&ref_node->item_list);
	ref_node->inline_items = 0;
	return ref_node;
}

void io_rsrc_node_switch(struct io_ring_ctx *ctx,
			 struct io_rsrc_data *data_to_kill)
	__must_hold(&ctx->uring_lock)
{
	struct io_rsrc_node *node = ctx->rsrc_node;
	struct io_rsrc_node *backup = io_rsrc_node_alloc(ctx);

	if (WARN_ON_ONCE(!backup))
		return;

	data_to_kill->refs++;
	node->rsrc_data = data_to_kill;
	list_add_tail(&node->node, &ctx->rsrc_ref_list);
	/* put master ref */
	io_put_rsrc_node(ctx, node);
	ctx->rsrc_node = backup;
}

int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
{
	if (io_alloc_cache_empty(&ctx->rsrc_node_cache)) {
		struct io_rsrc_node *node = kzalloc(sizeof(*node), GFP_KERNEL);

		if (!node)
			return -ENOMEM;
		io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache);
	}
	return 0;
}
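/*
 * Retire the table's initial reference and wait for every outstanding rsrc
 * node that still refers to it to be put. Note that ->uring_lock is dropped
 * while sleeping on rsrc_quiesce_wq and reacquired before returning.
 */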
__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	DEFINE_WAIT(we);
	int ret;

	/* As we may drop ->uring_lock, other task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	io_rsrc_node_switch(ctx, data);

	/* kill initial ref */
	if (io_put_rsrc_data_ref(data))
		return 0;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 1);
		smp_mb();
	}

	data->quiesce = true;
	do {
		prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
		mutex_unlock(&ctx->uring_lock);

		ret = io_run_task_work_sig(ctx);
		if (ret < 0) {
			mutex_lock(&ctx->uring_lock);
			if (!data->refs) {
				ret = 0;
			} else {
				/* restore the master reference */
				data->refs++;
			}
			break;
		}

		schedule();
		__set_current_state(TASK_RUNNING);
		mutex_lock(&ctx->uring_lock);
		ret = 0;
	} while (data->refs);

	finish_wait(&ctx->rsrc_quiesce_wq, &we);
	data->quiesce = false;
	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 0);
		smp_mb();
	}
	return ret;
}

static void io_free_page_table(void **table, size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < nr_tables; i++)
		kfree(table[i]);
	kfree(table);
}

static void io_rsrc_data_free(struct io_rsrc_data *data)
{
	size_t size = data->nr * sizeof(data->tags[0][0]);

	if (data->tags)
		io_free_page_table((void **)data->tags, size);
	kfree(data);
}

static __cold void **io_alloc_page_table(size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t init_size = size;
	void **table;

	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
	if (!table)
		return NULL;

	for (i = 0; i < nr_tables; i++) {
		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);

		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
		if (!table[i]) {
			io_free_page_table(table, init_size);
			return NULL;
		}
		size -= this_size;
	}
	return table;
}

__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
				     rsrc_put_fn *do_put, u64 __user *utags,
				     unsigned nr, struct io_rsrc_data **pdata)
{
	struct io_rsrc_data *data;
	int ret = 0;
	unsigned i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
	if (!data->tags) {
		kfree(data);
		return -ENOMEM;
	}

	data->nr = nr;
	data->ctx = ctx;
	data->do_put = do_put;
	data->refs = 1;
	if (utags) {
		ret = -EFAULT;
		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
					   sizeof(*tag_slot)))
				goto fail;
		}
	}
	*pdata = data;
	return 0;
fail:
	io_rsrc_data_free(data);
	return ret;
}
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	struct file *file;
	int fd, i, err = 0;
	unsigned int done;
	bool needs_switch = false;

	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
			err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
			if (err)
				break;
			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);
			needs_switch = true;
		}
		if (fd != -1) {
			file = fget(fd);
			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered. If
			 * UNIX isn't enabled, then this causes a reference
			 * cycle and this instance can never get freed. If UNIX
			 * is enabled we'll handle it just fine, but there's
			 * still no point in allowing a ring fd as it doesn't
			 * support regular read/write anyway.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			err = io_scm_file_account(ctx, file);
			if (err) {
				fput(file);
				break;
			}
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, data);
	return done ? done : err;
}

static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
	struct page *last_hpage = NULL;
	bool needs_switch = false;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		int offset = up->offset + done;
		u64 tag = 0;

		err = io_copy_iov(ctx, &iov, iovs, done);
		if (err)
			break;
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(&iov);
		if (err)
			break;
		if (!iov.iov_base && tag) {
			err = -EINVAL;
			break;
		}
		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
		if (err)
			break;

		i = array_index_nospec(offset, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, i,
						    ctx->rsrc_node, ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
				break;
			}
			ctx->user_bufs[i] = ctx->dummy_ubuf;
			needs_switch = true;
		}

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, i) = tag;
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, ctx->buf_data);
	return done ? done : err;
}
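/*
 * Common entry point for IORING_RSRC_FILE and IORING_RSRC_BUFFER updates:
 * validate the offset range and make sure a spare rsrc node is available
 * before dispatching to the type-specific update helper.
 */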
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;
	int err;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;
	err = io_rsrc_node_switch_start(ctx);
	if (err)
		return err;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_data)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}
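/*
 * Defer putting a resource: stash it (and its completion tag) on the given
 * rsrc node so it is dropped once that node's references reach zero. The
 * first item per node uses the embedded inline slot to avoid an allocation.
 */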
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
			  struct io_rsrc_node *node, void *rsrc)
{
	u64 *tag_slot = io_get_tag_slot(data, idx);
	struct io_rsrc_put *prsrc;
	bool inline_item = true;

	if (!node->inline_items) {
		prsrc = &node->item;
		node->inline_items++;
	} else {
		prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
		if (!prsrc)
			return -ENOMEM;
		inline_item = false;
	}

	prsrc->tag = *tag_slot;
	*tag_slot = 0;
	prsrc->rsrc = rsrc;
	if (!inline_item)
		list_add(&prsrc->list, &node->item_list);
	return 0;
}

void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		/* skip scm accounted files, they'll be freed by ->ring_sock */
		if (!file || io_file_need_scm(file))
			continue;
		io_file_bitmap_clear(&ctx->file_table, i);
		fput(file);
	}

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		struct sock *sock = ctx->ring_sock->sk;
		struct sk_buff *skb;

		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
			kfree_skb(skb);
	}
#endif
	io_free_file_tables(&ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_files;
	int ret;

	if (!ctx->file_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;
	if (!ret)
		__io_sqe_files_unregister(ctx);
	return ret;
}

/*
 * Ensure the UNIX gc is aware of our file set, so we are certain that
 * the io_uring can be safely unregistered on process exit, even if we have
 * loops in the file referencing. We account only files that can hold other
 * files because otherwise they can't form a loop and so are not interesting
 * for GC.
 */
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sk = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sk->sk_receive_queue;
	struct scm_fp_list *fpl;
	struct sk_buff *skb;

	if (likely(!io_file_need_scm(file)))
		return 0;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
		__skb_unlink(skb, head);
	else
		skb = NULL;
	spin_unlock_irq(&head->lock);

	if (!skb) {
		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
		if (!fpl)
			return -ENOMEM;

		skb = alloc_skb(0, GFP_KERNEL);
		if (!skb) {
			kfree(fpl);
			return -ENOMEM;
		}

		fpl->user = get_uid(current_user());
		fpl->max = SCM_MAX_FD;
		fpl->count = 0;

		UNIXCB(skb).fp = fpl;
		skb->sk = sk;
		skb->scm_io_uring = 1;
		skb->destructor = unix_destruct_scm;
		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
	}

	fpl = UNIXCB(skb).fp;
	fpl->fp[fpl->count++] = get_file(file);
	unix_inflight(fpl->user, file);
	skb_queue_head(head, skb);
	fput(file);
#endif
	return 0;
}

static __cold void io_rsrc_file_scm_put(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
					left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			file = NULL;
			break;
		}

		if (!file)
			break;

		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#endif
}

static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	struct file *file = prsrc->file;

	if (likely(!io_file_need_scm(file)))
		fput(file);
	else
		io_rsrc_file_scm_put(ctx, file);
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto fail;
		}
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = io_scm_file_account(ctx, file);
		if (ret) {
			fput(file);
			goto fail;
		}
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
	return 0;
fail:
	__io_sqe_files_unregister(ctx);
	return ret;
}

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	io_buffer_unmap(ctx, &prsrc->buf);
	prsrc->buf = NULL;
}

void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned int i;

	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_bufs;
	int ret;

	if (!ctx->buf_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
	if (!ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

/*
 * Not super efficient, but this is just a registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we don't
 * match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}
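/*
 * Pin the user pages backing [ubuf, ubuf + len) for long-term use. The range
 * must be backed by a single file (or be anonymous), and file-backed memory
 * other than shmem or hugetlbfs is rejected. Returns the pinned page array,
 * which the caller owns and must kvfree(), or an ERR_PTR on failure.
 */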
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
{
	unsigned long start, end, nr_pages;
	struct vm_area_struct **vmas = NULL;
	struct page **pages = NULL;
	int i, pret, ret = -ENOMEM;

	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	start = ubuf >> PAGE_SHIFT;
	nr_pages = end - start;

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto done;

	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
			      GFP_KERNEL);
	if (!vmas)
		goto done;

	ret = 0;
	mmap_read_lock(current->mm);
	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			      pages, vmas);
	if (pret == nr_pages) {
		struct file *file = vmas[0]->vm_file;

		/* don't support file backed memory */
		for (i = 0; i < nr_pages; i++) {
			if (vmas[i]->vm_file != file) {
				ret = -EINVAL;
				break;
			}
			if (!file)
				continue;
			if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) {
				ret = -EOPNOTSUPP;
				break;
			}
		}
		*npages = nr_pages;
	} else {
		ret = pret < 0 ? pret : -EFAULT;
	}
	mmap_read_unlock(current->mm);
	if (ret) {
		/*
		 * if we did partial map, or found file backed vmas,
		 * release any pages we did get
		 */
		if (pret > 0)
			unpin_user_pages(pages, pret);
		goto done;
	}
	ret = 0;
done:
	kvfree(vmas);
	if (ret < 0) {
		kvfree(pages);
		pages = ERR_PTR(ret);
	}
	return pages;
}

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct folio *folio = NULL;

	*pimu = ctx->dummy_ubuf;
	if (!iov->iov_base)
		return 0;

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's a huge page, try to coalesce them into a single bvec entry */
	if (nr_pages > 1) {
		folio = page_folio(pages[0]);
		for (i = 1; i < nr_pages; i++) {
			if (page_folio(pages[i]) != folio) {
				folio = NULL;
				break;
			}
		}
		if (folio) {
			/*
			 * The pages are bound to the folio, it doesn't
			 * actually unpin them but drops all but one reference,
			 * which is usually put down by io_buffer_unmap().
			 * Note, needs a better helper.
			 */
			unpin_user_pages(&pages[1], nr_pages - 1);
			nr_pages = 1;
		}
	}

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->ubuf_end = imu->ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;
	*pimu = imu;
	ret = 0;

	if (folio) {
		bvec_set_page(&imu->bvec[0], pages[0], size, off);
		goto done;
	}
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret)
		kvfree(imu);
	kvfree(pages);
	return ret;
}
static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	int i, ret;
	struct iovec iov;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
	if (ret)
		return ret;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		if (arg) {
			ret = io_copy_iov(ctx, &iov, arg, i);
			if (ret)
				break;
			ret = io_buffer_validate(&iov);
			if (ret)
				break;
		} else {
			memset(&iov, 0, sizeof(iov));
		}

		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
	}

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

int io_import_fixed(int ddir, struct iov_iter *iter,
		    struct io_mapped_ubuf *imu,
		    u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * Might not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec, just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			/*
			 * Note, huge pages buffers consists of one large
			 * bvec entry and should always go this way. The other
			 * branch doesn't expect non PAGE_SIZE'd chunks.
			 */
			iter->bvec = bvec;
			iter->nr_segs = bvec->bv_len;
			iter->count -= offset;
			iter->iov_offset = offset;
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}