// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

static inline bool io_put_rsrc_data_ref(struct io_rsrc_data *rsrc_data)
{
	return !--rsrc_data->refs;
}

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}

static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
		       void __user *arg, unsigned index)
{
	struct iovec __user *src;

#ifdef CONFIG_COMPAT
	if (ctx->compat) {
		struct compat_iovec __user *ciovs;
		struct compat_iovec ciov;

		ciovs = (struct compat_iovec __user *) arg;
		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
			return -EFAULT;

		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
		dst->iov_len = ciov.iov_len;
		return 0;
	}
#endif
	src = (struct iovec __user *) arg;
	if (copy_from_user(dst, &src[index], sizeof(*dst)))
		return -EFAULT;
	return 0;
}

static int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
	struct io_mapped_ubuf *imu = *slot;
	unsigned int i;

	if (imu != ctx->dummy_ubuf) {
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
	*slot = NULL;
}

static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data,
				 struct io_rsrc_put *prsrc)
{
	struct io_ring_ctx *ctx = rsrc_data->ctx;

	if (prsrc->tag)
		io_post_aux_cqe(ctx, prsrc->tag, 0, 0);
	rsrc_data->do_put(ctx, prsrc);
}

static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
{
	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
	struct io_ring_ctx *ctx = rsrc_data->ctx;
	struct io_rsrc_put *prsrc, *tmp;

	if (ref_node->inline_items)
		io_rsrc_put_work_one(rsrc_data, &ref_node->item);

	list_for_each_entry_safe(prsrc, tmp, &ref_node->item_list, list) {
		list_del(&prsrc->list);
		io_rsrc_put_work_one(rsrc_data, prsrc);
		kfree(prsrc);
	}

	io_rsrc_node_destroy(rsrc_data->ctx, ref_node);
	if (io_put_rsrc_data_ref(rsrc_data))
		wake_up_all(&ctx->rsrc_quiesce_wq);
}

void io_wait_rsrc_data(struct io_rsrc_data *data)
{
	if (data)
		WARN_ON_ONCE(!io_put_rsrc_data_ref(data));
}

void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache))
		kfree(node);
}

void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
	__must_hold(&node->rsrc_data->ctx->uring_lock)
{
	struct io_ring_ctx *ctx = node->rsrc_data->ctx;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (node->refs)
			break;
		list_del(&node->node);
		__io_rsrc_put_work(node);
	}
}

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
{
	struct io_rsrc_node *ref_node;
	struct io_cache_entry *entry;

	entry = io_alloc_cache_get(&ctx->rsrc_node_cache);
	if (entry) {
		ref_node = container_of(entry, struct io_rsrc_node, cache);
	} else {
		ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
		if (!ref_node)
			return NULL;
	}

	ref_node->rsrc_data = NULL;
	ref_node->refs = 1;
	INIT_LIST_HEAD(&ref_node->node);
	INIT_LIST_HEAD(&ref_node->item_list);
	ref_node->inline_items = 0;
	return ref_node;
}

void io_rsrc_node_switch(struct io_ring_ctx *ctx,
			 struct io_rsrc_data *data_to_kill)
	__must_hold(&ctx->uring_lock)
{
	struct io_rsrc_node *node = ctx->rsrc_node;
	struct io_rsrc_node *backup = io_rsrc_node_alloc(ctx);

	if (WARN_ON_ONCE(!backup))
		return;

	data_to_kill->refs++;
	node->rsrc_data = data_to_kill;
	list_add_tail(&node->node, &ctx->rsrc_ref_list);
	/* put master ref */
	io_put_rsrc_node(ctx, node);
	ctx->rsrc_node = backup;
}

int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
{
	if (io_alloc_cache_empty(&ctx->rsrc_node_cache)) {
		struct io_rsrc_node *node = kzalloc(sizeof(*node), GFP_KERNEL);
		if (!node)
			return -ENOMEM;
		io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache);
	}
	return 0;
}

__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	DEFINE_WAIT(we);
	int ret;

	/* As we may drop ->uring_lock, other task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	io_rsrc_node_switch(ctx, data);

	/* kill initial ref */
	if (io_put_rsrc_data_ref(data))
		return 0;

	data->quiesce = true;
	do {
		prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
		mutex_unlock(&ctx->uring_lock);

		ret = io_run_task_work_sig(ctx);
		if (ret < 0) {
			mutex_lock(&ctx->uring_lock);
			if (!data->refs) {
				ret = 0;
			} else {
				/* restore the master reference */
				data->refs++;
			}
			break;
		}

		schedule();
		__set_current_state(TASK_RUNNING);
		mutex_lock(&ctx->uring_lock);
		ret = 0;
	} while (data->refs);

	finish_wait(&ctx->rsrc_quiesce_wq, &we);
	data->quiesce = false;
	return ret;
}

static void io_free_page_table(void **table, size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < nr_tables; i++)
		kfree(table[i]);
	kfree(table);
}

static void io_rsrc_data_free(struct io_rsrc_data *data)
{
	size_t size = data->nr * sizeof(data->tags[0][0]);

	if (data->tags)
		io_free_page_table((void **)data->tags, size);
	kfree(data);
}

static __cold void **io_alloc_page_table(size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t init_size = size;
	void **table;

	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
	if (!table)
		return NULL;

	for (i = 0; i < nr_tables; i++) {
		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);

		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
		if (!table[i]) {
			io_free_page_table(table, init_size);
			return NULL;
		}
		size -= this_size;
	}
	return table;
}

__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
				     rsrc_put_fn *do_put, u64 __user *utags,
				     unsigned nr, struct io_rsrc_data **pdata)
{
	struct io_rsrc_data *data;
	int ret = 0;
	unsigned i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
	if (!data->tags) {
		kfree(data);
		return -ENOMEM;
	}

	data->nr = nr;
	data->ctx = ctx;
	data->do_put = do_put;
	data->refs = 1;
	if (utags) {
		ret = -EFAULT;
		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
					   sizeof(*tag_slot)))
				goto fail;
		}
	}
	*pdata = data;
	return 0;
fail:
	io_rsrc_data_free(data);
	return ret;
}

static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	struct file *file;
	int fd, i, err = 0;
	unsigned int done;
	bool needs_switch = false;

	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
			err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
			if (err)
				break;
			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);
			needs_switch = true;
		}
		if (fd != -1) {
			file = fget(fd);
			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered. If
			 * UNIX isn't enabled, then this causes a reference
			 * cycle and this instance can never get freed. If UNIX
			 * is enabled we'll handle it just fine, but there's
			 * still no point in allowing a ring fd as it doesn't
			 * support regular read/write anyway.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			err = io_scm_file_account(ctx, file);
			if (err) {
				fput(file);
				break;
			}
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, data);
	return done ? done : err;
}

static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
	struct page *last_hpage = NULL;
	bool needs_switch = false;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		int offset = up->offset + done;
		u64 tag = 0;

		err = io_copy_iov(ctx, &iov, iovs, done);
		if (err)
			break;
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(&iov);
		if (err)
			break;
		if (!iov.iov_base && tag) {
			err = -EINVAL;
			break;
		}
		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
		if (err)
			break;

		i = array_index_nospec(offset, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, i,
						    ctx->rsrc_node, ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
				break;
			}
			ctx->user_bufs[i] = ctx->dummy_ubuf;
			needs_switch = true;
		}

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, i) = tag;
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, ctx->buf_data);
	return done ? done : err;
}
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;
	int err;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;
	err = io_rsrc_node_switch_start(ctx);
	if (err)
		return err;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_data)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
			  struct io_rsrc_node *node, void *rsrc)
{
	u64 *tag_slot = io_get_tag_slot(data, idx);
	struct io_rsrc_put *prsrc;
	bool inline_item = true;

	if (!node->inline_items) {
		prsrc = &node->item;
		node->inline_items++;
	} else {
		prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
		if (!prsrc)
			return -ENOMEM;
		inline_item = false;
	}

	prsrc->tag = *tag_slot;
	*tag_slot = 0;
	prsrc->rsrc = rsrc;
	if (!inline_item)
		list_add(&prsrc->list, &node->item_list);
	return 0;
}

void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		/* skip scm accounted files, they'll be freed by ->ring_sock */
		if (!file || io_file_need_scm(file))
			continue;
		io_file_bitmap_clear(&ctx->file_table, i);
		fput(file);
	}

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		struct sock *sock = ctx->ring_sock->sk;
		struct sk_buff *skb;

		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
			kfree_skb(skb);
	}
#endif
	io_free_file_tables(&ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_files;
	int ret;

	if (!ctx->file_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;
	if (!ret)
		__io_sqe_files_unregister(ctx);
	return ret;
}

/*
 * Ensure the UNIX gc is aware of our file set, so we are certain that
 * the io_uring can be safely unregistered on process exit, even if we have
 * loops in the file referencing. We account only files that can hold other
 * files because otherwise they can't form a loop and so are not interesting
 * for GC.
 */
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sk = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sk->sk_receive_queue;
	struct scm_fp_list *fpl;
	struct sk_buff *skb;

	if (likely(!io_file_need_scm(file)))
		return 0;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
		__skb_unlink(skb, head);
	else
		skb = NULL;
	spin_unlock_irq(&head->lock);

	if (!skb) {
		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
		if (!fpl)
			return -ENOMEM;

		skb = alloc_skb(0, GFP_KERNEL);
		if (!skb) {
			kfree(fpl);
			return -ENOMEM;
		}

		fpl->user = get_uid(current_user());
		fpl->max = SCM_MAX_FD;
		fpl->count = 0;

		UNIXCB(skb).fp = fpl;
		skb->sk = sk;
		skb->scm_io_uring = 1;
		skb->destructor = unix_destruct_scm;
		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
	}

	fpl = UNIXCB(skb).fp;
	fpl->fp[fpl->count++] = get_file(file);
	unix_inflight(fpl->user, file);
	skb_queue_head(head, skb);
	fput(file);
#endif
	return 0;
}

static __cold void io_rsrc_file_scm_put(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
					left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			file = NULL;
			break;
		}

		if (!file)
			break;

		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#endif
}

static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	struct file *file = prsrc->file;

	if (likely(!io_file_need_scm(file)))
		fput(file);
	else
		io_rsrc_file_scm_put(ctx, file);
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto fail;
		}
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = io_scm_file_account(ctx, file);
		if (ret) {
			fput(file);
			goto fail;
		}
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
	return 0;
fail:
	__io_sqe_files_unregister(ctx);
	return ret;
}

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	io_buffer_unmap(ctx, &prsrc->buf);
	prsrc->buf = NULL;
}

void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned int i;

	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_bufs;
	int ret;

	if (!ctx->buf_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
	if (!ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

/*
 * Not super efficient, but this only runs at registration time. And we do
 * cache the last compound head, so generally we'll only do a full search if
 * we don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
{
	unsigned long start, end, nr_pages;
	struct vm_area_struct **vmas = NULL;
	struct page **pages = NULL;
	int i, pret, ret = -ENOMEM;

	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	start = ubuf >> PAGE_SHIFT;
	nr_pages = end - start;

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto done;

	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
			      GFP_KERNEL);
	if (!vmas)
		goto done;

	ret = 0;
	mmap_read_lock(current->mm);
	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			      pages, vmas);
	if (pret == nr_pages) {
		struct file *file = vmas[0]->vm_file;

		/* don't support file backed memory */
		for (i = 0; i < nr_pages; i++) {
			if (vmas[i]->vm_file != file) {
				ret = -EINVAL;
				break;
			}
			if (!file)
				continue;
			if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) {
				ret = -EOPNOTSUPP;
				break;
			}
		}
		*npages = nr_pages;
	} else {
		ret = pret < 0 ? pret : -EFAULT;
	}
	mmap_read_unlock(current->mm);
	if (ret) {
		/*
		 * if we did partial map, or found file backed vmas,
		 * release any pages we did get
		 */
		if (pret > 0)
			unpin_user_pages(pages, pret);
		goto done;
	}
	ret = 0;
done:
	kvfree(vmas);
	if (ret < 0) {
		kvfree(pages);
		pages = ERR_PTR(ret);
	}
	return pages;
}

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct folio *folio = NULL;

	*pimu = ctx->dummy_ubuf;
	if (!iov->iov_base)
		return 0;

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's a huge page, try to coalesce them into a single bvec entry */
	if (nr_pages > 1) {
		folio = page_folio(pages[0]);
		for (i = 1; i < nr_pages; i++) {
			if (page_folio(pages[i]) != folio) {
				folio = NULL;
				break;
			}
		}
		if (folio) {
			/*
			 * The pages are bound to the folio, it doesn't
			 * actually unpin them but drops all but one reference,
			 * which is usually put down by io_buffer_unmap().
			 * Note, needs a better helper.
			 */
			unpin_user_pages(&pages[1], nr_pages - 1);
			nr_pages = 1;
		}
	}

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->ubuf_end = imu->ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;
	*pimu = imu;
	ret = 0;

	if (folio) {
		bvec_set_page(&imu->bvec[0], pages[0], size, off);
		goto done;
	}
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret)
		kvfree(imu);
	kvfree(pages);
	return ret;
}

static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;
}
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	int i, ret;
	struct iovec iov;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
	if (ret)
		return ret;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		if (arg) {
			ret = io_copy_iov(ctx, &iov, arg, i);
			if (ret)
				break;
			ret = io_buffer_validate(&iov);
			if (ret)
				break;
		} else {
			memset(&iov, 0, sizeof(iov));
		}

		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
	}

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

int io_import_fixed(int ddir, struct iov_iter *iter,
		    struct io_mapped_ubuf *imu,
		    u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * Might not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or is the whole
		 * first bvec), just use iov_iter_advance(). This makes it
		 * easier since we can just skip the first segment, which may
		 * not be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			/*
			 * Note, huge page buffers consist of one large
			 * bvec entry and should always go this way. The other
			 * branch doesn't expect non PAGE_SIZE'd chunks.
			 */
			iter->bvec = bvec;
			iter->nr_segs = bvec->bv_len;
			iter->count -= offset;
			iter->iov_offset = offset;
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}