// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}

static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
		       void __user *arg, unsigned index)
{
	struct iovec __user *src;

#ifdef CONFIG_COMPAT
	if (ctx->compat) {
		struct compat_iovec __user *ciovs;
		struct compat_iovec ciov;

		ciovs = (struct compat_iovec __user *) arg;
		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
			return -EFAULT;

		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
		dst->iov_len = ciov.iov_len;
		return 0;
	}
#endif
	src = (struct iovec __user *) arg;
	if (copy_from_user(dst, &src[index], sizeof(*dst)))
		return -EFAULT;
	return 0;
}

static int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}
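/*
 * Worked example of the validation above (illustrative, not kernel code):
 * with PAGE_SIZE == 4096, an iovec of { .iov_base = (void *)0x1000,
 * .iov_len = 6000 } is accepted: the length is non-zero and below SZ_1G,
 * and 0x1000 + 6000 + 4095 does not wrap an unsigned long. The extra
 * PAGE_SIZE - 1 mirrors the page rounding applied when the buffer is later
 * pinned, so the overflow check covers the whole pinned range rather than
 * just the bytes the user asked for.
 */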
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
	struct io_mapped_ubuf *imu = *slot;
	unsigned int i;

	if (imu != ctx->dummy_ubuf) {
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
	*slot = NULL;
}

static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data,
				 struct io_rsrc_put *prsrc)
{
	struct io_ring_ctx *ctx = rsrc_data->ctx;

	if (prsrc->tag)
		io_post_aux_cqe(ctx, prsrc->tag, 0, 0);
	rsrc_data->do_put(ctx, prsrc);
}

static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
{
	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
	struct io_rsrc_put *prsrc, *tmp;

	if (ref_node->inline_items)
		io_rsrc_put_work_one(rsrc_data, &ref_node->item);

	list_for_each_entry_safe(prsrc, tmp, &ref_node->item_list, list) {
		list_del(&prsrc->list);
		io_rsrc_put_work_one(rsrc_data, prsrc);
		kfree(prsrc);
	}

	io_rsrc_node_destroy(rsrc_data->ctx, ref_node);
}

void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache))
		kfree(node);
}

void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
	__must_hold(&node->rsrc_data->ctx->uring_lock)
{
	struct io_ring_ctx *ctx = node->rsrc_data->ctx;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (node->refs)
			break;
		list_del(&node->node);
		__io_rsrc_put_work(node);
	}
	if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce))
		wake_up_all(&ctx->rsrc_quiesce_wq);
}

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
{
	struct io_rsrc_node *ref_node;
	struct io_cache_entry *entry;

	entry = io_alloc_cache_get(&ctx->rsrc_node_cache);
	if (entry) {
		ref_node = container_of(entry, struct io_rsrc_node, cache);
	} else {
		ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
		if (!ref_node)
			return NULL;
	}

	ref_node->rsrc_data = NULL;
	ref_node->refs = 1;
	INIT_LIST_HEAD(&ref_node->node);
	INIT_LIST_HEAD(&ref_node->item_list);
	ref_node->inline_items = 0;
	return ref_node;
}

void io_rsrc_node_switch(struct io_ring_ctx *ctx,
			 struct io_rsrc_data *data_to_kill)
	__must_hold(&ctx->uring_lock)
{
	struct io_rsrc_node *node = ctx->rsrc_node;
	struct io_rsrc_node *backup = io_rsrc_node_alloc(ctx);

	if (WARN_ON_ONCE(!backup))
		return;

	node->rsrc_data = data_to_kill;
	list_add_tail(&node->node, &ctx->rsrc_ref_list);
	/* put master ref */
	io_put_rsrc_node(ctx, node);
	ctx->rsrc_node = backup;
}

int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
{
	if (io_alloc_cache_empty(&ctx->rsrc_node_cache)) {
		struct io_rsrc_node *node = kzalloc(sizeof(*node), GFP_KERNEL);

		if (!node)
			return -ENOMEM;
		io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache);
	}
	return 0;
}
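/*
 * Rough lifecycle sketch of the switch machinery above, inferred from this
 * file rather than a documented contract: the active ctx->rsrc_node collects
 * deferred puts via io_queue_rsrc_removal(); io_rsrc_node_switch() parks that
 * node on ctx->rsrc_ref_list, drops the master reference and installs a fresh
 * node; once a parked node's refs hit zero, io_rsrc_node_ref_zero() flushes
 * its puts in list order. io_rsrc_node_switch_start() only ensures a spare
 * node sits in the cache so the switch itself cannot fail.
 */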
__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	DEFINE_WAIT(we);
	int ret;

	/* As we may drop ->uring_lock, another task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	io_rsrc_node_switch(ctx, data);

	if (list_empty(&ctx->rsrc_ref_list))
		return 0;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 1);
		smp_mb();
	}

	ctx->rsrc_quiesce++;
	data->quiesce = true;
	do {
		prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
		mutex_unlock(&ctx->uring_lock);

		ret = io_run_task_work_sig(ctx);
		if (ret < 0) {
			mutex_lock(&ctx->uring_lock);
			if (list_empty(&ctx->rsrc_ref_list))
				ret = 0;
			break;
		}

		schedule();
		__set_current_state(TASK_RUNNING);
		mutex_lock(&ctx->uring_lock);
		ret = 0;
	} while (!list_empty(&ctx->rsrc_ref_list));

	finish_wait(&ctx->rsrc_quiesce_wq, &we);
	data->quiesce = false;
	ctx->rsrc_quiesce--;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 0);
		smp_mb();
	}
	return ret;
}

static void io_free_page_table(void **table, size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < nr_tables; i++)
		kfree(table[i]);
	kfree(table);
}

static void io_rsrc_data_free(struct io_rsrc_data *data)
{
	size_t size = data->nr * sizeof(data->tags[0][0]);

	if (data->tags)
		io_free_page_table((void **)data->tags, size);
	kfree(data);
}

static __cold void **io_alloc_page_table(size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t init_size = size;
	void **table;

	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
	if (!table)
		return NULL;

	for (i = 0; i < nr_tables; i++) {
		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);

		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
		if (!table[i]) {
			io_free_page_table(table, init_size);
			return NULL;
		}
		size -= this_size;
	}
	return table;
}

__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
				     rsrc_put_fn *do_put, u64 __user *utags,
				     unsigned nr, struct io_rsrc_data **pdata)
{
	struct io_rsrc_data *data;
	int ret = 0;
	unsigned i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
	if (!data->tags) {
		kfree(data);
		return -ENOMEM;
	}

	data->nr = nr;
	data->ctx = ctx;
	data->do_put = do_put;
	if (utags) {
		ret = -EFAULT;
		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
					   sizeof(*tag_slot)))
				goto fail;
		}
	}
	*pdata = data;
	return 0;
fail:
	io_rsrc_data_free(data);
	return ret;
}
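/*
 * The tags are stored as a two-level table of PAGE_SIZE chunks allocated by
 * io_alloc_page_table(). For illustration only (the real lookup helper,
 * io_get_tag_slot(), lives in rsrc.h), an index i is expected to resolve
 * roughly as:
 *
 *	tags_per_chunk = PAGE_SIZE / sizeof(u64);	// 512 with 4K pages
 *	tag = data->tags[i / tags_per_chunk][i % tags_per_chunk];
 *
 * which keeps every allocation at or below a page even for very large
 * registered sets.
 */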
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	struct file *file;
	int fd, i, err = 0;
	unsigned int done;
	bool needs_switch = false;

	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
			err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
			if (err)
				break;
			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);
			needs_switch = true;
		}
		if (fd != -1) {
			file = fget(fd);
			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered. If
			 * UNIX isn't enabled, then this causes a reference
			 * cycle and this instance can never get freed. If UNIX
			 * is enabled we'll handle it just fine, but there's
			 * still no point in allowing a ring fd as it doesn't
			 * support regular read/write anyway.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			err = io_scm_file_account(ctx, file);
			if (err) {
				fput(file);
				break;
			}
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, data);
	return done ? done : err;
}

static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
	struct page *last_hpage = NULL;
	bool needs_switch = false;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		int offset = up->offset + done;
		u64 tag = 0;

		err = io_copy_iov(ctx, &iov, iovs, done);
		if (err)
			break;
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(&iov);
		if (err)
			break;
		if (!iov.iov_base && tag) {
			err = -EINVAL;
			break;
		}
		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
		if (err)
			break;

		i = array_index_nospec(offset, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, i,
						    ctx->rsrc_node, ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
				break;
			}
			ctx->user_bufs[i] = ctx->dummy_ubuf;
			needs_switch = true;
		}

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, i) = tag;
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, ctx->buf_data);
	return done ? done : err;
}
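/*
 * Note on the return convention shared by both update helpers above: once at
 * least one slot has been processed they return the number of slots handled,
 * and only fall back to the error code when nothing was done. As an
 * illustrative example, updating three files where the second fd turns out to
 * be bad yields 1, not -EBADF, so userspace can detect the short count and
 * inspect or retry the remaining entries.
 */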
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;
	int err;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;
	err = io_rsrc_node_switch_start(ctx);
	if (err)
		return err;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}
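/*
 * Userspace-side sketch of the sparse registration path handled above
 * (illustrative only; the struct fields come from io_uring_rsrc_register,
 * the raw syscall invocation is an assumption of how a caller might drive
 * it):
 *
 *	struct io_uring_rsrc_register rr = {
 *		.nr	= 64,
 *		.flags	= IORING_RSRC_REGISTER_SPARSE,
 *	};
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_FILES2,
 *		&rr, sizeof(rr));
 *
 * registers a 64-slot file table with no initial files or tags; slots are
 * then populated later through the update paths.
 */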
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_data)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}
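/*
 * Illustrative note on the IORING_FILE_INDEX_ALLOC path above: when the SQE's
 * offset is IORING_FILE_INDEX_ALLOC, the kernel picks free slots itself via
 * io_fixed_fd_install() and writes each chosen index back into the user fd
 * array, so an input of { fd1, fd2 } might come back as { 3, 7 } on success.
 * A failed write-back closes the just-installed slot again so it isn't
 * leaked.
 */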
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
			  struct io_rsrc_node *node, void *rsrc)
{
	u64 *tag_slot = io_get_tag_slot(data, idx);
	struct io_rsrc_put *prsrc;
	bool inline_item = true;

	if (!node->inline_items) {
		prsrc = &node->item;
		node->inline_items++;
	} else {
		prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
		if (!prsrc)
			return -ENOMEM;
		inline_item = false;
	}

	prsrc->tag = *tag_slot;
	*tag_slot = 0;
	prsrc->rsrc = rsrc;
	if (!inline_item)
		list_add(&prsrc->list, &node->item_list);
	return 0;
}

void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		/* skip scm accounted files, they'll be freed by ->ring_sock */
		if (!file || io_file_need_scm(file))
			continue;
		io_file_bitmap_clear(&ctx->file_table, i);
		fput(file);
	}

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		struct sock *sock = ctx->ring_sock->sk;
		struct sk_buff *skb;

		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
			kfree_skb(skb);
	}
#endif
	io_free_file_tables(&ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_files;
	int ret;

	if (!ctx->file_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests from using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;
	if (!ret)
		__io_sqe_files_unregister(ctx);
	return ret;
}

/*
 * Ensure the UNIX gc is aware of our file set, so we are certain that
 * the io_uring can be safely unregistered on process exit, even if we have
 * loops in the file referencing. We account only files that can hold other
 * files because otherwise they can't form a loop and so are not interesting
 * for GC.
 */
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sk = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sk->sk_receive_queue;
	struct scm_fp_list *fpl;
	struct sk_buff *skb;

	if (likely(!io_file_need_scm(file)))
		return 0;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
		__skb_unlink(skb, head);
	else
		skb = NULL;
	spin_unlock_irq(&head->lock);

	if (!skb) {
		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
		if (!fpl)
			return -ENOMEM;

		skb = alloc_skb(0, GFP_KERNEL);
		if (!skb) {
			kfree(fpl);
			return -ENOMEM;
		}

		fpl->user = get_uid(current_user());
		fpl->max = SCM_MAX_FD;
		fpl->count = 0;

		UNIXCB(skb).fp = fpl;
		skb->sk = sk;
		skb->scm_io_uring = 1;
		skb->destructor = unix_destruct_scm;
		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
	}

	fpl = UNIXCB(skb).fp;
	fpl->fp[fpl->count++] = get_file(file);
	unix_inflight(fpl->user, file);
	skb_queue_head(head, skb);
	fput(file);
#endif
	return 0;
}
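/*
 * Ownership note for the accounting above (describing this code, not a wider
 * guarantee): get_file() + unix_inflight() transfer a reference for the file
 * to the SCM_RIGHTS skb queued on the ring's socket, which is why the
 * caller's reference is dropped with fput() at the end. Such files are
 * released later, either by io_rsrc_file_scm_put() below or when the ring
 * socket's receive queue is drained in __io_sqe_files_unregister().
 */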
static __cold void io_rsrc_file_scm_put(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
					left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			file = NULL;
			break;
		}

		if (!file)
			break;

		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#endif
}

static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	struct file *file = prsrc->file;

	if (likely(!io_file_need_scm(file)))
		fput(file);
	else
		io_rsrc_file_scm_put(ctx, file);
}
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto fail;
		}
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = io_scm_file_account(ctx, file);
		if (ret) {
			fput(file);
			goto fail;
		}
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
	return 0;
fail:
	__io_sqe_files_unregister(ctx);
	return ret;
}

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	io_buffer_unmap(ctx, &prsrc->buf);
	prsrc->buf = NULL;
}

void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned int i;

	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_bufs;
	int ret;

	if (!ctx->buf_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests from using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
	if (!ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}
/*
 * Not super efficient, but this is just registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we
 * don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}
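/*
 * Worked example of the accounting above (illustrative numbers): pinning a
 * buffer that spans one 2MB THP with 4K pages yields 512 struct page
 * pointers sharing a single compound head. The first one charges the full
 * page_size(hpage) >> PAGE_SHIFT == 512 pages; the remaining 511 match
 * *last_hpage (or headpage_already_acct()) and are skipped, so the huge page
 * is charged against RLIMIT_MEMLOCK exactly once even when several
 * registered buffers live inside it.
 */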
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
{
	unsigned long start, end, nr_pages;
	struct vm_area_struct **vmas = NULL;
	struct page **pages = NULL;
	int i, pret, ret = -ENOMEM;

	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	start = ubuf >> PAGE_SHIFT;
	nr_pages = end - start;

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto done;

	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
			      GFP_KERNEL);
	if (!vmas)
		goto done;

	ret = 0;
	mmap_read_lock(current->mm);
	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			      pages, vmas);
	if (pret == nr_pages) {
		struct file *file = vmas[0]->vm_file;

		/* don't support file backed memory */
		for (i = 0; i < nr_pages; i++) {
			if (vmas[i]->vm_file != file) {
				ret = -EINVAL;
				break;
			}
			if (!file)
				continue;
			if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) {
				ret = -EOPNOTSUPP;
				break;
			}
		}
		*npages = nr_pages;
	} else {
		ret = pret < 0 ? pret : -EFAULT;
	}
	mmap_read_unlock(current->mm);
	if (ret) {
		/*
		 * if we did partial map, or found file backed vmas,
		 * release any pages we did get
		 */
		if (pret > 0)
			unpin_user_pages(pages, pret);
		goto done;
	}
	ret = 0;
done:
	kvfree(vmas);
	if (ret < 0) {
		kvfree(pages);
		pages = ERR_PTR(ret);
	}
	return pages;
}

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct folio *folio = NULL;

	*pimu = ctx->dummy_ubuf;
	if (!iov->iov_base)
		return 0;

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's a huge page, try to coalesce them into a single bvec entry */
	if (nr_pages > 1) {
		folio = page_folio(pages[0]);
		for (i = 1; i < nr_pages; i++) {
			if (page_folio(pages[i]) != folio) {
				folio = NULL;
				break;
			}
		}
		if (folio) {
			/*
			 * The pages are bound to the folio; this doesn't
			 * actually unpin them but drops all but one reference,
			 * which is usually put down by io_buffer_unmap().
			 * Note, needs a better helper.
			 */
			unpin_user_pages(&pages[1], nr_pages - 1);
			nr_pages = 1;
		}
	}

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->ubuf_end = imu->ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;
	*pimu = imu;
	ret = 0;

	if (folio) {
		bvec_set_page(&imu->bvec[0], pages[0], size, off);
		goto done;
	}
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret)
		kvfree(imu);
	kvfree(pages);
	return ret;
}
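/*
 * Illustrative effect of the folio coalescing above: a 2MB buffer backed by
 * a single huge page ends up as one bvec entry covering the full length,
 * roughly { .bv_page = pages[0], .bv_len = iov_len, .bv_offset = off },
 * instead of 512 PAGE_SIZE entries. io_import_fixed() below expects this
 * single-entry layout for huge page buffers when it adjusts the iterator.
 */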
static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	int i, ret;
	struct iovec iov;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
	if (ret)
		return ret;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		if (arg) {
			ret = io_copy_iov(ctx, &iov, arg, i);
			if (ret)
				break;
			ret = io_buffer_validate(&iov);
			if (ret)
				break;
		} else {
			memset(&iov, 0, sizeof(iov));
		}

		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
	}

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

int io_import_fixed(int ddir, struct iov_iter *iter,
		    struct io_mapped_ubuf *imu,
		    u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * Might not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec), just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			/*
			 * Note, huge page buffers consist of one large
			 * bvec entry and should always go this way. The other
			 * branch doesn't expect non PAGE_SIZE'd chunks.
			 */
			iter->bvec = bvec;
			iter->nr_segs = bvec->bv_len;
			iter->count -= offset;
			iter->iov_offset = offset;
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}
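/*
 * Worked example of the segment math above (illustrative, assumes 4K pages
 * and a buffer registered as individual PAGE_SIZE bvecs): for a fixed buffer
 * whose first bvec holds 2048 bytes, importing at offset == 5000 takes the
 * else branch: offset becomes 5000 - 2048 = 2952, seg_skip = 1 + (2952 >> 12)
 * = 1, so the iterator starts at bvec[1] with iov_offset = 2952 & ~PAGE_MASK
 * = 2952, which is exactly where byte 5000 of the buffer lives. A huge page
 * buffer instead has a single large bvec, so any in-range offset satisfies
 * offset <= bvec->bv_len and stays in the first branch.
 */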