// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);

#define IO_RSRC_REF_BATCH	100

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	if (ctx->rsrc_cached_refs) {
		io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
		ctx->rsrc_cached_refs = 0;
	}
}

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}

static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
		       void __user *arg, unsigned index)
{
	struct iovec __user *src;

#ifdef CONFIG_COMPAT
	if (ctx->compat) {
		struct compat_iovec __user *ciovs;
		struct compat_iovec ciov;

		ciovs = (struct compat_iovec __user *) arg;
		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
			return -EFAULT;

		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
		dst->iov_len = ciov.iov_len;
		return 0;
	}
#endif
	src = (struct iovec __user *) arg;
	if (copy_from_user(dst, &src[index], sizeof(*dst)))
		return -EFAULT;
	return 0;
}

static int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
	struct io_mapped_ubuf *imu = *slot;
	unsigned int i;

	if (imu != ctx->dummy_ubuf) {
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
	*slot = NULL;
}

void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
	percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
}

static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
{
	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
	struct io_ring_ctx *ctx = rsrc_data->ctx;
	struct io_rsrc_put *prsrc, *tmp;

	list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
		list_del(&prsrc->list);

		if (prsrc->tag) {
			if (ctx->flags & IORING_SETUP_IOPOLL) {
				mutex_lock(&ctx->uring_lock);
				io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true);
				mutex_unlock(&ctx->uring_lock);
			} else {
				io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true);
			}
		}

		rsrc_data->do_put(ctx, prsrc);
		kfree(prsrc);
	}

	io_rsrc_node_destroy(ref_node);
	if (atomic_dec_and_test(&rsrc_data->refs))
		complete(&rsrc_data->done);
}

void io_rsrc_put_work(struct work_struct *work)
{
	struct io_ring_ctx *ctx;
	struct llist_node *node;

	ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
	node = llist_del_all(&ctx->rsrc_put_llist);

	while (node) {
		struct io_rsrc_node *ref_node;
		struct llist_node *next = node->next;

		ref_node = llist_entry(node, struct io_rsrc_node, llist);
		__io_rsrc_put_work(ref_node);
		node = next;
	}
}

void io_wait_rsrc_data(struct io_rsrc_data *data)
{
	if (data && !atomic_dec_and_test(&data->refs))
		wait_for_completion(&data->done);
}

void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
{
	percpu_ref_exit(&ref_node->refs);
	kfree(ref_node);
}

static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
	struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
	unsigned long flags;
	bool first_add = false;
	unsigned long delay = HZ;

	spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
	node->done = true;

	/* if we are mid-quiesce then do not delay */
	if (node->rsrc_data->quiesce)
		delay = 0;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (!node->done)
			break;
		list_del(&node->node);
		first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
	}
	spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);

	if (first_add)
		mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
}

static struct io_rsrc_node *io_rsrc_node_alloc(void)
{
	struct io_rsrc_node *ref_node;

	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
	if (!ref_node)
		return NULL;

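	/*
	 * io_rsrc_node_ref_zero() above is the release callback: once the
	 * node's refs hit zero it batches done nodes onto ->rsrc_put_llist
	 * and schedules rsrc_put_work to free the queued resources.
	 */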
	if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
			    0, GFP_KERNEL)) {
		kfree(ref_node);
		return NULL;
	}
	INIT_LIST_HEAD(&ref_node->node);
	INIT_LIST_HEAD(&ref_node->rsrc_list);
	ref_node->done = false;
	return ref_node;
}

void io_rsrc_node_switch(struct io_ring_ctx *ctx,
			 struct io_rsrc_data *data_to_kill)
	__must_hold(&ctx->uring_lock)
{
	WARN_ON_ONCE(!ctx->rsrc_backup_node);
	WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);

	io_rsrc_refs_drop(ctx);

	if (data_to_kill) {
		struct io_rsrc_node *rsrc_node = ctx->rsrc_node;

		rsrc_node->rsrc_data = data_to_kill;
		spin_lock_irq(&ctx->rsrc_ref_lock);
		list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
		spin_unlock_irq(&ctx->rsrc_ref_lock);

		atomic_inc(&data_to_kill->refs);
		percpu_ref_kill(&rsrc_node->refs);
		ctx->rsrc_node = NULL;
	}

	if (!ctx->rsrc_node) {
		ctx->rsrc_node = ctx->rsrc_backup_node;
		ctx->rsrc_backup_node = NULL;
	}
}

int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
{
	if (ctx->rsrc_backup_node)
		return 0;
	ctx->rsrc_backup_node = io_rsrc_node_alloc();
	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
}

__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	int ret;

	/* As we may drop ->uring_lock, other task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;

	data->quiesce = true;
	do {
		ret = io_rsrc_node_switch_start(ctx);
		if (ret)
			break;
		io_rsrc_node_switch(ctx, data);

		/* kill initial ref, already quiesced if zero */
		if (atomic_dec_and_test(&data->refs))
			break;
		mutex_unlock(&ctx->uring_lock);
		flush_delayed_work(&ctx->rsrc_put_work);
		ret = wait_for_completion_interruptible(&data->done);
		if (!ret) {
			mutex_lock(&ctx->uring_lock);
			if (atomic_read(&data->refs) > 0) {
				/*
				 * it has been revived by another thread while
				 * we were unlocked
				 */
				mutex_unlock(&ctx->uring_lock);
			} else {
				break;
			}
		}

		atomic_inc(&data->refs);
		/* wait for all works potentially completing data->done */
		flush_delayed_work(&ctx->rsrc_put_work);
		reinit_completion(&data->done);

		ret = io_run_task_work_sig(ctx);
		mutex_lock(&ctx->uring_lock);
	} while (ret >= 0);
	data->quiesce = false;

	return ret;
}

static void io_free_page_table(void **table, size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < nr_tables; i++)
		kfree(table[i]);
	kfree(table);
}

static void io_rsrc_data_free(struct io_rsrc_data *data)
{
	size_t size = data->nr * sizeof(data->tags[0][0]);

	if (data->tags)
		io_free_page_table((void **)data->tags, size);
	kfree(data);
}

static __cold void **io_alloc_page_table(size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t init_size = size;
	void **table;

	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
	if (!table)
		return NULL;

	for (i = 0; i < nr_tables; i++) {
		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);

		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
		if (!table[i]) {
			io_free_page_table(table, init_size);
			return NULL;
		}
		size -= this_size;
	}
	return table;
}

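/*
 * The tag array allocated through io_alloc_page_table() is a two-level
 * table of page-sized chunks; io_get_tag_slot() (rsrc.h, not shown here)
 * resolves index i to roughly tags[i / tags-per-page][i % tags-per-page],
 * so a registration of nr resources needs nr * sizeof(u64) bytes of tag
 * space split across DIV_ROUND_UP(..., PAGE_SIZE) chunks.
 */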
__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
				     rsrc_put_fn *do_put, u64 __user *utags,
				     unsigned nr, struct io_rsrc_data **pdata)
{
	struct io_rsrc_data *data;
	int ret = -ENOMEM;
	unsigned i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
	if (!data->tags) {
		kfree(data);
		return -ENOMEM;
	}

	data->nr = nr;
	data->ctx = ctx;
	data->do_put = do_put;
	if (utags) {
		ret = -EFAULT;
		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
					   sizeof(*tag_slot)))
				goto fail;
		}
	}

	atomic_set(&data->refs, 1);
	init_completion(&data->done);
	*pdata = data;
	return 0;
fail:
	io_rsrc_data_free(data);
	return ret;
}

static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	struct file *file;
	int fd, i, err = 0;
	unsigned int done;
	bool needs_switch = false;

	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
			err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
			if (err)
				break;
			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);
			needs_switch = true;
		}
		if (fd != -1) {
			file = fget(fd);
			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered. If
			 * UNIX isn't enabled, then this causes a reference
			 * cycle and this instance can never get freed. If UNIX
			 * is enabled we'll handle it just fine, but there's
			 * still no point in allowing a ring fd as it doesn't
			 * support regular read/write anyway.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			err = io_scm_file_account(ctx, file);
			if (err) {
				fput(file);
				break;
			}
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, data);
	return done ? done : err;
}
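/*
 * Buffer updates below mirror the file update path above: a replaced entry
 * is queued on the current rsrc node via io_queue_rsrc_removal() and is
 * only released once that node's references drain after
 * io_rsrc_node_switch().
 */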
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
	struct page *last_hpage = NULL;
	bool needs_switch = false;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		int offset = up->offset + done;
		u64 tag = 0;

		err = io_copy_iov(ctx, &iov, iovs, done);
		if (err)
			break;
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(&iov);
		if (err)
			break;
		if (!iov.iov_base && tag) {
			err = -EINVAL;
			break;
		}
		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
		if (err)
			break;

		i = array_index_nospec(offset, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, i,
						    ctx->rsrc_node, ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
				break;
			}
			ctx->user_bufs[i] = ctx->dummy_ubuf;
			needs_switch = true;
		}

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, offset) = tag;
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, ctx->buf_data);
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;
	int err;

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;
	err = io_rsrc_node_switch_start(ctx);
	if (err)
		return err;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_data)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
			  struct io_rsrc_node *node, void *rsrc)
{
	u64 *tag_slot = io_get_tag_slot(data, idx);
	struct io_rsrc_put *prsrc;

	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
	if (!prsrc)
		return -ENOMEM;

	prsrc->tag = *tag_slot;
	*tag_slot = 0;
	prsrc->rsrc = rsrc;
	list_add(&prsrc->list, &node->rsrc_list);
	return 0;
}

void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
#if !defined(IO_URING_SCM_ALL)
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		if (!file)
			continue;
		if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
			continue;
		io_file_bitmap_clear(&ctx->file_table, i);
		fput(file);
	}
#endif

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		struct sock *sock = ctx->ring_sock->sk;
		struct sk_buff *skb;

		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
			kfree_skb(skb);
	}
#endif
	io_free_file_tables(&ctx->file_table);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_files;
	int ret;

	if (!ctx->file_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;
	if (!ret)
		__io_sqe_files_unregister(ctx);
	return ret;
}

/*
 * Ensure the UNIX gc is aware of our file set, so we are certain that
 * the io_uring can be safely unregistered on process exit, even if we have
 * loops in the file referencing. We account only files that can hold other
 * files because otherwise they can't form a loop and so are not interesting
 * for GC.
 */
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sk = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sk->sk_receive_queue;
	struct scm_fp_list *fpl;
	struct sk_buff *skb;

	if (likely(!io_file_need_scm(file)))
		return 0;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
		__skb_unlink(skb, head);
	else
		skb = NULL;
	spin_unlock_irq(&head->lock);

	if (!skb) {
		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
		if (!fpl)
			return -ENOMEM;

		skb = alloc_skb(0, GFP_KERNEL);
		if (!skb) {
			kfree(fpl);
			return -ENOMEM;
		}

		fpl->user = get_uid(current_user());
		fpl->max = SCM_MAX_FD;
		fpl->count = 0;

		UNIXCB(skb).fp = fpl;
		skb->sk = sk;
		skb->scm_io_uring = 1;
		skb->destructor = unix_destruct_scm;
		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
	}

	fpl = UNIXCB(skb).fp;
	fpl->fp[fpl->count++] = get_file(file);
	unix_inflight(fpl->user, file);
	skb_queue_head(head, skb);
	fput(file);
#endif
	return 0;
}

static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	if (!io_file_need_scm(file)) {
		fput(file);
		return;
	}

	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
					left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			file = NULL;
			break;
		}

		if (!file)
			break;

		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#else
	fput(file);
#endif
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto fail;
		}
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = io_scm_file_account(ctx, file);
		if (ret) {
			fput(file);
			goto fail;
		}
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
	io_rsrc_node_switch(ctx, NULL);
	return 0;
fail:
	__io_sqe_files_unregister(ctx);
	return ret;
}

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	io_buffer_unmap(ctx, &prsrc->buf);
	prsrc->buf = NULL;
}

void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned int i;

	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_bufs;
	int ret;

	if (!ctx->buf_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
	if (!ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

/*
 * Not super efficient, but this is just a registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we don't
 * match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
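/*
 * Illustrative example (not from the original source): pinning a buffer that
 * spans one 2MB THP yields 512 subpage pointers, but headpage_already_acct()
 * plus the *last_hpage cache below ensure the huge page is charged against
 * RLIMIT_MEMLOCK only once, as page_size(hpage) >> PAGE_SHIFT pages.
 */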
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
{
	unsigned long start, end, nr_pages;
	struct vm_area_struct **vmas = NULL;
	struct page **pages = NULL;
	int i, pret, ret = -ENOMEM;

	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	start = ubuf >> PAGE_SHIFT;
	nr_pages = end - start;

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto done;

	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
			      GFP_KERNEL);
	if (!vmas)
		goto done;

	ret = 0;
	mmap_read_lock(current->mm);
	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			      pages, vmas);
	if (pret == nr_pages) {
		/* don't support file backed memory */
		for (i = 0; i < nr_pages; i++) {
			struct vm_area_struct *vma = vmas[i];

			if (vma_is_shmem(vma))
				continue;
			if (vma->vm_file &&
			    !is_file_hugepages(vma->vm_file)) {
				ret = -EOPNOTSUPP;
				break;
			}
		}
		*npages = nr_pages;
	} else {
		ret = pret < 0 ? pret : -EFAULT;
	}
	mmap_read_unlock(current->mm);
	if (ret) {
		/*
		 * if we did partial map, or found file backed vmas,
		 * release any pages we did get
		 */
		if (pret > 0)
			unpin_user_pages(pages, pret);
		goto done;
	}
	ret = 0;
done:
	kvfree(vmas);
	if (ret < 0) {
		kvfree(pages);
		pages = ERR_PTR(ret);
	}
	return pages;
}

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;

	*pimu = ctx->dummy_ubuf;
	if (!iov->iov_base)
		return 0;

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
	size = iov->iov_len;
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		imu->bvec[i].bv_page = pages[i];
		imu->bvec[i].bv_len = vec_len;
		imu->bvec[i].bv_offset = off;
		off = 0;
		size -= vec_len;
	}
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->ubuf_end = imu->ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;
	*pimu = imu;
	ret = 0;
done:
	if (ret)
		kvfree(imu);
	kvfree(pages);
	return ret;
}

static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	int i, ret;
	struct iovec iov;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
	if (ret)
		return ret;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		if (arg) {
			ret = io_copy_iov(ctx, &iov, arg, i);
			if (ret)
				break;
			ret = io_buffer_validate(&iov);
			if (ret)
				break;
		} else {
			memset(&iov, 0, sizeof(iov));
		}

		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
	}

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	else
		io_rsrc_node_switch(ctx, NULL);
	return ret;
}

int io_import_fixed(int ddir, struct iov_iter *iter,
		    struct io_mapped_ubuf *imu,
		    u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * May not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or is the whole
		 * first bvec), just use iov_iter_advance(). This makes it
		 * easier since we can just skip the first segment, which may
		 * not be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			iov_iter_advance(iter, offset);
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}
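
/*
 * Worked example for the io_import_fixed() fast path above (illustrative,
 * assuming a page-aligned registration with 4K pages, so every bvec except
 * possibly the last is PAGE_SIZE): with buf_addr = imu->ubuf + 3 * 4096 + 100
 * and len = 200, offset starts at 12388. After subtracting the first bv_len
 * (4096), offset = 8292, seg_skip = 1 + (8292 >> 12) = 3, iov_offset = 100,
 * and iter->count ends up as exactly len (200), i.e. the iterator points 100
 * bytes into bvec[3].
 */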