// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);

#define IO_RSRC_REF_BATCH	100

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	if (ctx->rsrc_cached_refs) {
		io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
		ctx->rsrc_cached_refs = 0;
	}
}

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}
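/*
 * Note on the accounting helpers above: pinned memory is charged in two
 * places. __io_account_mem() enforces RLIMIT_MEMLOCK against
 * user->locked_vm with a lockless cmpxchg loop (atomic_long_try_cmpxchg()
 * reloads cur_pages on failure, so concurrent chargers simply retry),
 * while ->mm_account->pinned_vm is a plain statistic with no limit of its
 * own. For illustration: with a 64 MiB RLIMIT_MEMLOCK and 4K pages,
 * page_limit is 16384, so charging 256 more pages only succeeds while
 * locked_vm is at or below 16128.
 */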
static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
		       void __user *arg, unsigned index)
{
	struct iovec __user *src;

#ifdef CONFIG_COMPAT
	if (ctx->compat) {
		struct compat_iovec __user *ciovs;
		struct compat_iovec ciov;

		ciovs = (struct compat_iovec __user *) arg;
		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
			return -EFAULT;

		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
		dst->iov_len = ciov.iov_len;
		return 0;
	}
#endif
	src = (struct iovec __user *) arg;
	if (copy_from_user(dst, &src[index], sizeof(*dst)))
		return -EFAULT;
	return 0;
}

static int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
	struct io_mapped_ubuf *imu = *slot;
	unsigned int i;

	if (imu != ctx->dummy_ubuf) {
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
	*slot = NULL;
}

void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
	percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
}

static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
{
	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
	struct io_ring_ctx *ctx = rsrc_data->ctx;
	struct io_rsrc_put *prsrc, *tmp;

	list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
		list_del(&prsrc->list);

		if (prsrc->tag) {
			if (ctx->flags & IORING_SETUP_IOPOLL) {
				mutex_lock(&ctx->uring_lock);
				io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true);
				mutex_unlock(&ctx->uring_lock);
			} else {
				io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true);
			}
		}

		rsrc_data->do_put(ctx, prsrc);
		kfree(prsrc);
	}

	io_rsrc_node_destroy(ref_node);
	if (atomic_dec_and_test(&rsrc_data->refs))
		complete(&rsrc_data->done);
}

void io_rsrc_put_work(struct work_struct *work)
{
	struct io_ring_ctx *ctx;
	struct llist_node *node;

	ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
	node = llist_del_all(&ctx->rsrc_put_llist);

	while (node) {
		struct io_rsrc_node *ref_node;
		struct llist_node *next = node->next;

		ref_node = llist_entry(node, struct io_rsrc_node, llist);
		__io_rsrc_put_work(ref_node);
		node = next;
	}
}

void io_wait_rsrc_data(struct io_rsrc_data *data)
{
	if (data && !atomic_dec_and_test(&data->refs))
		wait_for_completion(&data->done);
}

void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
{
	percpu_ref_exit(&ref_node->refs);
	kfree(ref_node);
}

static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
	struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
	unsigned long flags;
	bool first_add = false;
	unsigned long delay = HZ;

	spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
	node->done = true;

	/* if we are mid-quiesce then do not delay */
	if (node->rsrc_data->quiesce)
		delay = 0;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (!node->done)
			break;
		list_del(&node->node);
		first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
	}
	spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);

	if (first_add)
		mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
}
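/*
 * Lifecycle of a rsrc node, in brief: requests take cached references to
 * ctx->rsrc_node (refilled IO_RSRC_REF_BATCH at a time under ->uring_lock).
 * When a registered table changes, the current node is retired via
 * io_rsrc_node_switch(): queued removals stay on its rsrc_list and its
 * percpu ref is killed. Once that ref hits zero, io_rsrc_node_ref_zero()
 * above hands completed nodes (in list order) to rsrc_put_work, which
 * posts any CQE tags and drops the underlying files/buffers.
 */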
static struct io_rsrc_node *io_rsrc_node_alloc(void)
{
	struct io_rsrc_node *ref_node;

	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
	if (!ref_node)
		return NULL;

	if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
			    0, GFP_KERNEL)) {
		kfree(ref_node);
		return NULL;
	}
	INIT_LIST_HEAD(&ref_node->node);
	INIT_LIST_HEAD(&ref_node->rsrc_list);
	ref_node->done = false;
	return ref_node;
}

void io_rsrc_node_switch(struct io_ring_ctx *ctx,
			 struct io_rsrc_data *data_to_kill)
	__must_hold(&ctx->uring_lock)
{
	WARN_ON_ONCE(!ctx->rsrc_backup_node);
	WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);

	io_rsrc_refs_drop(ctx);

	if (data_to_kill) {
		struct io_rsrc_node *rsrc_node = ctx->rsrc_node;

		rsrc_node->rsrc_data = data_to_kill;
		spin_lock_irq(&ctx->rsrc_ref_lock);
		list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
		spin_unlock_irq(&ctx->rsrc_ref_lock);

		atomic_inc(&data_to_kill->refs);
		percpu_ref_kill(&rsrc_node->refs);
		ctx->rsrc_node = NULL;
	}

	if (!ctx->rsrc_node) {
		ctx->rsrc_node = ctx->rsrc_backup_node;
		ctx->rsrc_backup_node = NULL;
	}
}

int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
{
	if (ctx->rsrc_backup_node)
		return 0;
	ctx->rsrc_backup_node = io_rsrc_node_alloc();
	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
}

__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	int ret;

	/* As we may drop ->uring_lock, another task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;

	data->quiesce = true;
	do {
		ret = io_rsrc_node_switch_start(ctx);
		if (ret)
			break;
		io_rsrc_node_switch(ctx, data);

		/* kill initial ref, already quiesced if zero */
		if (atomic_dec_and_test(&data->refs))
			break;
		mutex_unlock(&ctx->uring_lock);
		flush_delayed_work(&ctx->rsrc_put_work);
		ret = wait_for_completion_interruptible(&data->done);
		if (!ret) {
			mutex_lock(&ctx->uring_lock);
			if (atomic_read(&data->refs) > 0) {
				/*
				 * it has been revived by another thread while
				 * we were unlocked
				 */
				mutex_unlock(&ctx->uring_lock);
			} else {
				break;
			}
		}

		atomic_inc(&data->refs);
		/* wait for all works potentially completing data->done */
		flush_delayed_work(&ctx->rsrc_put_work);
		reinit_completion(&data->done);

		ret = io_run_task_work_sig(ctx);
		mutex_lock(&ctx->uring_lock);
	} while (ret >= 0);
	data->quiesce = false;

	return ret;
}

static void io_free_page_table(void **table, size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < nr_tables; i++)
		kfree(table[i]);
	kfree(table);
}

static void io_rsrc_data_free(struct io_rsrc_data *data)
{
	size_t size = data->nr * sizeof(data->tags[0][0]);

	if (data->tags)
		io_free_page_table((void **)data->tags, size);
	kfree(data);
}

static __cold void **io_alloc_page_table(size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t init_size = size;
	void **table;

	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
	if (!table)
		return NULL;

	for (i = 0; i < nr_tables; i++) {
		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);

		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
		if (!table[i]) {
			io_free_page_table(table, init_size);
			return NULL;
		}
		size -= this_size;
	}
	return table;
}
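/*
 * Tags are stored in a two-level table built by io_alloc_page_table(): one
 * PAGE_SIZE chunk per page's worth of u64 tags, which keeps every
 * allocation order-0 even for large tables. E.g. with 4K pages, a table
 * for 100000 tags (800000 bytes) becomes 196 chunks, and io_get_tag_slot()
 * resolves an index to the right chunk and offset.
 */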
__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
				     rsrc_put_fn *do_put, u64 __user *utags,
				     unsigned nr, struct io_rsrc_data **pdata)
{
	struct io_rsrc_data *data;
	int ret = -ENOMEM;
	unsigned i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
	if (!data->tags) {
		kfree(data);
		return -ENOMEM;
	}

	data->nr = nr;
	data->ctx = ctx;
	data->do_put = do_put;
	if (utags) {
		ret = -EFAULT;
		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
					   sizeof(*tag_slot)))
				goto fail;
		}
	}

	atomic_set(&data->refs, 1);
	init_completion(&data->done);
	*pdata = data;
	return 0;
fail:
	io_rsrc_data_free(data);
	return ret;
}

static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	struct file *file;
	int fd, i, err = 0;
	unsigned int done;
	bool needs_switch = false;

	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
			err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
			if (err)
				break;
			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);
			needs_switch = true;
		}
		if (fd != -1) {
			file = fget(fd);
			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered. If
			 * UNIX isn't enabled, then this causes a reference
			 * cycle and this instance can never get freed. If UNIX
			 * is enabled we'll handle it just fine, but there's
			 * still no point in allowing a ring fd as it doesn't
			 * support regular read/write anyway.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			err = io_scm_file_account(ctx, file);
			if (err) {
				fput(file);
				break;
			}
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, data);
	return done ? done : err;
}
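/*
 * Both update helpers (files above, buffers below) follow the same
 * partial-completion convention: on the first failure the loop stops, and
 * the return value is the number of entries already processed if that is
 * non-zero, otherwise the error. A caller can therefore resume a failed
 * batch at up->offset plus the returned count.
 */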
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
	struct page *last_hpage = NULL;
	bool needs_switch = false;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		int offset = up->offset + done;
		u64 tag = 0;

		err = io_copy_iov(ctx, &iov, iovs, done);
		if (err)
			break;
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(&iov);
		if (err)
			break;
		if (!iov.iov_base && tag) {
			err = -EINVAL;
			break;
		}
		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
		if (err)
			break;

		i = array_index_nospec(offset, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, i,
						    ctx->rsrc_node, ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
				break;
			}
			ctx->user_bufs[i] = ctx->dummy_ubuf;
			needs_switch = true;
		}

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, offset) = tag;
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, ctx->buf_data);
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;
	int err;

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;
	err = io_rsrc_node_switch_start(ctx);
	if (err)
		return err;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}
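/*
 * For reference, the update entry points above are reached from the
 * io_uring_register() syscall: the legacy IORING_REGISTER_FILES_UPDATE
 * opcode carries a struct io_uring_rsrc_update, while the *_UPDATE2
 * opcodes pass a struct io_uring_rsrc_update2 whose ->data points at the
 * new fds/iovecs, ->tags at optional per-slot tags, and ->offset at the
 * first slot to replace.
 */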
__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_data)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
			  struct io_rsrc_node *node, void *rsrc)
{
	u64 *tag_slot = io_get_tag_slot(data, idx);
	struct io_rsrc_put *prsrc;

	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
	if (!prsrc)
		return -ENOMEM;

	prsrc->tag = *tag_slot;
	*tag_slot = 0;
	prsrc->rsrc = rsrc;
	list_add(&prsrc->list, &node->rsrc_list);
	return 0;
}

void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		/* skip scm accounted files, they'll be freed by ->ring_sock */
		if (!file || io_file_need_scm(file))
			continue;
		io_file_bitmap_clear(&ctx->file_table, i);
		fput(file);
	}

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		struct sock *sock = ctx->ring_sock->sk;
		struct sk_buff *skb;

		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
			kfree_skb(skb);
	}
#endif
	io_free_file_tables(&ctx->file_table);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_files;
	int ret;

	if (!ctx->file_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;
	if (!ret)
		__io_sqe_files_unregister(ctx);
	return ret;
}

/*
 * Ensure the UNIX gc is aware of our file set, so we are certain that
 * the io_uring can be safely unregistered on process exit, even if we have
 * loops in the file referencing. We account only files that can hold other
 * files because otherwise they can't form a loop and so are not interesting
 * for GC.
 */
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sk = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sk->sk_receive_queue;
	struct scm_fp_list *fpl;
	struct sk_buff *skb;

	if (likely(!io_file_need_scm(file)))
		return 0;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
		__skb_unlink(skb, head);
	else
		skb = NULL;
	spin_unlock_irq(&head->lock);

	if (!skb) {
		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
		if (!fpl)
			return -ENOMEM;

		skb = alloc_skb(0, GFP_KERNEL);
		if (!skb) {
			kfree(fpl);
			return -ENOMEM;
		}

		fpl->user = get_uid(current_user());
		fpl->max = SCM_MAX_FD;
		fpl->count = 0;

		UNIXCB(skb).fp = fpl;
		skb->sk = sk;
		skb->scm_io_uring = 1;
		skb->destructor = unix_destruct_scm;
		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
	}

	fpl = UNIXCB(skb).fp;
	fpl->fp[fpl->count++] = get_file(file);
	unix_inflight(fpl->user, file);
	skb_queue_head(head, skb);
	fput(file);
#endif
	return 0;
}
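/*
 * Note the reference flow above: get_file() pins the file on behalf of the
 * SCM_RIGHTS skb queued on ->ring_sock, unix_inflight() tells the UNIX gc
 * about it, and the trailing fput() drops the caller's reference. From
 * that point the registered table effectively relies on the skb-held
 * reference, which is why __io_sqe_files_unregister() skips fput() for
 * scm-accounted files and leaves them to the ->ring_sock teardown.
 */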
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	if (!io_file_need_scm(file)) {
		fput(file);
		return;
	}

	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
					left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			file = NULL;
			break;
		}

		if (!file)
			break;

		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#else
	fput(file);
#endif
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto fail;
		}
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = io_scm_file_account(ctx, file);
		if (ret) {
			fput(file);
			goto fail;
		}
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
	io_rsrc_node_switch(ctx, NULL);
	return 0;
fail:
	__io_sqe_files_unregister(ctx);
	return ret;
}

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	io_buffer_unmap(ctx, &prsrc->buf);
	prsrc->buf = NULL;
}

void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned int i;

	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_bufs;
	int ret;

	if (!ctx->buf_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
	if (!ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

/*
 * Not super efficient, but this only happens at registration time. And we
 * do cache the last compound head, so generally we'll only do a full search
 * if we don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}
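/*
 * Accounting example for the helpers above (illustrative numbers): if a
 * buffer is backed by a single 2 MiB huge page on a 4K-page system, every
 * pages[] entry shares one compound head, so the first occurrence charges
 * page_size(hpage) >> PAGE_SHIFT = 512 pages and the rest are skipped via
 * *last_hpage / headpage_already_acct(). Regular 4K pages are simply
 * charged one each.
 */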
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
{
	unsigned long start, end, nr_pages;
	struct vm_area_struct **vmas = NULL;
	struct page **pages = NULL;
	int i, pret, ret = -ENOMEM;

	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	start = ubuf >> PAGE_SHIFT;
	nr_pages = end - start;

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto done;

	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
			      GFP_KERNEL);
	if (!vmas)
		goto done;

	ret = 0;
	mmap_read_lock(current->mm);
	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			      pages, vmas);
	if (pret == nr_pages) {
		/* don't support file backed memory */
		for (i = 0; i < nr_pages; i++) {
			struct vm_area_struct *vma = vmas[i];

			if (vma_is_shmem(vma))
				continue;
			if (vma->vm_file &&
			    !is_file_hugepages(vma->vm_file)) {
				ret = -EOPNOTSUPP;
				break;
			}
		}
		*npages = nr_pages;
	} else {
		ret = pret < 0 ? pret : -EFAULT;
	}
	mmap_read_unlock(current->mm);
	if (ret) {
		/*
		 * if we did partial map, or found file backed vmas,
		 * release any pages we did get
		 */
		if (pret > 0)
			unpin_user_pages(pages, pret);
		goto done;
	}
	ret = 0;
done:
	kvfree(vmas);
	if (ret < 0) {
		kvfree(pages);
		pages = ERR_PTR(ret);
	}
	return pages;
}

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;

	*pimu = ctx->dummy_ubuf;
	if (!iov->iov_base)
		return 0;

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
	size = iov->iov_len;
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		imu->bvec[i].bv_page = pages[i];
		imu->bvec[i].bv_len = vec_len;
		imu->bvec[i].bv_offset = off;
		off = 0;
		size -= vec_len;
	}
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->ubuf_end = imu->ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;
	*pimu = imu;
	ret = 0;
done:
	if (ret)
		kvfree(imu);
	kvfree(pages);
	return ret;
}
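/*
 * Resulting bvec layout, for illustration: a 10000-byte buffer starting
 * 256 bytes into a page (4K pages) pins three pages and is described as
 * bvec[0] = {offset 256, len 3840}, bvec[1] = {offset 0, len 4096} and
 * bvec[2] = {offset 0, len 2064}. Only the first and last entries may be
 * shorter than PAGE_SIZE, which io_import_fixed() below relies on.
 */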
static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	int i, ret;
	struct iovec iov;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
	if (ret)
		return ret;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		if (arg) {
			ret = io_copy_iov(ctx, &iov, arg, i);
			if (ret)
				break;
			ret = io_buffer_validate(&iov);
			if (ret)
				break;
		} else {
			memset(&iov, 0, sizeof(iov));
		}

		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
	}

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	else
		io_rsrc_node_switch(ctx, NULL);
	return ret;
}

int io_import_fixed(int ddir, struct iov_iter *iter,
		    struct io_mapped_ubuf *imu,
		    u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * May not be the start of the buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or is the whole
		 * first bvec), just use iov_iter_advance(). This makes it
		 * easier since we can just skip the first segment, which may
		 * not be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			iov_iter_advance(iter, offset);
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}
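/*
 * Worked example for the fast path above (illustrative: a page-aligned
 * registered buffer of at least three 4K pages): importing at
 * buf_addr = imu->ubuf + 2 * PAGE_SIZE + 100 gives offset = 8292. That
 * exceeds bvec[0].bv_len (4096), so after subtracting it offset = 4196 and
 * seg_skip = 1 + (4196 >> PAGE_SHIFT) = 2: the iterator starts at bvec[2]
 * with iov_offset = 4196 & ~PAGE_MASK = 100, exactly where
 * iov_iter_advance() would have landed, without walking the segments.
 */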