// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring_types.h"
#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"

struct io_rsrc_update {
	struct file *file;
	u64 arg;
	u32 nr_args;
	u32 offset;
};

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);

#define IO_RSRC_REF_BATCH 100

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	if (ctx->rsrc_cached_refs) {
		io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
		ctx->rsrc_cached_refs = 0;
	}
}

static inline void __io_unaccount_mem(struct user_struct *user,
				      unsigned long nr_pages)
{
	atomic_long_sub(nr_pages, &user->locked_vm);
}

static inline int __io_account_mem(struct user_struct *user,
				   unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	do {
		cur_pages = atomic_long_read(&user->locked_vm);
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
				     new_pages) != cur_pages);

	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}

static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
		       void __user *arg, unsigned index)
{
	struct iovec __user *src;

#ifdef CONFIG_COMPAT
	if (ctx->compat) {
		struct compat_iovec __user *ciovs;
		struct compat_iovec ciov;

		ciovs = (struct compat_iovec __user *) arg;
		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
			return -EFAULT;

		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
		dst->iov_len = ciov.iov_len;
		return 0;
	}
#endif
	src = (struct iovec __user *) arg;
	if (copy_from_user(dst, &src[index], sizeof(*dst)))
		return -EFAULT;
	return 0;
}

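/*
 * Sanity check an iovec for buffer registration: a NULL base is only valid
 * for a sparse (zero length) entry, the length is capped at 1GB, and
 * base + length must not wrap the address space.
 */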
static int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
	struct io_mapped_ubuf *imu = *slot;
	unsigned int i;

	if (imu != ctx->dummy_ubuf) {
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
	*slot = NULL;
}

void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
	percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
}

static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
{
	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
	struct io_ring_ctx *ctx = rsrc_data->ctx;
	struct io_rsrc_put *prsrc, *tmp;

	list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
		list_del(&prsrc->list);

		if (prsrc->tag) {
			if (ctx->flags & IORING_SETUP_IOPOLL)
				mutex_lock(&ctx->uring_lock);

			spin_lock(&ctx->completion_lock);
			io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
			io_commit_cqring(ctx);
			spin_unlock(&ctx->completion_lock);
			io_cqring_ev_posted(ctx);

			if (ctx->flags & IORING_SETUP_IOPOLL)
				mutex_unlock(&ctx->uring_lock);
		}

		rsrc_data->do_put(ctx, prsrc);
		kfree(prsrc);
	}

	io_rsrc_node_destroy(ref_node);
	if (atomic_dec_and_test(&rsrc_data->refs))
		complete(&rsrc_data->done);
}

void io_rsrc_put_work(struct work_struct *work)
{
	struct io_ring_ctx *ctx;
	struct llist_node *node;

	ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
	node = llist_del_all(&ctx->rsrc_put_llist);

	while (node) {
		struct io_rsrc_node *ref_node;
		struct llist_node *next = node->next;

		ref_node = llist_entry(node, struct io_rsrc_node, llist);
		__io_rsrc_put_work(ref_node);
		node = next;
	}
}

void io_wait_rsrc_data(struct io_rsrc_data *data)
{
	if (data && !atomic_dec_and_test(&data->refs))
		wait_for_completion(&data->done);
}

void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
{
	percpu_ref_exit(&ref_node->refs);
	kfree(ref_node);
}

static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
	struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
	unsigned long flags;
	bool first_add = false;
	unsigned long delay = HZ;

	spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
	node->done = true;

	/* if we are mid-quiesce then do not delay */
	if (node->rsrc_data->quiesce)
		delay = 0;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (!node->done)
			break;
		list_del(&node->node);
		first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
	}
	spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);

	if (first_add)
		mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
}

static struct io_rsrc_node *io_rsrc_node_alloc(void)
{
	struct io_rsrc_node *ref_node;

	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
	if (!ref_node)
		return NULL;

	if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
			    0, GFP_KERNEL)) {
		kfree(ref_node);
		return NULL;
	}
	INIT_LIST_HEAD(&ref_node->node);
	INIT_LIST_HEAD(&ref_node->rsrc_list);
	ref_node->done = false;
	return ref_node;
}

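/*
 * Swap in the pre-allocated backup node as the current rsrc node. If
 * @data_to_kill is given, the old node is tied to that data, queued on the
 * ctx rsrc_ref_list and its percpu ref is killed, so its queued resources
 * are put once all in-flight requests referencing the node have completed.
 */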
void io_rsrc_node_switch(struct io_ring_ctx *ctx,
			 struct io_rsrc_data *data_to_kill)
	__must_hold(&ctx->uring_lock)
{
	WARN_ON_ONCE(!ctx->rsrc_backup_node);
	WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);

	io_rsrc_refs_drop(ctx);

	if (data_to_kill) {
		struct io_rsrc_node *rsrc_node = ctx->rsrc_node;

		rsrc_node->rsrc_data = data_to_kill;
		spin_lock_irq(&ctx->rsrc_ref_lock);
		list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
		spin_unlock_irq(&ctx->rsrc_ref_lock);

		atomic_inc(&data_to_kill->refs);
		percpu_ref_kill(&rsrc_node->refs);
		ctx->rsrc_node = NULL;
	}

	if (!ctx->rsrc_node) {
		ctx->rsrc_node = ctx->rsrc_backup_node;
		ctx->rsrc_backup_node = NULL;
	}
}

int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
{
	if (ctx->rsrc_backup_node)
		return 0;
	ctx->rsrc_backup_node = io_rsrc_node_alloc();
	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
}

__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	int ret;

	/* As we may drop ->uring_lock, other task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;

	data->quiesce = true;
	do {
		ret = io_rsrc_node_switch_start(ctx);
		if (ret)
			break;
		io_rsrc_node_switch(ctx, data);

		/* kill initial ref, already quiesced if zero */
		if (atomic_dec_and_test(&data->refs))
			break;
		mutex_unlock(&ctx->uring_lock);
		flush_delayed_work(&ctx->rsrc_put_work);
		ret = wait_for_completion_interruptible(&data->done);
		if (!ret) {
			mutex_lock(&ctx->uring_lock);
			if (atomic_read(&data->refs) > 0) {
				/*
				 * it has been revived by another thread while
				 * we were unlocked
				 */
				mutex_unlock(&ctx->uring_lock);
			} else {
				break;
			}
		}

		atomic_inc(&data->refs);
		/* wait for all works potentially completing data->done */
		flush_delayed_work(&ctx->rsrc_put_work);
		reinit_completion(&data->done);

		ret = io_run_task_work_sig();
		mutex_lock(&ctx->uring_lock);
	} while (ret >= 0);
	data->quiesce = false;

	return ret;
}

static void io_free_page_table(void **table, size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < nr_tables; i++)
		kfree(table[i]);
	kfree(table);
}

static void io_rsrc_data_free(struct io_rsrc_data *data)
{
	size_t size = data->nr * sizeof(data->tags[0][0]);

	if (data->tags)
		io_free_page_table((void **)data->tags, size);
	kfree(data);
}

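/*
 * Allocate the tag table as an array of PAGE_SIZE chunks rather than one
 * large contiguous allocation, so big registrations don't require
 * high-order pages.
 */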
static __cold void **io_alloc_page_table(size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t init_size = size;
	void **table;

	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
	if (!table)
		return NULL;

	for (i = 0; i < nr_tables; i++) {
		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);

		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
		if (!table[i]) {
			io_free_page_table(table, init_size);
			return NULL;
		}
		size -= this_size;
	}
	return table;
}

__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
				     rsrc_put_fn *do_put, u64 __user *utags,
				     unsigned nr, struct io_rsrc_data **pdata)
{
	struct io_rsrc_data *data;
	int ret = -ENOMEM;
	unsigned i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
	if (!data->tags) {
		kfree(data);
		return -ENOMEM;
	}

	data->nr = nr;
	data->ctx = ctx;
	data->do_put = do_put;
	if (utags) {
		ret = -EFAULT;
		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
					   sizeof(*tag_slot)))
				goto fail;
		}
	}

	atomic_set(&data->refs, 1);
	init_completion(&data->done);
	*pdata = data;
	return 0;
fail:
	io_rsrc_data_free(data);
	return ret;
}

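/*
 * Apply a files update to the fixed file table: any occupied slot in the
 * range is queued for deferred removal via the current rsrc node, and new
 * descriptors (unless -1 or SKIP) are installed in its place.
 */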
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	struct file *file;
	int fd, i, err = 0;
	unsigned int done;
	bool needs_switch = false;

	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
			err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
			if (err)
				break;
			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);
			needs_switch = true;
		}
		if (fd != -1) {
			file = fget(fd);
			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered. If
			 * UNIX isn't enabled, then this causes a reference
			 * cycle and this instance can never get freed. If UNIX
			 * is enabled we'll handle it just fine, but there's
			 * still no point in allowing a ring fd as it doesn't
			 * support regular read/write anyway.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			err = io_scm_file_account(ctx, file);
			if (err) {
				fput(file);
				break;
			}
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, data);
	return done ? done : err;
}

static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
	struct page *last_hpage = NULL;
	bool needs_switch = false;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		int offset = up->offset + done;
		u64 tag = 0;

		err = io_copy_iov(ctx, &iov, iovs, done);
		if (err)
			break;
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(&iov);
		if (err)
			break;
		if (!iov.iov_base && tag) {
			err = -EINVAL;
			break;
		}
		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
		if (err)
			break;

		i = array_index_nospec(offset, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, i,
						    ctx->rsrc_node, ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
				break;
			}
			ctx->user_bufs[i] = ctx->dummy_ubuf;
			needs_switch = true;
		}

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, offset) = tag;
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, ctx->buf_data);
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;
	int err;

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;
	err = io_rsrc_node_switch_start(ctx);
	if (err)
		return err;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

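/*
 * Prep handler for the files update request: pull the offset, count and
 * the user pointer to the fd array out of the SQE.
 */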
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_data)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

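/*
 * Queue a resource (file or buffer) for deferred put on the given rsrc
 * node. The slot's tag, if any, is consumed here and will be posted as a
 * CQE once the node's references drop to zero.
 */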
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
			  struct io_rsrc_node *node, void *rsrc)
{
	u64 *tag_slot = io_get_tag_slot(data, idx);
	struct io_rsrc_put *prsrc;

	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
	if (!prsrc)
		return -ENOMEM;

	prsrc->tag = *tag_slot;
	*tag_slot = 0;
	prsrc->rsrc = rsrc;
	list_add(&prsrc->list, &node->rsrc_list);
	return 0;
}

void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
#if !defined(IO_URING_SCM_ALL)
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		if (!file)
			continue;
		if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
			continue;
		io_file_bitmap_clear(&ctx->file_table, i);
		fput(file);
	}
#endif

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		struct sock *sock = ctx->ring_sock->sk;
		struct sk_buff *skb;

		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
			kfree_skb(skb);
	}
#endif
	io_free_file_tables(&ctx->file_table);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_files;
	int ret;

	if (!ctx->file_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;
	if (!ret)
		__io_sqe_files_unregister(ctx);
	return ret;
}

/*
 * Ensure the UNIX gc is aware of our file set, so we are certain that
 * the io_uring can be safely unregistered on process exit, even if we have
 * loops in the file referencing. We account only files that can hold other
 * files because otherwise they can't form a loop and so are not interesting
 * for GC.
 */
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sk = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sk->sk_receive_queue;
	struct scm_fp_list *fpl;
	struct sk_buff *skb;

	if (likely(!io_file_need_scm(file)))
		return 0;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
		__skb_unlink(skb, head);
	else
		skb = NULL;
	spin_unlock_irq(&head->lock);

	if (!skb) {
		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
		if (!fpl)
			return -ENOMEM;

		skb = alloc_skb(0, GFP_KERNEL);
		if (!skb) {
			kfree(fpl);
			return -ENOMEM;
		}

		fpl->user = get_uid(current_user());
		fpl->max = SCM_MAX_FD;
		fpl->count = 0;

		UNIXCB(skb).fp = fpl;
		skb->sk = sk;
		skb->destructor = unix_destruct_scm;
		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
	}

	fpl = UNIXCB(skb).fp;
	fpl->fp[fpl->count++] = get_file(file);
	unix_inflight(fpl->user, file);
	skb_queue_head(head, skb);
	fput(file);
#endif
	return 0;
}

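/*
 * Drop a registered file. With CONFIG_UNIX, first dig the file out of the
 * SCM_RIGHTS skb that accounts it for the UNIX gc, then drop the reference
 * io_uring holds on it.
 */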
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	if (!io_file_need_scm(file)) {
		fput(file);
		return;
	}

	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
					left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			file = NULL;
			break;
		}

		if (!file)
			break;

		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#else
	fput(file);
#endif
}

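/*
 * Register an array of file descriptors as the ring's fixed file table.
 * An fd of -1 (or a NULL array) leaves the slot sparse so it can be filled
 * in later by a files update or a direct open into that slot.
 */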
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto fail;
		}
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = io_scm_file_account(ctx, file);
		if (ret) {
			fput(file);
			goto fail;
		}
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	io_rsrc_node_switch(ctx, NULL);
	return 0;
fail:
	__io_sqe_files_unregister(ctx);
	return ret;
}

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	io_buffer_unmap(ctx, &prsrc->buf);
	prsrc->buf = NULL;
}

void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned int i;

	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_bufs;
	int ret;

	if (!ctx->buf_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
	if (!ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

/*
 * Not super efficient, but this is just a registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we don't
 * match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

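/*
 * Charge the pinned pages against the memlock limit. Pages belonging to a
 * compound (huge) page are accounted once, for the full compound size,
 * using the last_hpage cache and headpage_already_acct().
 */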
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

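/*
 * Pin the user pages backing [ubuf, ubuf + len). Only anonymous, shmem and
 * hugetlbfs backed memory is accepted; other file backed mappings are
 * rejected with -EOPNOTSUPP. Returns the page array or an ERR_PTR.
 */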
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
{
	unsigned long start, end, nr_pages;
	struct vm_area_struct **vmas = NULL;
	struct page **pages = NULL;
	int i, pret, ret = -ENOMEM;

	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	start = ubuf >> PAGE_SHIFT;
	nr_pages = end - start;

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto done;

	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
			      GFP_KERNEL);
	if (!vmas)
		goto done;

	ret = 0;
	mmap_read_lock(current->mm);
	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			      pages, vmas);
	if (pret == nr_pages) {
		/* don't support file backed memory */
		for (i = 0; i < nr_pages; i++) {
			struct vm_area_struct *vma = vmas[i];

			if (vma_is_shmem(vma))
				continue;
			if (vma->vm_file &&
			    !is_file_hugepages(vma->vm_file)) {
				ret = -EOPNOTSUPP;
				break;
			}
		}
		*npages = nr_pages;
	} else {
		ret = pret < 0 ? pret : -EFAULT;
	}
	mmap_read_unlock(current->mm);
	if (ret) {
		/*
		 * if we did partial map, or found file backed vmas,
		 * release any pages we did get
		 */
		if (pret > 0)
			unpin_user_pages(pages, pret);
		goto done;
	}
	ret = 0;
done:
	kvfree(vmas);
	if (ret < 0) {
		kvfree(pages);
		pages = ERR_PTR(ret);
	}
	return pages;
}

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;

	*pimu = ctx->dummy_ubuf;
	if (!iov->iov_base)
		return 0;

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
	size = iov->iov_len;
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		imu->bvec[i].bv_page = pages[i];
		imu->bvec[i].bv_len = vec_len;
		imu->bvec[i].bv_offset = off;
		off = 0;
		size -= vec_len;
	}
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->ubuf_end = imu->ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;
	*pimu = imu;
	ret = 0;
done:
	if (ret)
		kvfree(imu);
	kvfree(pages);
	return ret;
}

static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	int i, ret;
	struct iovec iov;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
	if (ret)
		return ret;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		if (arg) {
			ret = io_copy_iov(ctx, &iov, arg, i);
			if (ret)
				break;
			ret = io_buffer_validate(&iov);
			if (ret)
				break;
		} else {
			memset(&iov, 0, sizeof(iov));
		}

		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
	}

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	else
		io_rsrc_node_switch(ctx, NULL);
	return ret;
}