// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}

static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
		       void __user *arg, unsigned index)
{
	struct iovec __user *src;

#ifdef CONFIG_COMPAT
	if (ctx->compat) {
		struct compat_iovec __user *ciovs;
		struct compat_iovec ciov;

		ciovs = (struct compat_iovec __user *) arg;
		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
			return -EFAULT;

		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
		dst->iov_len = ciov.iov_len;
		return 0;
	}
#endif
	src = (struct iovec __user *) arg;
	if (copy_from_user(dst, &src[index], sizeof(*dst)))
		return -EFAULT;
	return 0;
}

static int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}
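
/*
 * Illustrative numbers (assuming 4K pages): registering iov_base = NULL
 * with iov_len = 0 is accepted as a sparse slot, while a NULL base with a
 * non-zero length, a zero length with a non-NULL base, or any length above
 * SZ_1G is rejected with -EFAULT. The overflow check adds the length
 * rounded up by PAGE_SIZE - 1 to the base, so a buffer whose page-aligned
 * end would wrap past the top of the address space fails with -EOVERFLOW.
 */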

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
	struct io_mapped_ubuf *imu = *slot;
	unsigned int i;

	if (imu != ctx->dummy_ubuf) {
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
	*slot = NULL;
}

static void io_rsrc_put_work(struct io_rsrc_node *node)
{
	struct io_rsrc_data *data = node->rsrc_data;
	struct io_rsrc_put *prsrc = &node->item;

	if (prsrc->tag)
		io_post_aux_cqe(data->ctx, prsrc->tag, 0, 0);

	switch (data->rsrc_type) {
	case IORING_RSRC_FILE:
		io_rsrc_file_put(data->ctx, prsrc);
		break;
	case IORING_RSRC_BUFFER:
		io_rsrc_buf_put(data->ctx, prsrc);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}
}

void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache))
		kfree(node);
}

void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
	__must_hold(&node->rsrc_data->ctx->uring_lock)
{
	struct io_ring_ctx *ctx = node->rsrc_data->ctx;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (node->refs)
			break;
		list_del(&node->node);

		if (likely(!node->empty))
			io_rsrc_put_work(node);
		io_rsrc_node_destroy(ctx, node);
	}
	if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce))
		wake_up_all(&ctx->rsrc_quiesce_wq);
}

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
{
	struct io_rsrc_node *ref_node;
	struct io_cache_entry *entry;

	entry = io_alloc_cache_get(&ctx->rsrc_node_cache);
	if (entry) {
		ref_node = container_of(entry, struct io_rsrc_node, cache);
	} else {
		ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
		if (!ref_node)
			return NULL;
	}

	ref_node->rsrc_data = NULL;
	ref_node->empty = 0;
	ref_node->refs = 1;
	return ref_node;
}
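
/*
 * Rough lifecycle, as it can be read from this file: ctx->rsrc_node is the
 * node that in-flight requests take a reference on; queueing a removal (or
 * starting a quiesce below) parks the current node on rsrc_ref_list and
 * installs a fresh one. Parked nodes are flushed strictly in list order
 * from io_rsrc_node_ref_zero() once their reference count drops to zero,
 * which is exactly what the quiesce loop below waits for.
 */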

__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	struct io_rsrc_node *backup;
	DEFINE_WAIT(we);
	int ret;

	/* As we may drop ->uring_lock, other task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;

	backup = io_rsrc_node_alloc(ctx);
	if (!backup)
		return -ENOMEM;
	ctx->rsrc_node->empty = true;
	ctx->rsrc_node->rsrc_data = data;
	list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list);
	io_put_rsrc_node(ctx, ctx->rsrc_node);
	ctx->rsrc_node = backup;

	if (list_empty(&ctx->rsrc_ref_list))
		return 0;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 1);
		smp_mb();
	}

	ctx->rsrc_quiesce++;
	data->quiesce = true;
	do {
		prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
		mutex_unlock(&ctx->uring_lock);

		ret = io_run_task_work_sig(ctx);
		if (ret < 0) {
			mutex_lock(&ctx->uring_lock);
			if (list_empty(&ctx->rsrc_ref_list))
				ret = 0;
			break;
		}

		schedule();
		__set_current_state(TASK_RUNNING);
		mutex_lock(&ctx->uring_lock);
		ret = 0;
	} while (!list_empty(&ctx->rsrc_ref_list));

	finish_wait(&ctx->rsrc_quiesce_wq, &we);
	data->quiesce = false;
	ctx->rsrc_quiesce--;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 0);
		smp_mb();
	}
	return ret;
}

static void io_free_page_table(void **table, size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < nr_tables; i++)
		kfree(table[i]);
	kfree(table);
}

static void io_rsrc_data_free(struct io_rsrc_data *data)
{
	size_t size = data->nr * sizeof(data->tags[0][0]);

	if (data->tags)
		io_free_page_table((void **)data->tags, size);
	kfree(data);
}

static __cold void **io_alloc_page_table(size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t init_size = size;
	void **table;

	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
	if (!table)
		return NULL;

	for (i = 0; i < nr_tables; i++) {
		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);

		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
		if (!table[i]) {
			io_free_page_table(table, init_size);
			return NULL;
		}
		size -= this_size;
	}
	return table;
}

__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type,
				     u64 __user *utags,
				     unsigned nr, struct io_rsrc_data **pdata)
{
	struct io_rsrc_data *data;
	int ret = 0;
	unsigned i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
	if (!data->tags) {
		kfree(data);
		return -ENOMEM;
	}

	data->nr = nr;
	data->ctx = ctx;
	data->rsrc_type = type;
	if (utags) {
		ret = -EFAULT;
		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
					   sizeof(*tag_slot)))
				goto fail;
		}
	}
	*pdata = data;
	return 0;
fail:
	io_rsrc_data_free(data);
	return ret;
}
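
/*
 * Sketch of the tag table layout implied above (io_get_tag_slot() itself
 * lives in rsrc.h): the tags are stored in page-sized chunks, so with 4K
 * pages and 8-byte tags each chunk holds 512 entries, and registering
 * 100000 resources needs DIV_ROUND_UP(100000 * 8, 4096) = 196 chunks.
 * Slot i is then expected to resolve to tags[i / 512][i % 512].
 */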

static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	struct file *file;
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
			err = io_queue_rsrc_removal(data, i, file);
			if (err)
				break;
			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);
		}
		if (fd != -1) {
			file = fget(fd);
			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered. If
			 * UNIX isn't enabled, then this causes a reference
			 * cycle and this instance can never get freed. If UNIX
			 * is enabled we'll handle it just fine, but there's
			 * still no point in allowing a ring fd as it doesn't
			 * support regular read/write anyway.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			err = io_scm_file_account(ctx, file);
			if (err) {
				fput(file);
				break;
			}
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}
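
/*
 * Note on the return convention shared by both update helpers: like a
 * short write, they report the number of slots processed if any succeeded
 * and only surface the error code when the very first entry fails. A
 * caller updating 8 files that hits -EBADF on the third entry therefore
 * gets 2 back and has to retry the remainder itself.
 */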

static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
	struct page *last_hpage = NULL;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		u64 tag = 0;

		err = io_copy_iov(ctx, &iov, iovs, done);
		if (err)
			break;
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(&iov);
		if (err)
			break;
		if (!iov.iov_base && tag) {
			err = -EINVAL;
			break;
		}
		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
		if (err)
			break;

		i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, i,
						    ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
				break;
			}
			ctx->user_bufs[i] = ctx->dummy_ubuf;
		}

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, i) = tag;
	}
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}
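
/*
 * SQE layout consumed above for IORING_OP_FILES_UPDATE: sqe->off is the
 * destination index into the fixed file table (or IORING_FILE_INDEX_ALLOC
 * to let the kernel pick free slots), sqe->len is the number of file
 * descriptors, and sqe->addr points to the userspace array of __s32 fds.
 * sqe->rw_flags and sqe->splice_fd_in must be zero, otherwise prep fails
 * with -EINVAL.
 */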

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_data)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc)
{
	struct io_ring_ctx *ctx = data->ctx;
	struct io_rsrc_node *node = ctx->rsrc_node;
	u64 *tag_slot = io_get_tag_slot(data, idx);

	ctx->rsrc_node = io_rsrc_node_alloc(ctx);
	if (unlikely(!ctx->rsrc_node)) {
		ctx->rsrc_node = node;
		return -ENOMEM;
	}

	node->item.rsrc = rsrc;
	node->item.tag = *tag_slot;
	*tag_slot = 0;

	node->rsrc_data = data;
	list_add_tail(&node->node, &ctx->rsrc_ref_list);
	io_put_rsrc_node(ctx, node);
	return 0;
}
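
/*
 * In other words (summarising the code above, not adding new behaviour):
 * queueing a removal hands the single resource plus its registration tag
 * to the node that requests are currently referencing, then swaps in a
 * brand new ctx->rsrc_node. The old node is only flushed (posting the tag
 * as a CQE and dropping the file or buffer) from io_rsrc_node_ref_zero()
 * once every request that could still see the old table entry completes.
 */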

void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		/* skip scm accounted files, they'll be freed by ->ring_sock */
		if (!file || io_file_need_scm(file))
			continue;
		io_file_bitmap_clear(&ctx->file_table, i);
		fput(file);
	}

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		struct sock *sock = ctx->ring_sock->sk;
		struct sk_buff *skb;

		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
			kfree_skb(skb);
	}
#endif
	io_free_file_tables(&ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_files;
	int ret;

	if (!ctx->file_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;
	if (!ret)
		__io_sqe_files_unregister(ctx);
	return ret;
}

/*
 * Ensure the UNIX gc is aware of our file set, so we are certain that
 * the io_uring can be safely unregistered on process exit, even if we have
 * loops in the file referencing. We account only files that can hold other
 * files because otherwise they can't form a loop and so are not interesting
 * for GC.
 */
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sk = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sk->sk_receive_queue;
	struct scm_fp_list *fpl;
	struct sk_buff *skb;

	if (likely(!io_file_need_scm(file)))
		return 0;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
		__skb_unlink(skb, head);
	else
		skb = NULL;
	spin_unlock_irq(&head->lock);

	if (!skb) {
		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
		if (!fpl)
			return -ENOMEM;

		skb = alloc_skb(0, GFP_KERNEL);
		if (!skb) {
			kfree(fpl);
			return -ENOMEM;
		}

		fpl->user = get_uid(current_user());
		fpl->max = SCM_MAX_FD;
		fpl->count = 0;

		UNIXCB(skb).fp = fpl;
		skb->sk = sk;
		skb->scm_io_uring = 1;
		skb->destructor = unix_destruct_scm;
		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
	}

	fpl = UNIXCB(skb).fp;
	fpl->fp[fpl->count++] = get_file(file);
	unix_inflight(fpl->user, file);
	skb_queue_head(head, skb);
	fput(file);
#endif
	return 0;
}
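
/*
 * Packing example, assuming SCM_MAX_FD is 253 as defined in net/scm.h:
 * registering 600 unix sockets spreads them across three SCM_RIGHTS skbs
 * on the ring socket's receive queue (253 + 253 + 94), since a new skb is
 * only allocated once the one at the head of the queue is full.
 */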

static __cold void io_rsrc_file_scm_put(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
					left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			file = NULL;
			break;
		}

		if (!file)
			break;

		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#endif
}

static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	struct file *file = prsrc->file;

	if (likely(!io_file_need_scm(file)))
		fput(file);
	else
		io_rsrc_file_scm_put(ctx, file);
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto fail;
		}
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = io_scm_file_account(ctx, file);
		if (ret) {
			fput(file);
			goto fail;
		}
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
	return 0;
fail:
	__io_sqe_files_unregister(ctx);
	return ret;
}
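
/*
 * Usage note: passing a NULL fd array (the IORING_RSRC_REGISTER_SPARSE
 * path) or -1 entries above leaves empty slots behind; those can be filled
 * later through IORING_REGISTER_FILES_UPDATE or the io_fixed_fd_install()
 * path used further up, subject to the rule enforced above that an empty
 * slot may not carry a non-zero tag.
 */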

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	io_buffer_unmap(ctx, &prsrc->buf);
	prsrc->buf = NULL;
}

void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned int i;

	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_bufs;
	int ret;

	if (!ctx->buf_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
	if (!ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

/*
 * Not super efficient, but this only runs at registration time. And we do
 * cache the last compound head, so generally we'll only do a full search
 * if we don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}
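
/*
 * Worked example, assuming 4K base pages: a buffer backed by one 2 MiB
 * compound page is accounted as page_size(hpage) >> PAGE_SHIFT = 512
 * pages exactly once, no matter how many of its tail pages were pinned,
 * and not at all if an earlier registered buffer already charged the same
 * head page.
 */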

struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
{
	unsigned long start, end, nr_pages;
	struct vm_area_struct **vmas = NULL;
	struct page **pages = NULL;
	int i, pret, ret = -ENOMEM;

	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	start = ubuf >> PAGE_SHIFT;
	nr_pages = end - start;

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto done;

	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
			      GFP_KERNEL);
	if (!vmas)
		goto done;

	ret = 0;
	mmap_read_lock(current->mm);
	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			      pages, vmas);
	if (pret == nr_pages) {
		struct file *file = vmas[0]->vm_file;

		/* don't support file backed memory */
		for (i = 0; i < nr_pages; i++) {
			if (vmas[i]->vm_file != file) {
				ret = -EINVAL;
				break;
			}
			if (!file)
				continue;
			if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) {
				ret = -EOPNOTSUPP;
				break;
			}
		}
		*npages = nr_pages;
	} else {
		ret = pret < 0 ? pret : -EFAULT;
	}
	mmap_read_unlock(current->mm);
	if (ret) {
		/*
		 * if we did partial map, or found file backed vmas,
		 * release any pages we did get
		 */
		if (pret > 0)
			unpin_user_pages(pages, pret);
		goto done;
	}
	ret = 0;
done:
	kvfree(vmas);
	if (ret < 0) {
		kvfree(pages);
		pages = ERR_PTR(ret);
	}
	return pages;
}

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct folio *folio = NULL;

	*pimu = ctx->dummy_ubuf;
	if (!iov->iov_base)
		return 0;

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
				&nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's a huge page, try to coalesce them into a single bvec entry */
	if (nr_pages > 1) {
		folio = page_folio(pages[0]);
		for (i = 1; i < nr_pages; i++) {
			if (page_folio(pages[i]) != folio) {
				folio = NULL;
				break;
			}
		}
		if (folio) {
			/*
			 * The pages are bound to the folio, it doesn't
			 * actually unpin them but drops all but one reference,
			 * which is usually put down by io_buffer_unmap().
			 * Note, needs a better helper.
			 */
			unpin_user_pages(&pages[1], nr_pages - 1);
			nr_pages = 1;
		}
	}

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->ubuf_end = imu->ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;
	*pimu = imu;
	ret = 0;

	if (folio) {
		bvec_set_page(&imu->bvec[0], pages[0], size, off);
		goto done;
	}
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret)
		kvfree(imu);
	kvfree(pages);
	return ret;
}
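
/*
 * Layout example, assuming 4K pages and no folio coalescing: registering
 * iov_base = page start + 256 with iov_len = 10240 pins three pages and
 * builds three bvecs of 3840, 4096 and 2304 bytes, with only the first
 * one carrying a non-zero page offset. When all pages belong to a single
 * folio (e.g. a huge page), the loop above is skipped and one bvec covers
 * the whole length instead.
 */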

static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	int i, ret;
	struct iovec iov;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data);
	if (ret)
		return ret;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		if (arg) {
			ret = io_copy_iov(ctx, &iov, arg, i);
			if (ret)
				break;
			ret = io_buffer_validate(&iov);
			if (ret)
				break;
		} else {
			memset(&iov, 0, sizeof(iov));
		}

		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
	}

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * Might not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec), just adjust the iterator in place. This makes it
		 * easier since we can just skip the first segment, which may
		 * not be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			/*
			 * Note, huge page buffers consist of one large
			 * bvec entry and should always go this way. The other
			 * branch doesn't expect non PAGE_SIZE'd chunks.
			 */
			iter->bvec = bvec;
			iter->nr_segs = bvec->bv_len;
			iter->count -= offset;
			iter->iov_offset = offset;
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}
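
/*
 * Offset math example for the slow branch above, assuming 4K pages and a
 * first bvec of 3840 bytes: importing at buf_addr 6000 bytes into the
 * registered buffer gives offset = 6000 > bvec[0].bv_len, so the else
 * branch runs with offset = 6000 - 3840 = 2160 and
 * seg_skip = 1 + (2160 >> PAGE_SHIFT) = 1, leaving the iterator at bvec[1]
 * with iov_offset = 2160 and count reduced back to the requested length.
 */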