// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/file.c
 *
 * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
 *
 * Manage the dynamic fd arrays in the process files_struct.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/close_range.h>
#include <net/sock.h>

#include "internal.h"

unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
#define __const_min(x, y) ((x) < (y) ? (x) : (y))
unsigned int sysctl_nr_open_max =
	__const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;

static void __free_fdtable(struct fdtable *fdt)
{
	kvfree(fdt->fd);
	kvfree(fdt->open_fds);
	kfree(fdt);
}

static void free_fdtable_rcu(struct rcu_head *rcu)
{
	__free_fdtable(container_of(rcu, struct fdtable, rcu));
}

#define BITBIT_NR(nr)	BITS_TO_LONGS(BITS_TO_LONGS(nr))
#define BITBIT_SIZE(nr)	(BITBIT_NR(nr) * sizeof(long))

#define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds
/*
 * Copy 'count' fd bits from the old table to the new table and clear the extra
 * space if any. This does not copy the file pointers. Called with the files
 * spinlock held for write.
 */
static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
				   unsigned int copy_words)
{
	unsigned int nwords = fdt_words(nfdt);

	bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds,
			copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
	bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec,
			copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
	bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits,
			copy_words, nwords);
}

/*
 * Copy all file descriptors from the old table to the new, expanded table and
 * clear the extra space. Called with the files spinlock held for write.
 */
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
	size_t cpy, set;

	BUG_ON(nfdt->max_fds < ofdt->max_fds);

	cpy = ofdt->max_fds * sizeof(struct file *);
	set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
	memcpy(nfdt->fd, ofdt->fd, cpy);
	memset((char *)nfdt->fd + cpy, 0, set);

	copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt));
}

/*
 * Note how the fdtable bitmap allocations very much have to be a multiple of
 * BITS_PER_LONG. This is not only because we walk those things in chunks of
 * 'unsigned long' in some places, but simply because that is how the Linux
 * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
 * they are very much "bits in an array of unsigned long".
 *
 * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
 * by that "1024/sizeof(ptr)" before, we already know there are sufficient
 * clear low bits. Clang seems to realize that, gcc ends up being confused.
 *
 * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
 * let's consider it documentation (and maybe a test-case for gcc to improve
 * its code generation ;)
 */
static struct fdtable * alloc_fdtable(unsigned int nr)
{
	struct fdtable *fdt;
	void *data;

	/*
	 * Figure out how many fds we actually want to support in this fdtable.
	 * Allocation steps are keyed to the size of the fdarray, since it
	 * grows far faster than any of the other dynamic data. We try to fit
	 * the fdarray into comfortable page-tuned chunks: starting at 1024B
	 * and growing in powers of two from there on.
	 */
	nr /= (1024 / sizeof(struct file *));
	nr = roundup_pow_of_two(nr + 1);
	nr *= (1024 / sizeof(struct file *));
	nr = ALIGN(nr, BITS_PER_LONG);
	/*
	 * Note that this can drive nr *below* what we had passed if sysctl_nr_open
	 * had been set lower between the check in expand_files() and here. Deal
	 * with that in caller, it's cheaper that way.
	 *
	 * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
	 * bitmaps handling below becomes unpleasant, to put it mildly...
	 */
	if (unlikely(nr > sysctl_nr_open))
		nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;

	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
	if (!fdt)
		goto out;
	fdt->max_fds = nr;
	data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT);
	if (!data)
		goto out_fdt;
	fdt->fd = data;

	data = kvmalloc(max_t(size_t,
			2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
			GFP_KERNEL_ACCOUNT);
	if (!data)
		goto out_arr;
	fdt->open_fds = data;
	data += nr / BITS_PER_BYTE;
	fdt->close_on_exec = data;
	data += nr / BITS_PER_BYTE;
	fdt->full_fds_bits = data;

	return fdt;

out_arr:
	kvfree(fdt->fd);
out_fdt:
	kfree(fdt);
out:
	return NULL;
}

/*
 * Expand the file descriptor table.
 * This function will allocate a new fdtable and both fd array and fdset, of
 * the given size.
 * Return <0 error code on error; 1 on successful completion.
 * The files->file_lock should be held on entry, and will be held on exit.
 */
static int expand_fdtable(struct files_struct *files, unsigned int nr)
	__releases(files->file_lock)
	__acquires(files->file_lock)
{
	struct fdtable *new_fdt, *cur_fdt;

	spin_unlock(&files->file_lock);
	new_fdt = alloc_fdtable(nr);

	/* make sure all fd_install() have seen resize_in_progress
	 * or have finished their rcu_read_lock_sched() section.
	 */
	if (atomic_read(&files->count) > 1)
		synchronize_rcu();

	spin_lock(&files->file_lock);
	if (!new_fdt)
		return -ENOMEM;
	/*
	 * extremely unlikely race - sysctl_nr_open decreased between the check in
	 * caller and alloc_fdtable(). Cheaper to catch it here...
	 */
	if (unlikely(new_fdt->max_fds <= nr)) {
		__free_fdtable(new_fdt);
		return -EMFILE;
	}
	cur_fdt = files_fdtable(files);
	BUG_ON(nr < cur_fdt->max_fds);
	copy_fdtable(new_fdt, cur_fdt);
	rcu_assign_pointer(files->fdt, new_fdt);
	if (cur_fdt != &files->fdtab)
		call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
	/* coupled with smp_rmb() in fd_install() */
	smp_wmb();
	return 1;
}

/*
 * Expand files.
 * This function will expand the file structures, if the requested size exceeds
 * the current capacity and there is room for expansion.
 * Return <0 error code on error; 0 when nothing done; 1 when files were
 * expanded and execution may have blocked.
 * The files->file_lock should be held on entry, and will be held on exit.
 */
static int expand_files(struct files_struct *files, unsigned int nr)
	__releases(files->file_lock)
	__acquires(files->file_lock)
{
	struct fdtable *fdt;
	int expanded = 0;

repeat:
	fdt = files_fdtable(files);

	/* Do we need to expand? */
	if (nr < fdt->max_fds)
		return expanded;

	/* Can we expand? */
	if (nr >= sysctl_nr_open)
		return -EMFILE;

	if (unlikely(files->resize_in_progress)) {
		spin_unlock(&files->file_lock);
		expanded = 1;
		wait_event(files->resize_wait, !files->resize_in_progress);
		spin_lock(&files->file_lock);
		goto repeat;
	}

	/* All good, so we try */
	files->resize_in_progress = true;
	expanded = expand_fdtable(files, nr);
	files->resize_in_progress = false;

	wake_up_all(&files->resize_wait);
	return expanded;
}

static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
	__set_bit(fd, fdt->close_on_exec);
}

static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
	if (test_bit(fd, fdt->close_on_exec))
		__clear_bit(fd, fdt->close_on_exec);
}

static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
{
	__set_bit(fd, fdt->open_fds);
	fd /= BITS_PER_LONG;
	if (!~fdt->open_fds[fd])
		__set_bit(fd, fdt->full_fds_bits);
}

static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{
	__clear_bit(fd, fdt->open_fds);
	__clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
}

/*
 * Note that a sane fdtable size always has to be a multiple of
 * BITS_PER_LONG, since we have bitmaps that are sized by this.
 *
 * punch_hole is optional - when close_range() is asked to unshare
 * and close, we don't need to copy descriptors in that range, so
 * a smaller cloned descriptor table might suffice if the last
 * currently opened descriptor falls into that range.
 */
static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole)
{
	unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds);

	if (last == fdt->max_fds)
		return NR_OPEN_DEFAULT;
	if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) {
		last = find_last_bit(fdt->open_fds, punch_hole->from);
		if (last == punch_hole->from)
			return NR_OPEN_DEFAULT;
	}
	return ALIGN(last + 1, BITS_PER_LONG);
}

/*
 * Allocate a new descriptor table and copy contents from the passed in
 * instance. Returns a pointer to cloned table on success, ERR_PTR()
 * on failure. For 'punch_hole' see sane_fdtable_size().
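 * Failure means ERR_PTR(-ENOMEM) when a new table cannot be allocated,
 * or ERR_PTR(-EMFILE) when the required size would exceed sysctl_nr_open.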
 */
struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole)
{
	struct files_struct *newf;
	struct file **old_fds, **new_fds;
	unsigned int open_files, i;
	struct fdtable *old_fdt, *new_fdt;
	int error;

	newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
	if (!newf)
		return ERR_PTR(-ENOMEM);

	atomic_set(&newf->count, 1);

	spin_lock_init(&newf->file_lock);
	newf->resize_in_progress = false;
	init_waitqueue_head(&newf->resize_wait);
	newf->next_fd = 0;
	new_fdt = &newf->fdtab;
	new_fdt->max_fds = NR_OPEN_DEFAULT;
	new_fdt->close_on_exec = newf->close_on_exec_init;
	new_fdt->open_fds = newf->open_fds_init;
	new_fdt->full_fds_bits = newf->full_fds_bits_init;
	new_fdt->fd = &newf->fd_array[0];

	spin_lock(&oldf->file_lock);
	old_fdt = files_fdtable(oldf);
	open_files = sane_fdtable_size(old_fdt, punch_hole);

	/*
	 * Check whether we need to allocate a larger fd array and fd set.
	 */
	while (unlikely(open_files > new_fdt->max_fds)) {
		spin_unlock(&oldf->file_lock);

		if (new_fdt != &newf->fdtab)
			__free_fdtable(new_fdt);

		new_fdt = alloc_fdtable(open_files - 1);
		if (!new_fdt) {
			error = -ENOMEM;
			goto out_release;
		}

		/* beyond sysctl_nr_open; nothing to do */
		if (unlikely(new_fdt->max_fds < open_files)) {
			__free_fdtable(new_fdt);
			error = -EMFILE;
			goto out_release;
		}

		/*
		 * Reacquire the oldf lock and a pointer to its fd table;
		 * who knows, it may have a new, bigger fd table. We need
		 * the latest pointer.
		 */
		spin_lock(&oldf->file_lock);
		old_fdt = files_fdtable(oldf);
		open_files = sane_fdtable_size(old_fdt, punch_hole);
	}

	copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG);

	old_fds = old_fdt->fd;
	new_fds = new_fdt->fd;

	for (i = open_files; i != 0; i--) {
		struct file *f = *old_fds++;
		if (f) {
			get_file(f);
		} else {
			/*
			 * The fd may be claimed in the fd bitmap but not yet
			 * instantiated in the files array if a sibling thread
			 * is partway through open(). So make sure that this
			 * fd is available to the new process.
			 */
			__clear_open_fd(open_files - i, new_fdt);
		}
		rcu_assign_pointer(*new_fds++, f);
	}
	spin_unlock(&oldf->file_lock);

	/* clear the remainder */
	memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));

	rcu_assign_pointer(newf->fdt, new_fdt);

	return newf;

out_release:
	kmem_cache_free(files_cachep, newf);
	return ERR_PTR(error);
}

static struct fdtable *close_files(struct files_struct * files)
{
	/*
	 * It is safe to dereference the fd table without RCU or
	 * ->file_lock because this is the last reference to the
	 * files structure.
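	 *
	 * rcu_dereference_raw() is only used below to cope with the __rcu
	 * annotation on ->fdt; no actual RCU protection is needed here.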
	 */
	struct fdtable *fdt = rcu_dereference_raw(files->fdt);
	unsigned int i, j = 0;

	for (;;) {
		unsigned long set;
		i = j * BITS_PER_LONG;
		if (i >= fdt->max_fds)
			break;
		set = fdt->open_fds[j++];
		while (set) {
			if (set & 1) {
				struct file * file = xchg(&fdt->fd[i], NULL);
				if (file) {
					filp_close(file, files);
					cond_resched();
				}
			}
			i++;
			set >>= 1;
		}
	}

	return fdt;
}

void put_files_struct(struct files_struct *files)
{
	if (atomic_dec_and_test(&files->count)) {
		struct fdtable *fdt = close_files(files);

		/* free the arrays if they are not embedded */
		if (fdt != &files->fdtab)
			__free_fdtable(fdt);
		kmem_cache_free(files_cachep, files);
	}
}

void exit_files(struct task_struct *tsk)
{
	struct files_struct * files = tsk->files;

	if (files) {
		task_lock(tsk);
		tsk->files = NULL;
		task_unlock(tsk);
		put_files_struct(files);
	}
}

struct files_struct init_files = {
	.count = ATOMIC_INIT(1),
	.fdt = &init_files.fdtab,
	.fdtab = {
		.max_fds = NR_OPEN_DEFAULT,
		.fd = &init_files.fd_array[0],
		.close_on_exec = init_files.close_on_exec_init,
		.open_fds = init_files.open_fds_init,
		.full_fds_bits = init_files.full_fds_bits_init,
	},
	.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
	.resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
};

static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
	unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
	unsigned int maxbit = maxfd / BITS_PER_LONG;
	unsigned int bitbit = start / BITS_PER_LONG;

	bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
	if (bitbit >= maxfd)
		return maxfd;
	if (bitbit > start)
		start = bitbit;
	return find_next_zero_bit(fdt->open_fds, maxfd, start);
}

/*
 * allocate a file descriptor, mark it busy.
 */
static int alloc_fd(unsigned start, unsigned end, unsigned flags)
{
	struct files_struct *files = current->files;
	unsigned int fd;
	int error;
	struct fdtable *fdt;

	spin_lock(&files->file_lock);
repeat:
	fdt = files_fdtable(files);
	fd = start;
	if (fd < files->next_fd)
		fd = files->next_fd;

	if (fd < fdt->max_fds)
		fd = find_next_fd(fdt, fd);

	/*
	 * N.B. For clone tasks sharing a files structure, this test
	 * will limit the total number of files that can be opened.
	 */
	error = -EMFILE;
	if (fd >= end)
		goto out;

	error = expand_files(files, fd);
	if (error < 0)
		goto out;

	/*
	 * If we needed to expand the fd array we
	 * might have blocked - try again.
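	 * (expand_files() dropped and reacquired ->file_lock in that
	 * case, so the fdtable pointer has to be re-read.)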
	 */
	if (error)
		goto repeat;

	if (start <= files->next_fd)
		files->next_fd = fd + 1;

	__set_open_fd(fd, fdt);
	if (flags & O_CLOEXEC)
		__set_close_on_exec(fd, fdt);
	else
		__clear_close_on_exec(fd, fdt);
	error = fd;
#if 1
	/* Sanity check */
	if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
		rcu_assign_pointer(fdt->fd[fd], NULL);
	}
#endif

out:
	spin_unlock(&files->file_lock);
	return error;
}

int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
{
	return alloc_fd(0, nofile, flags);
}

int get_unused_fd_flags(unsigned flags)
{
	return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE));
}
EXPORT_SYMBOL(get_unused_fd_flags);

static void __put_unused_fd(struct files_struct *files, unsigned int fd)
{
	struct fdtable *fdt = files_fdtable(files);
	__clear_open_fd(fd, fdt);
	if (fd < files->next_fd)
		files->next_fd = fd;
}

void put_unused_fd(unsigned int fd)
{
	struct files_struct *files = current->files;
	spin_lock(&files->file_lock);
	__put_unused_fd(files, fd);
	spin_unlock(&files->file_lock);
}

EXPORT_SYMBOL(put_unused_fd);

/*
 * Install a file pointer in the fd array.
 *
 * The VFS is full of places where we drop the files lock between
 * setting the open_fds bitmap and installing the file in the file
 * array. At any such point, we are vulnerable to a dup2() race
 * installing a file in the array before us. We need to detect this and
 * fput() the struct file we are about to overwrite in this case.
 *
 * It should never happen - if we allow dup2() to do it, _really_ bad things
 * will follow.
 *
 * This consumes the "file" refcount, so callers should treat it
 * as if they had called fput(file).
 */

void fd_install(unsigned int fd, struct file *file)
{
	struct files_struct *files = current->files;
	struct fdtable *fdt;

	rcu_read_lock_sched();

	if (unlikely(files->resize_in_progress)) {
		rcu_read_unlock_sched();
		spin_lock(&files->file_lock);
		fdt = files_fdtable(files);
		BUG_ON(fdt->fd[fd] != NULL);
		rcu_assign_pointer(fdt->fd[fd], file);
		spin_unlock(&files->file_lock);
		return;
	}
	/* coupled with smp_wmb() in expand_fdtable() */
	smp_rmb();
	fdt = rcu_dereference_sched(files->fdt);
	BUG_ON(fdt->fd[fd] != NULL);
	rcu_assign_pointer(fdt->fd[fd], file);
	rcu_read_unlock_sched();
}

EXPORT_SYMBOL(fd_install);

/**
 * pick_file - return file associated with fd
 * @files: file struct to retrieve file from
 * @fd: file descriptor to retrieve file for
 *
 * Context: files_lock must be held.
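 *
 * On success the entry is cleared from the table, so the reference
 * previously held by @files is transferred to the caller.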
 *
 * Returns: The file associated with @fd (NULL if @fd is not open)
 */
static struct file *pick_file(struct files_struct *files, unsigned fd)
{
	struct fdtable *fdt = files_fdtable(files);
	struct file *file;

	if (fd >= fdt->max_fds)
		return NULL;

	fd = array_index_nospec(fd, fdt->max_fds);
	file = fdt->fd[fd];
	if (file) {
		rcu_assign_pointer(fdt->fd[fd], NULL);
		__put_unused_fd(files, fd);
	}
	return file;
}

int close_fd(unsigned fd)
{
	struct files_struct *files = current->files;
	struct file *file;

	spin_lock(&files->file_lock);
	file = pick_file(files, fd);
	spin_unlock(&files->file_lock);
	if (!file)
		return -EBADF;

	return filp_close(file, files);
}
EXPORT_SYMBOL(close_fd); /* for ksys_close() */

/**
 * last_fd - return last valid index into fd table
 * @fdt: File descriptor table.
 *
 * Context: Either rcu read lock or files_lock must be held.
 *
 * Returns: Last valid index into fdtable.
 */
static inline unsigned last_fd(struct fdtable *fdt)
{
	return fdt->max_fds - 1;
}

static inline void __range_cloexec(struct files_struct *cur_fds,
				   unsigned int fd, unsigned int max_fd)
{
	struct fdtable *fdt;

	/* make sure we're using the correct maximum value */
	spin_lock(&cur_fds->file_lock);
	fdt = files_fdtable(cur_fds);
	max_fd = min(last_fd(fdt), max_fd);
	if (fd <= max_fd)
		bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);
	spin_unlock(&cur_fds->file_lock);
}

static inline void __range_close(struct files_struct *files, unsigned int fd,
				 unsigned int max_fd)
{
	struct file *file;
	unsigned n;

	spin_lock(&files->file_lock);
	n = last_fd(files_fdtable(files));
	max_fd = min(max_fd, n);

	for (; fd <= max_fd; fd++) {
		file = pick_file(files, fd);
		if (file) {
			spin_unlock(&files->file_lock);
			filp_close(file, files);
			cond_resched();
			spin_lock(&files->file_lock);
		} else if (need_resched()) {
			spin_unlock(&files->file_lock);
			cond_resched();
			spin_lock(&files->file_lock);
		}
	}
	spin_unlock(&files->file_lock);
}

/**
 * __close_range() - Close all file descriptors in a given range.
 *
 * @fd: starting file descriptor to close
 * @max_fd: last file descriptor to close
 * @flags: CLOSE_RANGE flags.
 *
 * This closes a range of file descriptors. All file descriptors
 * from @fd up to and including @max_fd are closed.
 */
int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
{
	struct task_struct *me = current;
	struct files_struct *cur_fds = me->files, *fds = NULL;

	if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
		return -EINVAL;

	if (fd > max_fd)
		return -EINVAL;

	if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) {
		struct fd_range range = {fd, max_fd}, *punch_hole = &range;

		/*
		 * If the caller requested all fds to be made cloexec we always
		 * copy all of the file descriptors since they still want to
		 * use them.
		 */
		if (flags & CLOSE_RANGE_CLOEXEC)
			punch_hole = NULL;

		fds = dup_fd(cur_fds, punch_hole);
		if (IS_ERR(fds))
			return PTR_ERR(fds);
		/*
		 * We used to share our file descriptor table, and have now
		 * created a private one, make sure we're using it below.
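		 * (swap() leaves the old shared table in 'fds' so that it
		 * can be dropped once the new one has been installed.)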
		 */
		swap(cur_fds, fds);
	}

	if (flags & CLOSE_RANGE_CLOEXEC)
		__range_cloexec(cur_fds, fd, max_fd);
	else
		__range_close(cur_fds, fd, max_fd);

	if (fds) {
		/*
		 * We're done closing the files we were supposed to. Time to install
		 * the new file descriptor table and drop the old one.
		 */
		task_lock(me);
		me->files = cur_fds;
		task_unlock(me);
		put_files_struct(fds);
	}

	return 0;
}

/*
 * See close_fd_get_file() below, this variant assumes current->files->file_lock
 * is held.
 */
struct file *__close_fd_get_file(unsigned int fd)
{
	return pick_file(current->files, fd);
}

/*
 * variant of close_fd that gets a ref on the file for later fput.
 * The caller must ensure that filp_close() is called on the file.
 */
struct file *close_fd_get_file(unsigned int fd)
{
	struct files_struct *files = current->files;
	struct file *file;

	spin_lock(&files->file_lock);
	file = pick_file(files, fd);
	spin_unlock(&files->file_lock);

	return file;
}

void do_close_on_exec(struct files_struct *files)
{
	unsigned i;
	struct fdtable *fdt;

	/* exec unshares first */
	spin_lock(&files->file_lock);
	for (i = 0; ; i++) {
		unsigned long set;
		unsigned fd = i * BITS_PER_LONG;
		fdt = files_fdtable(files);
		if (fd >= fdt->max_fds)
			break;
		set = fdt->close_on_exec[i];
		if (!set)
			continue;
		fdt->close_on_exec[i] = 0;
		for ( ; set ; fd++, set >>= 1) {
			struct file *file;
			if (!(set & 1))
				continue;
			file = fdt->fd[fd];
			if (!file)
				continue;
			rcu_assign_pointer(fdt->fd[fd], NULL);
			__put_unused_fd(files, fd);
			spin_unlock(&files->file_lock);
			filp_close(file, files);
			cond_resched();
			spin_lock(&files->file_lock);
		}

	}
	spin_unlock(&files->file_lock);
}

static inline struct file *__fget_files_rcu(struct files_struct *files,
	unsigned int fd, fmode_t mask)
{
	for (;;) {
		struct file *file;
		struct fdtable *fdt = rcu_dereference_raw(files->fdt);
		struct file __rcu **fdentry;

		if (unlikely(fd >= fdt->max_fds))
			return NULL;

		fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
		file = rcu_dereference_raw(*fdentry);
		if (unlikely(!file))
			return NULL;

		if (unlikely(file->f_mode & mask))
			return NULL;

		/*
		 * Ok, we have a file pointer. However, because we do
		 * this all locklessly under RCU, we may be racing with
		 * that file being closed.
		 *
		 * Such a race can take two forms:
		 *
		 * (a) the file ref already went down to zero,
		 *     and get_file_rcu() fails. Just try again:
		 */
		if (unlikely(!get_file_rcu(file)))
			continue;

		/*
		 * (b) the file table entry has changed under us.
		 *     Note that we don't need to re-check the 'fdt->fd'
		 *     pointer having changed, because it always goes
		 *     hand-in-hand with 'fdt'.
		 *
		 * If so, we need to put our ref and try again.
		 */
		if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
		    unlikely(rcu_dereference_raw(*fdentry) != file)) {
			fput(file);
			continue;
		}

		/*
		 * Ok, we have a ref to the file, and checked that it
		 * still exists.
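		 * Even if the descriptor gets closed right after this,
		 * the reference we hold keeps the struct file itself
		 * alive until the matching fput().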
		 */
		return file;
	}
}

static struct file *__fget_files(struct files_struct *files, unsigned int fd,
				 fmode_t mask)
{
	struct file *file;

	rcu_read_lock();
	file = __fget_files_rcu(files, fd, mask);
	rcu_read_unlock();

	return file;
}

static inline struct file *__fget(unsigned int fd, fmode_t mask)
{
	return __fget_files(current->files, fd, mask);
}

struct file *fget(unsigned int fd)
{
	return __fget(fd, FMODE_PATH);
}
EXPORT_SYMBOL(fget);

struct file *fget_raw(unsigned int fd)
{
	return __fget(fd, 0);
}
EXPORT_SYMBOL(fget_raw);

struct file *fget_task(struct task_struct *task, unsigned int fd)
{
	struct file *file = NULL;

	task_lock(task);
	if (task->files)
		file = __fget_files(task->files, fd, 0);
	task_unlock(task);

	return file;
}

struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
{
	/* Must be called with rcu_read_lock held */
	struct files_struct *files;
	struct file *file = NULL;

	task_lock(task);
	files = task->files;
	if (files)
		file = files_lookup_fd_rcu(files, fd);
	task_unlock(task);

	return file;
}

struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd)
{
	/* Must be called with rcu_read_lock held */
	struct files_struct *files;
	unsigned int fd = *ret_fd;
	struct file *file = NULL;

	task_lock(task);
	files = task->files;
	if (files) {
		for (; fd < files_fdtable(files)->max_fds; fd++) {
			file = files_lookup_fd_rcu(files, fd);
			if (file)
				break;
		}
	}
	task_unlock(task);
	*ret_fd = fd;
	return file;
}
EXPORT_SYMBOL(task_lookup_next_fd_rcu);

/*
 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
 *
 * You can use this instead of fget if you satisfy all of the following
 * conditions:
 * 1) You must call fput_light before exiting the syscall and returning control
 *    to userspace (i.e. you cannot remember the returned struct file * after
 *    returning to userspace).
 * 2) You must not call filp_close on the returned struct file * in between
 *    calls to fget_light and fput_light.
 * 3) You must not clone the current task in between the calls to fget_light
 *    and fput_light.
 *
 * The fput_needed flag returned by fget_light should be passed to the
 * corresponding fput_light.
 */
static unsigned long __fget_light(unsigned int fd, fmode_t mask)
{
	struct files_struct *files = current->files;
	struct file *file;

	/*
	 * If another thread is concurrently calling close_fd() followed
	 * by put_files_struct(), we must not observe the old table
	 * entry combined with the new refcount - otherwise we could
	 * return a file that is concurrently being freed.
	 *
	 * atomic_read_acquire() pairs with atomic_dec_and_test() in
	 * put_files_struct().
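	 *
	 * With ->count == 1 the table is not shared, so no other thread
	 * can close the descriptor behind our back and we may return the
	 * file without taking a reference (FDPUT_FPUT is not set).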
	 */
	if (atomic_read_acquire(&files->count) == 1) {
		file = files_lookup_fd_raw(files, fd);
		if (!file || unlikely(file->f_mode & mask))
			return 0;
		return (unsigned long)file;
	} else {
		file = __fget(fd, mask);
		if (!file)
			return 0;
		return FDPUT_FPUT | (unsigned long)file;
	}
}
unsigned long __fdget(unsigned int fd)
{
	return __fget_light(fd, FMODE_PATH);
}
EXPORT_SYMBOL(__fdget);

unsigned long __fdget_raw(unsigned int fd)
{
	return __fget_light(fd, 0);
}

/*
 * Try to avoid f_pos locking. We only need it if the
 * file is marked for FMODE_ATOMIC_POS, and it can be
 * accessed multiple ways.
 *
 * Always do it for directories, because pidfd_getfd()
 * can make a file accessible even if it otherwise would
 * not be, and for directories this is a correctness
 * issue, not a "POSIX requirement".
 */
static inline bool file_needs_f_pos_lock(struct file *file)
{
	return (file->f_mode & FMODE_ATOMIC_POS) &&
		(file_count(file) > 1 || file->f_op->iterate_shared);
}

unsigned long __fdget_pos(unsigned int fd)
{
	unsigned long v = __fdget(fd);
	struct file *file = (struct file *)(v & ~3);

	if (file && file_needs_f_pos_lock(file)) {
		v |= FDPUT_POS_UNLOCK;
		mutex_lock(&file->f_pos_lock);
	}
	return v;
}

void __f_unlock_pos(struct file *f)
{
	mutex_unlock(&f->f_pos_lock);
}

/*
 * We only lock f_pos if we have threads or if the file might be
 * shared with another process. In both cases we'll have an elevated
 * file count (done either by fdget() or by fork()).
 */

void set_close_on_exec(unsigned int fd, int flag)
{
	struct files_struct *files = current->files;
	struct fdtable *fdt;
	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	if (flag)
		__set_close_on_exec(fd, fdt);
	else
		__clear_close_on_exec(fd, fdt);
	spin_unlock(&files->file_lock);
}

bool get_close_on_exec(unsigned int fd)
{
	struct files_struct *files = current->files;
	struct fdtable *fdt;
	bool res;
	rcu_read_lock();
	fdt = files_fdtable(files);
	res = close_on_exec(fd, fdt);
	rcu_read_unlock();
	return res;
}

static int do_dup2(struct files_struct *files,
	struct file *file, unsigned fd, unsigned flags)
__releases(&files->file_lock)
{
	struct file *tofree;
	struct fdtable *fdt;

	/*
	 * We need to detect attempts to do dup2() over an allocated but still
	 * not finished descriptor. NB: OpenBSD avoids that at the price of
	 * extra work in their equivalent of fget() - they insert struct
	 * file immediately after grabbing descriptor, mark it larval if
	 * more work (e.g. actual opening) is needed and make sure that
	 * fget() treats larval files as absent. Potentially interesting,
	 * but while extra work in fget() is trivial, locking implications
	 * and amount of surgery on open()-related paths in VFS are not.
	 * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
	 * deadlocks in rather amusing ways, AFAICS. All of that is out of
	 * scope of POSIX or SUS, since neither considers shared descriptor
	 * tables and this condition does not arise without those.
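	 *
	 * Hence the -EBUSY below: the descriptor is marked open in the
	 * bitmap but no file has been installed in the slot yet.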
	 */
	fdt = files_fdtable(files);
	fd = array_index_nospec(fd, fdt->max_fds);
	tofree = fdt->fd[fd];
	if (!tofree && fd_is_open(fd, fdt))
		goto Ebusy;
	get_file(file);
	rcu_assign_pointer(fdt->fd[fd], file);
	__set_open_fd(fd, fdt);
	if (flags & O_CLOEXEC)
		__set_close_on_exec(fd, fdt);
	else
		__clear_close_on_exec(fd, fdt);
	spin_unlock(&files->file_lock);

	if (tofree)
		filp_close(tofree, files);

	return fd;

Ebusy:
	spin_unlock(&files->file_lock);
	return -EBUSY;
}

int replace_fd(unsigned fd, struct file *file, unsigned flags)
{
	int err;
	struct files_struct *files = current->files;

	if (!file)
		return close_fd(fd);

	if (fd >= rlimit(RLIMIT_NOFILE))
		return -EBADF;

	spin_lock(&files->file_lock);
	err = expand_files(files, fd);
	if (unlikely(err < 0))
		goto out_unlock;
	return do_dup2(files, file, fd, flags);

out_unlock:
	spin_unlock(&files->file_lock);
	return err;
}

/**
 * __receive_fd() - Install received file into file descriptor table
 * @file: struct file that was received from another process
 * @ufd: __user pointer to write new fd number to
 * @o_flags: the O_* flags to apply to the new fd entry
 *
 * Installs a received file into the file descriptor table, with appropriate
 * checks and count updates. Optionally writes the fd number to userspace, if
 * @ufd is non-NULL.
 *
 * This helper handles its own reference counting of the incoming
 * struct file.
 *
 * Returns newly installed fd or -ve on error.
 */
int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
{
	int new_fd;
	int error;

	error = security_file_receive(file);
	if (error)
		return error;

	new_fd = get_unused_fd_flags(o_flags);
	if (new_fd < 0)
		return new_fd;

	if (ufd) {
		error = put_user(new_fd, ufd);
		if (error) {
			put_unused_fd(new_fd);
			return error;
		}
	}

	fd_install(new_fd, get_file(file));
	__receive_sock(file);
	return new_fd;
}

int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
{
	int error;

	error = security_file_receive(file);
	if (error)
		return error;
	error = replace_fd(new_fd, file, o_flags);
	if (error)
		return error;
	__receive_sock(file);
	return new_fd;
}

int receive_fd(struct file *file, unsigned int o_flags)
{
	return __receive_fd(file, NULL, o_flags);
}
EXPORT_SYMBOL_GPL(receive_fd);

static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
{
	int err = -EBADF;
	struct file *file;
	struct files_struct *files = current->files;

	if ((flags & ~O_CLOEXEC) != 0)
		return -EINVAL;

	if (unlikely(oldfd == newfd))
		return -EINVAL;

	if (newfd >= rlimit(RLIMIT_NOFILE))
		return -EBADF;

	spin_lock(&files->file_lock);
	err = expand_files(files, newfd);
	file = files_lookup_fd_locked(files, oldfd);
	if (unlikely(!file))
		goto Ebadf;
	if (unlikely(err < 0)) {
		if (err == -EMFILE)
			goto Ebadf;
		goto out_unlock;
	}
	return do_dup2(files, file, newfd, flags);

Ebadf:
	err = -EBADF;
out_unlock:
	spin_unlock(&files->file_lock);
	return err;
}

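/*
 * dup3() differs from dup2() in that it takes O_CLOEXEC in flags and
 * fails with -EINVAL when oldfd equals newfd, instead of returning the
 * (valid) descriptor unchanged.
 */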
SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
{
	return ksys_dup3(oldfd, newfd, flags);
}

SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
	if (unlikely(newfd == oldfd)) { /* corner case */
		struct files_struct *files = current->files;
		int retval = oldfd;

		rcu_read_lock();
		if (!files_lookup_fd_rcu(files, oldfd))
			retval = -EBADF;
		rcu_read_unlock();
		return retval;
	}
	return ksys_dup3(oldfd, newfd, 0);
}

SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
	int ret = -EBADF;
	struct file *file = fget_raw(fildes);

	if (file) {
		ret = get_unused_fd_flags(0);
		if (ret >= 0)
			fd_install(ret, file);
		else
			fput(file);
	}
	return ret;
}

int f_dupfd(unsigned int from, struct file *file, unsigned flags)
{
	unsigned long nofile = rlimit(RLIMIT_NOFILE);
	int err;
	if (from >= nofile)
		return -EINVAL;
	err = alloc_fd(from, nofile, flags);
	if (err >= 0) {
		get_file(file);
		fd_install(err, file);
	}
	return err;
}

int iterate_fd(struct files_struct *files, unsigned n,
		int (*f)(const void *, struct file *, unsigned),
		const void *p)
{
	struct fdtable *fdt;
	int res = 0;
	if (!files)
		return 0;
	spin_lock(&files->file_lock);
	for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
		struct file *file;
		file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
		if (!file)
			continue;
		res = f(p, file, n);
		if (res)
			break;
	}
	spin_unlock(&files->file_lock);
	return res;
}
EXPORT_SYMBOL(iterate_fd);