1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0
21da177e4SLinus Torvalds /*
31da177e4SLinus Torvalds * linux/fs/file.c
41da177e4SLinus Torvalds *
51da177e4SLinus Torvalds * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
61da177e4SLinus Torvalds *
71da177e4SLinus Torvalds * Manage the dynamic fd arrays in the process files_struct.
81da177e4SLinus Torvalds */
91da177e4SLinus Torvalds
10fe17f22dSAl Viro #include <linux/syscalls.h>
11630d9c47SPaul Gortmaker #include <linux/export.h>
121da177e4SLinus Torvalds #include <linux/fs.h>
13278a5fbaSChristian Brauner #include <linux/kernel.h>
141da177e4SLinus Torvalds #include <linux/mm.h>
153f07c014SIngo Molnar #include <linux/sched/signal.h>
161da177e4SLinus Torvalds #include <linux/slab.h>
171da177e4SLinus Torvalds #include <linux/file.h>
189f3acc31SAl Viro #include <linux/fdtable.h>
191da177e4SLinus Torvalds #include <linux/bitops.h>
20ab2af1f5SDipankar Sarma #include <linux/spinlock.h>
21ab2af1f5SDipankar Sarma #include <linux/rcupdate.h>
2260997c3dSChristian Brauner #include <linux/close_range.h>
2366590610SKees Cook #include <net/sock.h>
24ab2af1f5SDipankar Sarma
2553dec2eaSJens Axboe #include "internal.h"
2653dec2eaSJens Axboe
279b80a184SAlexey Dobriyan unsigned int sysctl_nr_open __read_mostly = 1024*1024;
289b80a184SAlexey Dobriyan unsigned int sysctl_nr_open_min = BITS_PER_LONG;
29752343beSRasmus Villemoes /* our min() is unusable in constant expressions ;-/ */
30752343beSRasmus Villemoes #define __const_min(x, y) ((x) < (y) ? (x) : (y))
319b80a184SAlexey Dobriyan unsigned int sysctl_nr_open_max =
329b80a184SAlexey Dobriyan __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;
339cfe015aSEric Dumazet
__free_fdtable(struct fdtable * fdt)34a892e2d7SChangli Gao static void __free_fdtable(struct fdtable *fdt)
351da177e4SLinus Torvalds {
36f6c0a192SAl Viro kvfree(fdt->fd);
37f6c0a192SAl Viro kvfree(fdt->open_fds);
38a892e2d7SChangli Gao kfree(fdt);
39ab2af1f5SDipankar Sarma }
40ab2af1f5SDipankar Sarma
free_fdtable_rcu(struct rcu_head * rcu)417cf4dc3cSAl Viro static void free_fdtable_rcu(struct rcu_head *rcu)
42ab2af1f5SDipankar Sarma {
43ac3e3c5bSAl Viro __free_fdtable(container_of(rcu, struct fdtable, rcu));
44ab2af1f5SDipankar Sarma }
45ab2af1f5SDipankar Sarma
/*
 * Sizing of the second-level bitmap (->full_fds_bits), which holds one bit
 * per 'unsigned long' word of the fd bitmap.
 * BITBIT_NR():   number of longs needed for 'nr' descriptors.
 * BITBIT_SIZE(): the same, expressed in bytes.
 */
#define BITBIT_NR(nr)	BITS_TO_LONGS(BITS_TO_LONGS(nr))
#define BITBIT_SIZE(nr)	(BITBIT_NR(nr) * sizeof(long))
48f3f86e33SLinus Torvalds
491da177e4SLinus Torvalds /*
50ea5c58e7SEric Biggers * Copy 'count' fd bits from the old table to the new table and clear the extra
51ea5c58e7SEric Biggers * space if any. This does not copy the file pointers. Called with the files
52ea5c58e7SEric Biggers * spinlock held for write.
53ea5c58e7SEric Biggers */
copy_fd_bitmaps(struct fdtable * nfdt,struct fdtable * ofdt,unsigned int count)54ea5c58e7SEric Biggers static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
55ea5c58e7SEric Biggers unsigned int count)
56ea5c58e7SEric Biggers {
57ea5c58e7SEric Biggers unsigned int cpy, set;
58ea5c58e7SEric Biggers
59ea5c58e7SEric Biggers cpy = count / BITS_PER_BYTE;
60ea5c58e7SEric Biggers set = (nfdt->max_fds - count) / BITS_PER_BYTE;
61ea5c58e7SEric Biggers memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
62ea5c58e7SEric Biggers memset((char *)nfdt->open_fds + cpy, 0, set);
63ea5c58e7SEric Biggers memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
64ea5c58e7SEric Biggers memset((char *)nfdt->close_on_exec + cpy, 0, set);
65ea5c58e7SEric Biggers
66ea5c58e7SEric Biggers cpy = BITBIT_SIZE(count);
67ea5c58e7SEric Biggers set = BITBIT_SIZE(nfdt->max_fds) - cpy;
68ea5c58e7SEric Biggers memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy);
69ea5c58e7SEric Biggers memset((char *)nfdt->full_fds_bits + cpy, 0, set);
70ea5c58e7SEric Biggers }
71ea5c58e7SEric Biggers
72ea5c58e7SEric Biggers /*
73ea5c58e7SEric Biggers * Copy all file descriptors from the old table to the new, expanded table and
74ea5c58e7SEric Biggers * clear the extra space. Called with the files spinlock held for write.
751da177e4SLinus Torvalds */
copy_fdtable(struct fdtable * nfdt,struct fdtable * ofdt)765466b456SVadim Lobanov static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
77ab2af1f5SDipankar Sarma {
784e89b721SAl Viro size_t cpy, set;
791da177e4SLinus Torvalds
805466b456SVadim Lobanov BUG_ON(nfdt->max_fds < ofdt->max_fds);
815466b456SVadim Lobanov
825466b456SVadim Lobanov cpy = ofdt->max_fds * sizeof(struct file *);
835466b456SVadim Lobanov set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
845466b456SVadim Lobanov memcpy(nfdt->fd, ofdt->fd, cpy);
85ea5c58e7SEric Biggers memset((char *)nfdt->fd + cpy, 0, set);
865466b456SVadim Lobanov
87ea5c58e7SEric Biggers copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
881da177e4SLinus Torvalds }
891da177e4SLinus Torvalds
901c24a186SLinus Torvalds /*
911c24a186SLinus Torvalds * Note how the fdtable bitmap allocations very much have to be a multiple of
921c24a186SLinus Torvalds * BITS_PER_LONG. This is not only because we walk those things in chunks of
931c24a186SLinus Torvalds * 'unsigned long' in some places, but simply because that is how the Linux
941c24a186SLinus Torvalds * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
951c24a186SLinus Torvalds * they are very much "bits in an array of unsigned long".
961c24a186SLinus Torvalds *
971c24a186SLinus Torvalds * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
981c24a186SLinus Torvalds * by that "1024/sizeof(ptr)" before, we already know there are sufficient
991c24a186SLinus Torvalds * clear low bits. Clang seems to realize that, gcc ends up being confused.
1001c24a186SLinus Torvalds *
1011c24a186SLinus Torvalds * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
1021c24a186SLinus Torvalds * let's consider it documentation (and maybe a test-case for gcc to improve
1031c24a186SLinus Torvalds * its code generation ;)
1041c24a186SLinus Torvalds */
alloc_fdtable(unsigned int nr)1055466b456SVadim Lobanov static struct fdtable * alloc_fdtable(unsigned int nr)
1061da177e4SLinus Torvalds {
1075466b456SVadim Lobanov struct fdtable *fdt;
1081fd36adcSDavid Howells void *data;
1091da177e4SLinus Torvalds
1105466b456SVadim Lobanov /*
1115466b456SVadim Lobanov * Figure out how many fds we actually want to support in this fdtable.
1125466b456SVadim Lobanov * Allocation steps are keyed to the size of the fdarray, since it
1135466b456SVadim Lobanov * grows far faster than any of the other dynamic data. We try to fit
1145466b456SVadim Lobanov * the fdarray into comfortable page-tuned chunks: starting at 1024B
1155466b456SVadim Lobanov * and growing in powers of two from there on.
1165466b456SVadim Lobanov */
1175466b456SVadim Lobanov nr /= (1024 / sizeof(struct file *));
1185466b456SVadim Lobanov nr = roundup_pow_of_two(nr + 1);
1195466b456SVadim Lobanov nr *= (1024 / sizeof(struct file *));
1201c24a186SLinus Torvalds nr = ALIGN(nr, BITS_PER_LONG);
1215c598b34SAl Viro /*
1225c598b34SAl Viro * Note that this can drive nr *below* what we had passed if sysctl_nr_open
1235c598b34SAl Viro * had been set lower between the check in expand_files() and here. Deal
1245c598b34SAl Viro * with that in caller, it's cheaper that way.
1255c598b34SAl Viro *
1265c598b34SAl Viro * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
1275c598b34SAl Viro * bitmaps handling below becomes unpleasant, to put it mildly...
1285c598b34SAl Viro */
1295c598b34SAl Viro if (unlikely(nr > sysctl_nr_open))
1305c598b34SAl Viro nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
1315466b456SVadim Lobanov
1325d097056SVladimir Davydov fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
133ab2af1f5SDipankar Sarma if (!fdt)
1341da177e4SLinus Torvalds goto out;
1355466b456SVadim Lobanov fdt->max_fds = nr;
136c823bd92SMichal Hocko data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT);
1375466b456SVadim Lobanov if (!data)
1385466b456SVadim Lobanov goto out_fdt;
1391fd36adcSDavid Howells fdt->fd = data;
1401fd36adcSDavid Howells
141c823bd92SMichal Hocko data = kvmalloc(max_t(size_t,
142c823bd92SMichal Hocko 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
143c823bd92SMichal Hocko GFP_KERNEL_ACCOUNT);
1445466b456SVadim Lobanov if (!data)
1455466b456SVadim Lobanov goto out_arr;
1461fd36adcSDavid Howells fdt->open_fds = data;
1475466b456SVadim Lobanov data += nr / BITS_PER_BYTE;
1481fd36adcSDavid Howells fdt->close_on_exec = data;
149f3f86e33SLinus Torvalds data += nr / BITS_PER_BYTE;
150f3f86e33SLinus Torvalds fdt->full_fds_bits = data;
1511da177e4SLinus Torvalds
152ab2af1f5SDipankar Sarma return fdt;
1535466b456SVadim Lobanov
1545466b456SVadim Lobanov out_arr:
155f6c0a192SAl Viro kvfree(fdt->fd);
1565466b456SVadim Lobanov out_fdt:
157ab2af1f5SDipankar Sarma kfree(fdt);
1585466b456SVadim Lobanov out:
159ab2af1f5SDipankar Sarma return NULL;
160ab2af1f5SDipankar Sarma }
161ab2af1f5SDipankar Sarma
162ab2af1f5SDipankar Sarma /*
16374d392aaSVadim Lobanov * Expand the file descriptor table.
16474d392aaSVadim Lobanov * This function will allocate a new fdtable and both fd array and fdset, of
16574d392aaSVadim Lobanov * the given size.
16674d392aaSVadim Lobanov * Return <0 error code on error; 1 on successful completion.
16774d392aaSVadim Lobanov * The files->file_lock should be held on entry, and will be held on exit.
168ab2af1f5SDipankar Sarma */
expand_fdtable(struct files_struct * files,unsigned int nr)1699b80a184SAlexey Dobriyan static int expand_fdtable(struct files_struct *files, unsigned int nr)
170ab2af1f5SDipankar Sarma __releases(files->file_lock)
171ab2af1f5SDipankar Sarma __acquires(files->file_lock)
172ab2af1f5SDipankar Sarma {
17374d392aaSVadim Lobanov struct fdtable *new_fdt, *cur_fdt;
174ab2af1f5SDipankar Sarma
175ab2af1f5SDipankar Sarma spin_unlock(&files->file_lock);
17674d392aaSVadim Lobanov new_fdt = alloc_fdtable(nr);
1778a81252bSEric Dumazet
178d74ba04dSEric W. Biederman /* make sure all fd_install() have seen resize_in_progress
1798a81252bSEric Dumazet * or have finished their rcu_read_lock_sched() section.
1808a81252bSEric Dumazet */
1818a81252bSEric Dumazet if (atomic_read(&files->count) > 1)
182c93ffc15SPaul E. McKenney synchronize_rcu();
1838a81252bSEric Dumazet
1841da177e4SLinus Torvalds spin_lock(&files->file_lock);
18574d392aaSVadim Lobanov if (!new_fdt)
18674d392aaSVadim Lobanov return -ENOMEM;
187ab2af1f5SDipankar Sarma /*
1885c598b34SAl Viro * extremely unlikely race - sysctl_nr_open decreased between the check in
1895c598b34SAl Viro * caller and alloc_fdtable(). Cheaper to catch it here...
1905c598b34SAl Viro */
1915c598b34SAl Viro if (unlikely(new_fdt->max_fds <= nr)) {
192a892e2d7SChangli Gao __free_fdtable(new_fdt);
1935c598b34SAl Viro return -EMFILE;
1945c598b34SAl Viro }
19574d392aaSVadim Lobanov cur_fdt = files_fdtable(files);
1968a81252bSEric Dumazet BUG_ON(nr < cur_fdt->max_fds);
19774d392aaSVadim Lobanov copy_fdtable(new_fdt, cur_fdt);
19874d392aaSVadim Lobanov rcu_assign_pointer(files->fdt, new_fdt);
199ac3e3c5bSAl Viro if (cur_fdt != &files->fdtab)
2001983e781SAl Viro call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
201d74ba04dSEric W. Biederman /* coupled with smp_rmb() in fd_install() */
2028a81252bSEric Dumazet smp_wmb();
20374d392aaSVadim Lobanov return 1;
2041da177e4SLinus Torvalds }
2051da177e4SLinus Torvalds
2061da177e4SLinus Torvalds /*
2071da177e4SLinus Torvalds * Expand files.
20874d392aaSVadim Lobanov * This function will expand the file structures, if the requested size exceeds
20974d392aaSVadim Lobanov * the current capacity and there is room for expansion.
21074d392aaSVadim Lobanov * Return <0 error code on error; 0 when nothing done; 1 when files were
21174d392aaSVadim Lobanov * expanded and execution may have blocked.
21274d392aaSVadim Lobanov * The files->file_lock should be held on entry, and will be held on exit.
2131da177e4SLinus Torvalds */
expand_files(struct files_struct * files,unsigned int nr)2149b80a184SAlexey Dobriyan static int expand_files(struct files_struct *files, unsigned int nr)
2158a81252bSEric Dumazet __releases(files->file_lock)
2168a81252bSEric Dumazet __acquires(files->file_lock)
2171da177e4SLinus Torvalds {
218badf1662SDipankar Sarma struct fdtable *fdt;
2198a81252bSEric Dumazet int expanded = 0;
2201da177e4SLinus Torvalds
2218a81252bSEric Dumazet repeat:
222badf1662SDipankar Sarma fdt = files_fdtable(files);
2234e1e018eSAl Viro
22474d392aaSVadim Lobanov /* Do we need to expand? */
225bbea9f69SVadim Lobanov if (nr < fdt->max_fds)
2268a81252bSEric Dumazet return expanded;
2274e1e018eSAl Viro
22874d392aaSVadim Lobanov /* Can we expand? */
2299cfe015aSEric Dumazet if (nr >= sysctl_nr_open)
23074d392aaSVadim Lobanov return -EMFILE;
23174d392aaSVadim Lobanov
2328a81252bSEric Dumazet if (unlikely(files->resize_in_progress)) {
2338a81252bSEric Dumazet spin_unlock(&files->file_lock);
2348a81252bSEric Dumazet expanded = 1;
2358a81252bSEric Dumazet wait_event(files->resize_wait, !files->resize_in_progress);
2368a81252bSEric Dumazet spin_lock(&files->file_lock);
2378a81252bSEric Dumazet goto repeat;
2388a81252bSEric Dumazet }
2398a81252bSEric Dumazet
24074d392aaSVadim Lobanov /* All good, so we try */
2418a81252bSEric Dumazet files->resize_in_progress = true;
2428a81252bSEric Dumazet expanded = expand_fdtable(files, nr);
2438a81252bSEric Dumazet files->resize_in_progress = false;
2448a81252bSEric Dumazet
2458a81252bSEric Dumazet wake_up_all(&files->resize_wait);
2468a81252bSEric Dumazet return expanded;
2471da177e4SLinus Torvalds }
248ab2af1f5SDipankar Sarma
__set_close_on_exec(unsigned int fd,struct fdtable * fdt)2499b80a184SAlexey Dobriyan static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
250b8318b01SAl Viro {
251b8318b01SAl Viro __set_bit(fd, fdt->close_on_exec);
252b8318b01SAl Viro }
253b8318b01SAl Viro
__clear_close_on_exec(unsigned int fd,struct fdtable * fdt)2549b80a184SAlexey Dobriyan static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
255b8318b01SAl Viro {
256fc90888dSLinus Torvalds if (test_bit(fd, fdt->close_on_exec))
257b8318b01SAl Viro __clear_bit(fd, fdt->close_on_exec);
258b8318b01SAl Viro }
259b8318b01SAl Viro
__set_open_fd(unsigned int fd,struct fdtable * fdt)260f3f86e33SLinus Torvalds static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
261b8318b01SAl Viro {
262b8318b01SAl Viro __set_bit(fd, fdt->open_fds);
263f3f86e33SLinus Torvalds fd /= BITS_PER_LONG;
264f3f86e33SLinus Torvalds if (!~fdt->open_fds[fd])
265f3f86e33SLinus Torvalds __set_bit(fd, fdt->full_fds_bits);
266b8318b01SAl Viro }
267b8318b01SAl Viro
__clear_open_fd(unsigned int fd,struct fdtable * fdt)268f3f86e33SLinus Torvalds static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
269b8318b01SAl Viro {
270b8318b01SAl Viro __clear_bit(fd, fdt->open_fds);
271f3f86e33SLinus Torvalds __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
272b8318b01SAl Viro }
273b8318b01SAl Viro
count_open_files(struct fdtable * fdt)2749b80a184SAlexey Dobriyan static unsigned int count_open_files(struct fdtable *fdt)
27502afc626SAl Viro {
2769b80a184SAlexey Dobriyan unsigned int size = fdt->max_fds;
2779b80a184SAlexey Dobriyan unsigned int i;
27802afc626SAl Viro
27902afc626SAl Viro /* Find the last open fd */
2801fd36adcSDavid Howells for (i = size / BITS_PER_LONG; i > 0; ) {
2811fd36adcSDavid Howells if (fdt->open_fds[--i])
28202afc626SAl Viro break;
28302afc626SAl Viro }
2841fd36adcSDavid Howells i = (i + 1) * BITS_PER_LONG;
28502afc626SAl Viro return i;
28602afc626SAl Viro }
28702afc626SAl Viro
2881c24a186SLinus Torvalds /*
2891c24a186SLinus Torvalds * Note that a sane fdtable size always has to be a multiple of
2901c24a186SLinus Torvalds * BITS_PER_LONG, since we have bitmaps that are sized by this.
2911c24a186SLinus Torvalds *
2921c24a186SLinus Torvalds * 'max_fds' will normally already be properly aligned, but it
2931c24a186SLinus Torvalds * turns out that in the close_range() -> __close_range() ->
2941c24a186SLinus Torvalds * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end
2951c24a186SLinus Torvalds * up having a 'max_fds' value that isn't already aligned.
2961c24a186SLinus Torvalds *
2971c24a186SLinus Torvalds * Rather than make close_range() have to worry about this,
2981c24a186SLinus Torvalds * just make that BITS_PER_LONG alignment be part of a sane
2991c24a186SLinus Torvalds * fdtable size. Becuase that's really what it is.
3001c24a186SLinus Torvalds */
sane_fdtable_size(struct fdtable * fdt,unsigned int max_fds)30160997c3dSChristian Brauner static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
30260997c3dSChristian Brauner {
30360997c3dSChristian Brauner unsigned int count;
30460997c3dSChristian Brauner
30560997c3dSChristian Brauner count = count_open_files(fdt);
30660997c3dSChristian Brauner if (max_fds < NR_OPEN_DEFAULT)
30760997c3dSChristian Brauner max_fds = NR_OPEN_DEFAULT;
308d888c83fSLinus Torvalds return ALIGN(min(count, max_fds), BITS_PER_LONG);
30960997c3dSChristian Brauner }
31060997c3dSChristian Brauner
31102afc626SAl Viro /*
31202afc626SAl Viro * Allocate a new files structure and copy contents from the
31302afc626SAl Viro * passed in files structure.
31402afc626SAl Viro * errorp will be valid only when the returned files_struct is NULL.
31502afc626SAl Viro */
dup_fd(struct files_struct * oldf,unsigned int max_fds,int * errorp)31660997c3dSChristian Brauner struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
31702afc626SAl Viro {
31802afc626SAl Viro struct files_struct *newf;
31902afc626SAl Viro struct file **old_fds, **new_fds;
3209b80a184SAlexey Dobriyan unsigned int open_files, i;
32102afc626SAl Viro struct fdtable *old_fdt, *new_fdt;
32202afc626SAl Viro
32302afc626SAl Viro *errorp = -ENOMEM;
324afbec7ffSAl Viro newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
32502afc626SAl Viro if (!newf)
32602afc626SAl Viro goto out;
32702afc626SAl Viro
328afbec7ffSAl Viro atomic_set(&newf->count, 1);
329afbec7ffSAl Viro
330afbec7ffSAl Viro spin_lock_init(&newf->file_lock);
3318a81252bSEric Dumazet newf->resize_in_progress = false;
3328a81252bSEric Dumazet init_waitqueue_head(&newf->resize_wait);
333afbec7ffSAl Viro newf->next_fd = 0;
334afbec7ffSAl Viro new_fdt = &newf->fdtab;
335afbec7ffSAl Viro new_fdt->max_fds = NR_OPEN_DEFAULT;
3361fd36adcSDavid Howells new_fdt->close_on_exec = newf->close_on_exec_init;
3371fd36adcSDavid Howells new_fdt->open_fds = newf->open_fds_init;
338f3f86e33SLinus Torvalds new_fdt->full_fds_bits = newf->full_fds_bits_init;
339afbec7ffSAl Viro new_fdt->fd = &newf->fd_array[0];
340afbec7ffSAl Viro
34102afc626SAl Viro spin_lock(&oldf->file_lock);
34202afc626SAl Viro old_fdt = files_fdtable(oldf);
34360997c3dSChristian Brauner open_files = sane_fdtable_size(old_fdt, max_fds);
34402afc626SAl Viro
34502afc626SAl Viro /*
34602afc626SAl Viro * Check whether we need to allocate a larger fd array and fd set.
34702afc626SAl Viro */
348adbecb12SAl Viro while (unlikely(open_files > new_fdt->max_fds)) {
34902afc626SAl Viro spin_unlock(&oldf->file_lock);
3509dec3c4dSAl Viro
351a892e2d7SChangli Gao if (new_fdt != &newf->fdtab)
352a892e2d7SChangli Gao __free_fdtable(new_fdt);
353adbecb12SAl Viro
3549dec3c4dSAl Viro new_fdt = alloc_fdtable(open_files - 1);
3559dec3c4dSAl Viro if (!new_fdt) {
3569dec3c4dSAl Viro *errorp = -ENOMEM;
35702afc626SAl Viro goto out_release;
3589dec3c4dSAl Viro }
3599dec3c4dSAl Viro
3609dec3c4dSAl Viro /* beyond sysctl_nr_open; nothing to do */
3619dec3c4dSAl Viro if (unlikely(new_fdt->max_fds < open_files)) {
362a892e2d7SChangli Gao __free_fdtable(new_fdt);
3639dec3c4dSAl Viro *errorp = -EMFILE;
3649dec3c4dSAl Viro goto out_release;
3659dec3c4dSAl Viro }
3669dec3c4dSAl Viro
36702afc626SAl Viro /*
36802afc626SAl Viro * Reacquire the oldf lock and a pointer to its fd table
36902afc626SAl Viro * who knows it may have a new bigger fd table. We need
37002afc626SAl Viro * the latest pointer.
37102afc626SAl Viro */
37202afc626SAl Viro spin_lock(&oldf->file_lock);
37302afc626SAl Viro old_fdt = files_fdtable(oldf);
37460997c3dSChristian Brauner open_files = sane_fdtable_size(old_fdt, max_fds);
37502afc626SAl Viro }
37602afc626SAl Viro
377ea5c58e7SEric Biggers copy_fd_bitmaps(new_fdt, old_fdt, open_files);
378ea5c58e7SEric Biggers
37902afc626SAl Viro old_fds = old_fdt->fd;
38002afc626SAl Viro new_fds = new_fdt->fd;
38102afc626SAl Viro
38202afc626SAl Viro for (i = open_files; i != 0; i--) {
38302afc626SAl Viro struct file *f = *old_fds++;
38402afc626SAl Viro if (f) {
38502afc626SAl Viro get_file(f);
38602afc626SAl Viro } else {
38702afc626SAl Viro /*
38802afc626SAl Viro * The fd may be claimed in the fd bitmap but not yet
38902afc626SAl Viro * instantiated in the files array if a sibling thread
39002afc626SAl Viro * is partway through open(). So make sure that this
39102afc626SAl Viro * fd is available to the new process.
39202afc626SAl Viro */
3931dce27c5SDavid Howells __clear_open_fd(open_files - i, new_fdt);
39402afc626SAl Viro }
39502afc626SAl Viro rcu_assign_pointer(*new_fds++, f);
39602afc626SAl Viro }
39702afc626SAl Viro spin_unlock(&oldf->file_lock);
39802afc626SAl Viro
399ea5c58e7SEric Biggers /* clear the remainder */
400ea5c58e7SEric Biggers memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));
40102afc626SAl Viro
402afbec7ffSAl Viro rcu_assign_pointer(newf->fdt, new_fdt);
403afbec7ffSAl Viro
40402afc626SAl Viro return newf;
40502afc626SAl Viro
40602afc626SAl Viro out_release:
40702afc626SAl Viro kmem_cache_free(files_cachep, newf);
40802afc626SAl Viro out:
40902afc626SAl Viro return NULL;
41002afc626SAl Viro }
41102afc626SAl Viro
close_files(struct files_struct * files)412ce08b62dSOleg Nesterov static struct fdtable *close_files(struct files_struct * files)
4137cf4dc3cSAl Viro {
4147cf4dc3cSAl Viro /*
4157cf4dc3cSAl Viro * It is safe to dereference the fd table without RCU or
4167cf4dc3cSAl Viro * ->file_lock because this is the last reference to the
417ce08b62dSOleg Nesterov * files structure.
4187cf4dc3cSAl Viro */
419ce08b62dSOleg Nesterov struct fdtable *fdt = rcu_dereference_raw(files->fdt);
4209b80a184SAlexey Dobriyan unsigned int i, j = 0;
421ce08b62dSOleg Nesterov
4227cf4dc3cSAl Viro for (;;) {
4237cf4dc3cSAl Viro unsigned long set;
4247cf4dc3cSAl Viro i = j * BITS_PER_LONG;
4257cf4dc3cSAl Viro if (i >= fdt->max_fds)
4267cf4dc3cSAl Viro break;
4277cf4dc3cSAl Viro set = fdt->open_fds[j++];
4287cf4dc3cSAl Viro while (set) {
4297cf4dc3cSAl Viro if (set & 1) {
4307cf4dc3cSAl Viro struct file * file = xchg(&fdt->fd[i], NULL);
4317cf4dc3cSAl Viro if (file) {
4327cf4dc3cSAl Viro filp_close(file, files);
433388a4c88SPaul E. McKenney cond_resched();
4347cf4dc3cSAl Viro }
4357cf4dc3cSAl Viro }
4367cf4dc3cSAl Viro i++;
4377cf4dc3cSAl Viro set >>= 1;
4387cf4dc3cSAl Viro }
4397cf4dc3cSAl Viro }
440ce08b62dSOleg Nesterov
441ce08b62dSOleg Nesterov return fdt;
4427cf4dc3cSAl Viro }
4437cf4dc3cSAl Viro
put_files_struct(struct files_struct * files)4447cf4dc3cSAl Viro void put_files_struct(struct files_struct *files)
4457cf4dc3cSAl Viro {
4467cf4dc3cSAl Viro if (atomic_dec_and_test(&files->count)) {
447ce08b62dSOleg Nesterov struct fdtable *fdt = close_files(files);
448ce08b62dSOleg Nesterov
449b9e02af0SAl Viro /* free the arrays if they are not embedded */
450b9e02af0SAl Viro if (fdt != &files->fdtab)
451b9e02af0SAl Viro __free_fdtable(fdt);
452b9e02af0SAl Viro kmem_cache_free(files_cachep, files);
4537cf4dc3cSAl Viro }
4547cf4dc3cSAl Viro }
4557cf4dc3cSAl Viro
exit_files(struct task_struct * tsk)4567cf4dc3cSAl Viro void exit_files(struct task_struct *tsk)
4577cf4dc3cSAl Viro {
4587cf4dc3cSAl Viro struct files_struct * files = tsk->files;
4597cf4dc3cSAl Viro
4607cf4dc3cSAl Viro if (files) {
4617cf4dc3cSAl Viro task_lock(tsk);
4627cf4dc3cSAl Viro tsk->files = NULL;
4637cf4dc3cSAl Viro task_unlock(tsk);
4647cf4dc3cSAl Viro put_files_struct(files);
4657cf4dc3cSAl Viro }
4667cf4dc3cSAl Viro }
4677cf4dc3cSAl Viro
468f52111b1SAl Viro struct files_struct init_files = {
469f52111b1SAl Viro .count = ATOMIC_INIT(1),
470f52111b1SAl Viro .fdt = &init_files.fdtab,
471f52111b1SAl Viro .fdtab = {
472f52111b1SAl Viro .max_fds = NR_OPEN_DEFAULT,
473f52111b1SAl Viro .fd = &init_files.fd_array[0],
4741fd36adcSDavid Howells .close_on_exec = init_files.close_on_exec_init,
4751fd36adcSDavid Howells .open_fds = init_files.open_fds_init,
476f3f86e33SLinus Torvalds .full_fds_bits = init_files.full_fds_bits_init,
477f52111b1SAl Viro },
478eece09ecSThomas Gleixner .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
4795704a068SShuriyc Chu .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
480f52111b1SAl Viro };
4811027abe8SAl Viro
find_next_fd(struct fdtable * fdt,unsigned int start)4829b80a184SAlexey Dobriyan static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
483f3f86e33SLinus Torvalds {
4849b80a184SAlexey Dobriyan unsigned int maxfd = fdt->max_fds;
4859b80a184SAlexey Dobriyan unsigned int maxbit = maxfd / BITS_PER_LONG;
4869b80a184SAlexey Dobriyan unsigned int bitbit = start / BITS_PER_LONG;
487f3f86e33SLinus Torvalds
488f3f86e33SLinus Torvalds bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
489f3f86e33SLinus Torvalds if (bitbit > maxfd)
490f3f86e33SLinus Torvalds return maxfd;
491f3f86e33SLinus Torvalds if (bitbit > start)
492f3f86e33SLinus Torvalds start = bitbit;
493f3f86e33SLinus Torvalds return find_next_zero_bit(fdt->open_fds, maxfd, start);
494f3f86e33SLinus Torvalds }
495f3f86e33SLinus Torvalds
4961027abe8SAl Viro /*
4971027abe8SAl Viro * allocate a file descriptor, mark it busy.
4981027abe8SAl Viro */
alloc_fd(unsigned start,unsigned end,unsigned flags)499aa384d10SEric W. Biederman static int alloc_fd(unsigned start, unsigned end, unsigned flags)
5001027abe8SAl Viro {
501aa384d10SEric W. Biederman struct files_struct *files = current->files;
5021027abe8SAl Viro unsigned int fd;
5031027abe8SAl Viro int error;
5041027abe8SAl Viro struct fdtable *fdt;
5051027abe8SAl Viro
5061027abe8SAl Viro spin_lock(&files->file_lock);
5071027abe8SAl Viro repeat:
5081027abe8SAl Viro fdt = files_fdtable(files);
5091027abe8SAl Viro fd = start;
5101027abe8SAl Viro if (fd < files->next_fd)
5111027abe8SAl Viro fd = files->next_fd;
5121027abe8SAl Viro
5131027abe8SAl Viro if (fd < fdt->max_fds)
514f3f86e33SLinus Torvalds fd = find_next_fd(fdt, fd);
5151027abe8SAl Viro
516f33ff992SAl Viro /*
517f33ff992SAl Viro * N.B. For clone tasks sharing a files structure, this test
518f33ff992SAl Viro * will limit the total number of files that can be opened.
519f33ff992SAl Viro */
520f33ff992SAl Viro error = -EMFILE;
521f33ff992SAl Viro if (fd >= end)
522f33ff992SAl Viro goto out;
523f33ff992SAl Viro
5241027abe8SAl Viro error = expand_files(files, fd);
5251027abe8SAl Viro if (error < 0)
5261027abe8SAl Viro goto out;
5271027abe8SAl Viro
5281027abe8SAl Viro /*
5291027abe8SAl Viro * If we needed to expand the fs array we
5301027abe8SAl Viro * might have blocked - try again.
5311027abe8SAl Viro */
5321027abe8SAl Viro if (error)
5331027abe8SAl Viro goto repeat;
5341027abe8SAl Viro
5351027abe8SAl Viro if (start <= files->next_fd)
5361027abe8SAl Viro files->next_fd = fd + 1;
5371027abe8SAl Viro
5381dce27c5SDavid Howells __set_open_fd(fd, fdt);
5391027abe8SAl Viro if (flags & O_CLOEXEC)
5401dce27c5SDavid Howells __set_close_on_exec(fd, fdt);
5411027abe8SAl Viro else
5421dce27c5SDavid Howells __clear_close_on_exec(fd, fdt);
5431027abe8SAl Viro error = fd;
5441027abe8SAl Viro #if 1
5451027abe8SAl Viro /* Sanity check */
546add1f099SPaul E. McKenney if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
5471027abe8SAl Viro printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
5481027abe8SAl Viro rcu_assign_pointer(fdt->fd[fd], NULL);
5491027abe8SAl Viro }
5501027abe8SAl Viro #endif
5511027abe8SAl Viro
5521027abe8SAl Viro out:
5531027abe8SAl Viro spin_unlock(&files->file_lock);
5541027abe8SAl Viro return error;
5551027abe8SAl Viro }
5561027abe8SAl Viro
/*
 * Allocate the lowest available descriptor below @nofile for the current
 * task, passing @flags on to alloc_fd().
 */
int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
{
	int fd = alloc_fd(0, nofile, flags);

	return fd;
}
5614022e7afSJens Axboe
get_unused_fd_flags(unsigned flags)5621a7bd226SAl Viro int get_unused_fd_flags(unsigned flags)
5631027abe8SAl Viro {
5644022e7afSJens Axboe return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE));
5651027abe8SAl Viro }
5661a7bd226SAl Viro EXPORT_SYMBOL(get_unused_fd_flags);
56756007caeSAl Viro
__put_unused_fd(struct files_struct * files,unsigned int fd)56856007caeSAl Viro static void __put_unused_fd(struct files_struct *files, unsigned int fd)
56956007caeSAl Viro {
57056007caeSAl Viro struct fdtable *fdt = files_fdtable(files);
57156007caeSAl Viro __clear_open_fd(fd, fdt);
57256007caeSAl Viro if (fd < files->next_fd)
57356007caeSAl Viro files->next_fd = fd;
57456007caeSAl Viro }
57556007caeSAl Viro
put_unused_fd(unsigned int fd)57656007caeSAl Viro void put_unused_fd(unsigned int fd)
57756007caeSAl Viro {
57856007caeSAl Viro struct files_struct *files = current->files;
57956007caeSAl Viro spin_lock(&files->file_lock);
58056007caeSAl Viro __put_unused_fd(files, fd);
58156007caeSAl Viro spin_unlock(&files->file_lock);
58256007caeSAl Viro }
58356007caeSAl Viro
58456007caeSAl Viro EXPORT_SYMBOL(put_unused_fd);
58556007caeSAl Viro
58656007caeSAl Viro /*
58756007caeSAl Viro * Install a file pointer in the fd array.
58856007caeSAl Viro *
58956007caeSAl Viro * The VFS is full of places where we drop the files lock between
59056007caeSAl Viro * setting the open_fds bitmap and installing the file in the file
59156007caeSAl Viro * array. At any such point, we are vulnerable to a dup2() race
59256007caeSAl Viro * installing a file in the array before us. We need to detect this and
59356007caeSAl Viro * fput() the struct file we are about to overwrite in this case.
59456007caeSAl Viro *
59556007caeSAl Viro * It should never happen - if we allow dup2() do it, _really_ bad things
59656007caeSAl Viro * will follow.
597f869e8a7SAl Viro *
598d74ba04dSEric W. Biederman * This consumes the "file" refcount, so callers should treat it
599d74ba04dSEric W. Biederman * as if they had called fput(file).
60056007caeSAl Viro */
60156007caeSAl Viro
fd_install(unsigned int fd,struct file * file)602d74ba04dSEric W. Biederman void fd_install(unsigned int fd, struct file *file)
60356007caeSAl Viro {
604d74ba04dSEric W. Biederman struct files_struct *files = current->files;
60556007caeSAl Viro struct fdtable *fdt;
6068a81252bSEric Dumazet
6078a81252bSEric Dumazet rcu_read_lock_sched();
6088a81252bSEric Dumazet
609c02b1a9bSMateusz Guzik if (unlikely(files->resize_in_progress)) {
6108a81252bSEric Dumazet rcu_read_unlock_sched();
611c02b1a9bSMateusz Guzik spin_lock(&files->file_lock);
612c02b1a9bSMateusz Guzik fdt = files_fdtable(files);
613c02b1a9bSMateusz Guzik BUG_ON(fdt->fd[fd] != NULL);
614c02b1a9bSMateusz Guzik rcu_assign_pointer(fdt->fd[fd], file);
615c02b1a9bSMateusz Guzik spin_unlock(&files->file_lock);
616c02b1a9bSMateusz Guzik return;
6178a81252bSEric Dumazet }
6188a81252bSEric Dumazet /* coupled with smp_wmb() in expand_fdtable() */
6198a81252bSEric Dumazet smp_rmb();
6208a81252bSEric Dumazet fdt = rcu_dereference_sched(files->fdt);
62156007caeSAl Viro BUG_ON(fdt->fd[fd] != NULL);
62256007caeSAl Viro rcu_assign_pointer(fdt->fd[fd], file);
6238a81252bSEric Dumazet rcu_read_unlock_sched();
62456007caeSAl Viro }
62556007caeSAl Viro
62656007caeSAl Viro EXPORT_SYMBOL(fd_install);
6270ee8cdfeSAl Viro
628f49fd6d3SChristian Brauner /**
629f49fd6d3SChristian Brauner * pick_file - return file associatd with fd
630f49fd6d3SChristian Brauner * @files: file struct to retrieve file from
631f49fd6d3SChristian Brauner * @fd: file descriptor to retrieve file for
632f49fd6d3SChristian Brauner *
6336319194eSAl Viro * Context: files_lock must be held.
634f49fd6d3SChristian Brauner *
6356319194eSAl Viro * Returns: The file associated with @fd (NULL if @fd is not open)
636f49fd6d3SChristian Brauner */
pick_file(struct files_struct * files,unsigned fd)637278a5fbaSChristian Brauner static struct file *pick_file(struct files_struct *files, unsigned fd)
638483ce1d4SAl Viro {
6396319194eSAl Viro struct fdtable *fdt = files_fdtable(files);
640f49fd6d3SChristian Brauner struct file *file;
641483ce1d4SAl Viro
6426319194eSAl Viro if (fd >= fdt->max_fds)
6436319194eSAl Viro return NULL;
6446319194eSAl Viro
645609d5444STheodore Ts'o fd = array_index_nospec(fd, fdt->max_fds);
646483ce1d4SAl Viro file = fdt->fd[fd];
6476319194eSAl Viro if (file) {
648483ce1d4SAl Viro rcu_assign_pointer(fdt->fd[fd], NULL);
649483ce1d4SAl Viro __put_unused_fd(files, fd);
6506319194eSAl Viro }
651278a5fbaSChristian Brauner return file;
652278a5fbaSChristian Brauner }
653278a5fbaSChristian Brauner
close_fd(unsigned fd)6548760c909SEric W. Biederman int close_fd(unsigned fd)
655278a5fbaSChristian Brauner {
6568760c909SEric W. Biederman struct files_struct *files = current->files;
657278a5fbaSChristian Brauner struct file *file;
658278a5fbaSChristian Brauner
6596319194eSAl Viro spin_lock(&files->file_lock);
660278a5fbaSChristian Brauner file = pick_file(files, fd);
6616319194eSAl Viro spin_unlock(&files->file_lock);
6626319194eSAl Viro if (!file)
663483ce1d4SAl Viro return -EBADF;
664278a5fbaSChristian Brauner
665278a5fbaSChristian Brauner return filp_close(file, files);
666483ce1d4SAl Viro }
6678760c909SEric W. Biederman EXPORT_SYMBOL(close_fd); /* for ksys_close() */
668483ce1d4SAl Viro
6699b5b8722SChristian Brauner /**
6709b5b8722SChristian Brauner * last_fd - return last valid index into fd table
671*35931eb3SMatthew Wilcox (Oracle) * @fdt: File descriptor table.
6729b5b8722SChristian Brauner *
6739b5b8722SChristian Brauner * Context: Either rcu read lock or files_lock must be held.
6749b5b8722SChristian Brauner *
6759b5b8722SChristian Brauner * Returns: Last valid index into fdtable.
6769b5b8722SChristian Brauner */
last_fd(struct fdtable * fdt)6779b5b8722SChristian Brauner static inline unsigned last_fd(struct fdtable *fdt)
6789b5b8722SChristian Brauner {
6799b5b8722SChristian Brauner return fdt->max_fds - 1;
6809b5b8722SChristian Brauner }
6819b5b8722SChristian Brauner
__range_cloexec(struct files_struct * cur_fds,unsigned int fd,unsigned int max_fd)682582f1fb6SGiuseppe Scrivano static inline void __range_cloexec(struct files_struct *cur_fds,
683582f1fb6SGiuseppe Scrivano unsigned int fd, unsigned int max_fd)
684582f1fb6SGiuseppe Scrivano {
685582f1fb6SGiuseppe Scrivano struct fdtable *fdt;
686582f1fb6SGiuseppe Scrivano
6879b5b8722SChristian Brauner /* make sure we're using the correct maximum value */
688582f1fb6SGiuseppe Scrivano spin_lock(&cur_fds->file_lock);
689582f1fb6SGiuseppe Scrivano fdt = files_fdtable(cur_fds);
6909b5b8722SChristian Brauner max_fd = min(last_fd(fdt), max_fd);
6919b5b8722SChristian Brauner if (fd <= max_fd)
692582f1fb6SGiuseppe Scrivano bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);
693582f1fb6SGiuseppe Scrivano spin_unlock(&cur_fds->file_lock);
694582f1fb6SGiuseppe Scrivano }
695582f1fb6SGiuseppe Scrivano
__range_close(struct files_struct * files,unsigned int fd,unsigned int max_fd)696ed192c59SMateusz Guzik static inline void __range_close(struct files_struct *files, unsigned int fd,
697582f1fb6SGiuseppe Scrivano unsigned int max_fd)
698582f1fb6SGiuseppe Scrivano {
699ed192c59SMateusz Guzik struct file *file;
7006319194eSAl Viro unsigned n;
7016319194eSAl Viro
702ed192c59SMateusz Guzik spin_lock(&files->file_lock);
703ed192c59SMateusz Guzik n = last_fd(files_fdtable(files));
7046319194eSAl Viro max_fd = min(max_fd, n);
7056319194eSAl Viro
706ed192c59SMateusz Guzik for (; fd <= max_fd; fd++) {
707ed192c59SMateusz Guzik file = pick_file(files, fd);
7086319194eSAl Viro if (file) {
709ed192c59SMateusz Guzik spin_unlock(&files->file_lock);
710ed192c59SMateusz Guzik filp_close(file, files);
711582f1fb6SGiuseppe Scrivano cond_resched();
712ed192c59SMateusz Guzik spin_lock(&files->file_lock);
713ed192c59SMateusz Guzik } else if (need_resched()) {
714ed192c59SMateusz Guzik spin_unlock(&files->file_lock);
715ed192c59SMateusz Guzik cond_resched();
716ed192c59SMateusz Guzik spin_lock(&files->file_lock);
717f49fd6d3SChristian Brauner }
718582f1fb6SGiuseppe Scrivano }
719ed192c59SMateusz Guzik spin_unlock(&files->file_lock);
720582f1fb6SGiuseppe Scrivano }
721582f1fb6SGiuseppe Scrivano
722278a5fbaSChristian Brauner /**
723278a5fbaSChristian Brauner * __close_range() - Close all file descriptors in a given range.
724278a5fbaSChristian Brauner *
725278a5fbaSChristian Brauner * @fd: starting file descriptor to close
726278a5fbaSChristian Brauner * @max_fd: last file descriptor to close
727*35931eb3SMatthew Wilcox (Oracle) * @flags: CLOSE_RANGE flags.
728278a5fbaSChristian Brauner *
729278a5fbaSChristian Brauner * This closes a range of file descriptors. All file descriptors
730278a5fbaSChristian Brauner * from @fd up to and including @max_fd are closed.
731278a5fbaSChristian Brauner */
__close_range(unsigned fd,unsigned max_fd,unsigned int flags)73260997c3dSChristian Brauner int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
733278a5fbaSChristian Brauner {
73460997c3dSChristian Brauner struct task_struct *me = current;
73560997c3dSChristian Brauner struct files_struct *cur_fds = me->files, *fds = NULL;
73660997c3dSChristian Brauner
737582f1fb6SGiuseppe Scrivano if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
73860997c3dSChristian Brauner return -EINVAL;
739278a5fbaSChristian Brauner
740278a5fbaSChristian Brauner if (fd > max_fd)
741278a5fbaSChristian Brauner return -EINVAL;
742278a5fbaSChristian Brauner
74360997c3dSChristian Brauner if (flags & CLOSE_RANGE_UNSHARE) {
74460997c3dSChristian Brauner int ret;
74560997c3dSChristian Brauner unsigned int max_unshare_fds = NR_OPEN_MAX;
74660997c3dSChristian Brauner
74760997c3dSChristian Brauner /*
74803ba0fe4SChristian Brauner * If the caller requested all fds to be made cloexec we always
74903ba0fe4SChristian Brauner * copy all of the file descriptors since they still want to
75003ba0fe4SChristian Brauner * use them.
75160997c3dSChristian Brauner */
75203ba0fe4SChristian Brauner if (!(flags & CLOSE_RANGE_CLOEXEC)) {
75303ba0fe4SChristian Brauner /*
75403ba0fe4SChristian Brauner * If the requested range is greater than the current
75503ba0fe4SChristian Brauner * maximum, we're closing everything so only copy all
75603ba0fe4SChristian Brauner * file descriptors beneath the lowest file descriptor.
75703ba0fe4SChristian Brauner */
75803ba0fe4SChristian Brauner rcu_read_lock();
75903ba0fe4SChristian Brauner if (max_fd >= last_fd(files_fdtable(cur_fds)))
76060997c3dSChristian Brauner max_unshare_fds = fd;
76103ba0fe4SChristian Brauner rcu_read_unlock();
76203ba0fe4SChristian Brauner }
76360997c3dSChristian Brauner
76460997c3dSChristian Brauner ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
76560997c3dSChristian Brauner if (ret)
76660997c3dSChristian Brauner return ret;
76760997c3dSChristian Brauner
76860997c3dSChristian Brauner /*
76960997c3dSChristian Brauner * We used to share our file descriptor table, and have now
77060997c3dSChristian Brauner * created a private one, make sure we're using it below.
77160997c3dSChristian Brauner */
77260997c3dSChristian Brauner if (fds)
77360997c3dSChristian Brauner swap(cur_fds, fds);
77460997c3dSChristian Brauner }
77560997c3dSChristian Brauner
776582f1fb6SGiuseppe Scrivano if (flags & CLOSE_RANGE_CLOEXEC)
777582f1fb6SGiuseppe Scrivano __range_cloexec(cur_fds, fd, max_fd);
778582f1fb6SGiuseppe Scrivano else
779582f1fb6SGiuseppe Scrivano __range_close(cur_fds, fd, max_fd);
780278a5fbaSChristian Brauner
78160997c3dSChristian Brauner if (fds) {
78260997c3dSChristian Brauner /*
78360997c3dSChristian Brauner * We're done closing the files we were supposed to. Time to install
78460997c3dSChristian Brauner * the new file descriptor table and drop the old one.
78560997c3dSChristian Brauner */
78660997c3dSChristian Brauner task_lock(me);
78760997c3dSChristian Brauner me->files = cur_fds;
78860997c3dSChristian Brauner task_unlock(me);
78960997c3dSChristian Brauner put_files_struct(fds);
79060997c3dSChristian Brauner }
79160997c3dSChristian Brauner
792278a5fbaSChristian Brauner return 0;
793278a5fbaSChristian Brauner }
794278a5fbaSChristian Brauner
79580cd7956STodd Kjos /*
79653dec2eaSJens Axboe * See close_fd_get_file() below, this variant assumes current->files->file_lock
79753dec2eaSJens Axboe * is held.
79853dec2eaSJens Axboe */
__close_fd_get_file(unsigned int fd)7996319194eSAl Viro struct file *__close_fd_get_file(unsigned int fd)
80053dec2eaSJens Axboe {
8016319194eSAl Viro return pick_file(current->files, fd);
80253dec2eaSJens Axboe }
80353dec2eaSJens Axboe
80453dec2eaSJens Axboe /*
8059fe83c43SEric W. Biederman * variant of close_fd that gets a ref on the file for later fput.
80640a19260SAl Viro * The caller must ensure that filp_close() is called on the file.
80780cd7956STodd Kjos */
close_fd_get_file(unsigned int fd)8086319194eSAl Viro struct file *close_fd_get_file(unsigned int fd)
80980cd7956STodd Kjos {
81080cd7956STodd Kjos struct files_struct *files = current->files;
8116319194eSAl Viro struct file *file;
81280cd7956STodd Kjos
81380cd7956STodd Kjos spin_lock(&files->file_lock);
8146319194eSAl Viro file = pick_file(files, fd);
81580cd7956STodd Kjos spin_unlock(&files->file_lock);
81680cd7956STodd Kjos
8176319194eSAl Viro return file;
81880cd7956STodd Kjos }
81980cd7956STodd Kjos
/*
 * Close every descriptor in @files whose close-on-exec bit is set.
 *
 * The close_on_exec bitmap is walked one unsigned long word at a time;
 * each non-zero word is cleared in the table and then its saved copy is
 * scanned bit by bit.  file_lock is held except around filp_close() and
 * cond_resched().
 */
do_close_on_exec(struct files_struct * files)8206a6d27deSAl Viro void do_close_on_exec(struct files_struct *files)
8216a6d27deSAl Viro {
8226a6d27deSAl Viro unsigned i;
8236a6d27deSAl Viro struct fdtable *fdt;
8246a6d27deSAl Viro
8256a6d27deSAl Viro /* exec unshares first */
8266a6d27deSAl Viro spin_lock(&files->file_lock);
8276a6d27deSAl Viro for (i = 0; ; i++) {
8286a6d27deSAl Viro unsigned long set;
/* First descriptor number covered by bitmap word i. */
8296a6d27deSAl Viro unsigned fd = i * BITS_PER_LONG;
/* The table pointer is re-read at the start of every word. */
8306a6d27deSAl Viro fdt = files_fdtable(files);
8316a6d27deSAl Viro if (fd >= fdt->max_fds)
8326a6d27deSAl Viro break;
8336a6d27deSAl Viro set = fdt->close_on_exec[i];
8346a6d27deSAl Viro if (!set)
8356a6d27deSAl Viro continue;
/* Clear the whole word up front; 'set' keeps the bits still to process. */
8366a6d27deSAl Viro fdt->close_on_exec[i] = 0;
8376a6d27deSAl Viro for ( ; set ; fd++, set >>= 1) {
8386a6d27deSAl Viro struct file *file;
8396a6d27deSAl Viro if (!(set & 1))
8406a6d27deSAl Viro continue;
8416a6d27deSAl Viro file = fdt->fd[fd];
8426a6d27deSAl Viro if (!file)
8436a6d27deSAl Viro continue;
/* Empty the slot and release the descriptor before dropping the lock. */
8446a6d27deSAl Viro rcu_assign_pointer(fdt->fd[fd], NULL);
8456a6d27deSAl Viro __put_unused_fd(files, fd);
8466a6d27deSAl Viro spin_unlock(&files->file_lock);
8476a6d27deSAl Viro filp_close(file, files);
8486a6d27deSAl Viro cond_resched();
8496a6d27deSAl Viro spin_lock(&files->file_lock);
8506a6d27deSAl Viro }
8516a6d27deSAl Viro
8526a6d27deSAl Viro }
8536a6d27deSAl Viro spin_unlock(&files->file_lock);
8546a6d27deSAl Viro }
8556a6d27deSAl Viro
/*
 * Lockless lookup of @fd in @files.
 *
 * Returns the file with a reference taken through get_file_rcu(), or
 * NULL when @fd is not below max_fds, the slot is empty, or the file's
 * f_mode has any bit of @mask set.  The lookup is retried when the
 * reference cannot be taken or when the table or the slot changed while
 * the reference was being acquired.
 */
__fget_files_rcu(struct files_struct * files,unsigned int fd,fmode_t mask)856e386dfc5SLinus Torvalds static inline struct file *__fget_files_rcu(struct files_struct *files,
85781132a39SGou Hao unsigned int fd, fmode_t mask)
858e386dfc5SLinus Torvalds {
859e386dfc5SLinus Torvalds for (;;) {
860e386dfc5SLinus Torvalds struct file *file;
861e386dfc5SLinus Torvalds struct fdtable *fdt = rcu_dereference_raw(files->fdt);
862e386dfc5SLinus Torvalds struct file __rcu **fdentry;
863e386dfc5SLinus Torvalds
864e386dfc5SLinus Torvalds if (unlikely(fd >= fdt->max_fds))
865e386dfc5SLinus Torvalds return NULL;
866e386dfc5SLinus Torvalds
/* Remember the slot address so it can be re-read after taking the ref. */
867e386dfc5SLinus Torvalds fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
868e386dfc5SLinus Torvalds file = rcu_dereference_raw(*fdentry);
869e386dfc5SLinus Torvalds if (unlikely(!file))
870e386dfc5SLinus Torvalds return NULL;
871e386dfc5SLinus Torvalds
/* Files with any @mask bit set in f_mode are reported as absent. */
872e386dfc5SLinus Torvalds if (unlikely(file->f_mode & mask))
873e386dfc5SLinus Torvalds return NULL;
874e386dfc5SLinus Torvalds
875e386dfc5SLinus Torvalds /*
876e386dfc5SLinus Torvalds * Ok, we have a file pointer. However, because we do
877e386dfc5SLinus Torvalds * this all locklessly under RCU, we may be racing with
878e386dfc5SLinus Torvalds * that file being closed.
879e386dfc5SLinus Torvalds *
880e386dfc5SLinus Torvalds * Such a race can take two forms:
881e386dfc5SLinus Torvalds *
882e386dfc5SLinus Torvalds * (a) the file ref already went down to zero,
88381132a39SGou Hao * and get_file_rcu() fails. Just try again:
884e386dfc5SLinus Torvalds */
88581132a39SGou Hao if (unlikely(!get_file_rcu(file)))
886e386dfc5SLinus Torvalds continue;
887e386dfc5SLinus Torvalds
888e386dfc5SLinus Torvalds /*
889e386dfc5SLinus Torvalds * (b) the file table entry has changed under us.
890e386dfc5SLinus Torvalds * Note that we don't need to re-check the 'fdt->fd'
891e386dfc5SLinus Torvalds * pointer having changed, because it always goes
892e386dfc5SLinus Torvalds * hand-in-hand with 'fdt'.
893e386dfc5SLinus Torvalds *
89481132a39SGou Hao * If so, we need to put our ref and try again.
895e386dfc5SLinus Torvalds */
896e386dfc5SLinus Torvalds if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
897e386dfc5SLinus Torvalds unlikely(rcu_dereference_raw(*fdentry) != file)) {
89881132a39SGou Hao fput(file);
899e386dfc5SLinus Torvalds continue;
900e386dfc5SLinus Torvalds }
901e386dfc5SLinus Torvalds
902e386dfc5SLinus Torvalds /*
903e386dfc5SLinus Torvalds * Ok, we have a ref to the file, and checked that it
904e386dfc5SLinus Torvalds * still exists.
905e386dfc5SLinus Torvalds */
906e386dfc5SLinus Torvalds return file;
907e386dfc5SLinus Torvalds }
908e386dfc5SLinus Torvalds }
909e386dfc5SLinus Torvalds
/*
 * Look up @fd in @files and return the file with a reference held, or
 * NULL.  Thin wrapper that supplies the RCU read-side section around
 * __fget_files_rcu().
 */
static struct file *__fget_files(struct files_struct *files, unsigned int fd,
				 fmode_t mask)
{
	struct file *found;

	rcu_read_lock();
	found = __fget_files_rcu(files, fd, mask);
	rcu_read_unlock();

	return found;
}
9210ee8cdfeSAl Viro
/* __fget_files() applied to the fd table of the current task. */
static inline struct file *__fget(unsigned int fd, fmode_t mask)
{
	struct files_struct *files = current->files;

	return __fget_files(files, fd, mask);
}
926091141a4SJens Axboe
fget(unsigned int fd)9271deb46e2SOleg Nesterov struct file *fget(unsigned int fd)
9281deb46e2SOleg Nesterov {
92981132a39SGou Hao return __fget(fd, FMODE_PATH);
9301deb46e2SOleg Nesterov }
9310ee8cdfeSAl Viro EXPORT_SYMBOL(fget);
9320ee8cdfeSAl Viro
fget_raw(unsigned int fd)9330ee8cdfeSAl Viro struct file *fget_raw(unsigned int fd)
9340ee8cdfeSAl Viro {
93581132a39SGou Hao return __fget(fd, 0);
9360ee8cdfeSAl Viro }
9370ee8cdfeSAl Viro EXPORT_SYMBOL(fget_raw);
9380ee8cdfeSAl Viro
fget_task(struct task_struct * task,unsigned int fd)9395e876fb4SSargun Dhillon struct file *fget_task(struct task_struct *task, unsigned int fd)
9405e876fb4SSargun Dhillon {
9415e876fb4SSargun Dhillon struct file *file = NULL;
9425e876fb4SSargun Dhillon
9435e876fb4SSargun Dhillon task_lock(task);
9445e876fb4SSargun Dhillon if (task->files)
94581132a39SGou Hao file = __fget_files(task->files, fd, 0);
9465e876fb4SSargun Dhillon task_unlock(task);
9475e876fb4SSargun Dhillon
9485e876fb4SSargun Dhillon return file;
9495e876fb4SSargun Dhillon }
9505e876fb4SSargun Dhillon
task_lookup_fd_rcu(struct task_struct * task,unsigned int fd)9513a879fb3SEric W. Biederman struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
9523a879fb3SEric W. Biederman {
9533a879fb3SEric W. Biederman /* Must be called with rcu_read_lock held */
9543a879fb3SEric W. Biederman struct files_struct *files;
9553a879fb3SEric W. Biederman struct file *file = NULL;
9563a879fb3SEric W. Biederman
9573a879fb3SEric W. Biederman task_lock(task);
9583a879fb3SEric W. Biederman files = task->files;
9593a879fb3SEric W. Biederman if (files)
9603a879fb3SEric W. Biederman file = files_lookup_fd_rcu(files, fd);
9613a879fb3SEric W. Biederman task_unlock(task);
9623a879fb3SEric W. Biederman
9633a879fb3SEric W. Biederman return file;
9643a879fb3SEric W. Biederman }
9653a879fb3SEric W. Biederman
task_lookup_next_fd_rcu(struct task_struct * task,unsigned int * ret_fd)966e9a53aebSEric W. Biederman struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd)
967e9a53aebSEric W. Biederman {
968e9a53aebSEric W. Biederman /* Must be called with rcu_read_lock held */
969e9a53aebSEric W. Biederman struct files_struct *files;
970e9a53aebSEric W. Biederman unsigned int fd = *ret_fd;
971e9a53aebSEric W. Biederman struct file *file = NULL;
972e9a53aebSEric W. Biederman
973e9a53aebSEric W. Biederman task_lock(task);
974e9a53aebSEric W. Biederman files = task->files;
975e9a53aebSEric W. Biederman if (files) {
976e9a53aebSEric W. Biederman for (; fd < files_fdtable(files)->max_fds; fd++) {
977e9a53aebSEric W. Biederman file = files_lookup_fd_rcu(files, fd);
978e9a53aebSEric W. Biederman if (file)
979e9a53aebSEric W. Biederman break;
980e9a53aebSEric W. Biederman }
981e9a53aebSEric W. Biederman }
982e9a53aebSEric W. Biederman task_unlock(task);
983e9a53aebSEric W. Biederman *ret_fd = fd;
984e9a53aebSEric W. Biederman return file;
985e9a53aebSEric W. Biederman }
9864480c27cSAndreas Gruenbacher EXPORT_SYMBOL(task_lookup_next_fd_rcu);
987e9a53aebSEric W. Biederman
9880ee8cdfeSAl Viro /*
9890ee8cdfeSAl Viro * Lightweight file lookup - no refcnt increment if fd table isn't shared.
9900ee8cdfeSAl Viro *
9910ee8cdfeSAl Viro * You can use this instead of fget if you satisfy all of the following
9920ee8cdfeSAl Viro * conditions:
9930ee8cdfeSAl Viro * 1) You must call fput_light before exiting the syscall and returning control
9940ee8cdfeSAl Viro * to userspace (i.e. you cannot remember the returned struct file * after
9950ee8cdfeSAl Viro * returning to userspace).
9960ee8cdfeSAl Viro * 2) You must not call filp_close on the returned struct file * in between
9970ee8cdfeSAl Viro * calls to fget_light and fput_light.
9980ee8cdfeSAl Viro * 3) You must not clone the current task in between the calls to fget_light
9990ee8cdfeSAl Viro * and fput_light.
10000ee8cdfeSAl Viro *
10010ee8cdfeSAl Viro * The fput_needed flag returned by fget_light should be passed to the
10020ee8cdfeSAl Viro * corresponding fput_light.
10030ee8cdfeSAl Viro */
static unsigned long __fget_light(unsigned int fd, fmode_t mask)
{
	struct files_struct *files = current->files;
	struct file *file;

	/*
	 * The acquire load of files->count pairs with the
	 * atomic_dec_and_test() in put_files_struct().  Without it, a
	 * concurrent close_fd() followed by put_files_struct() in another
	 * thread could let us see the old table entry together with the
	 * new refcount, and we would return a file that is being freed.
	 */
	if (atomic_read_acquire(&files->count) != 1) {
		/* Any other count: take a real reference and flag it. */
		file = __fget(fd, mask);
		return file ? (FDPUT_FPUT | (unsigned long)file) : 0;
	}

	/* Count is exactly 1: borrow the pointer, no refcount increment. */
	file = files_lookup_fd_raw(files, fd);
	if (!file || unlikely(file->f_mode & mask))
		return 0;

	return (unsigned long)file;
}
__fdget(unsigned int fd)1030bd2a31d5SAl Viro unsigned long __fdget(unsigned int fd)
1031bd2a31d5SAl Viro {
1032bd2a31d5SAl Viro return __fget_light(fd, FMODE_PATH);
1033bd2a31d5SAl Viro }
1034bd2a31d5SAl Viro EXPORT_SYMBOL(__fdget);
1035bd2a31d5SAl Viro
__fdget_raw(unsigned int fd)1036bd2a31d5SAl Viro unsigned long __fdget_raw(unsigned int fd)
1037bd2a31d5SAl Viro {
1038bd2a31d5SAl Viro return __fget_light(fd, 0);
10390ee8cdfeSAl Viro }
10400ee8cdfeSAl Viro
1041bd2a31d5SAl Viro /*
1042ad461834SOleg Nesterov * Try to avoid f_pos locking. We only need it if the
104399aea681SEric Biggers * file is marked for FMODE_ATOMIC_POS, and it can be
104499aea681SEric Biggers * accessed multiple ways.
10450ee8cdfeSAl Viro *
10462be7d348SLinus Torvalds * Always do it for directories, because pidfd_getfd()
1047bd2a31d5SAl Viro * can make a file accessible even if it otherwise would
1048bd2a31d5SAl Viro * not be, and for directories this is a correctness
1049bd2a31d5SAl Viro * issue, not a "POSIX requirement".
1050bd2a31d5SAl Viro */
file_needs_f_pos_lock(struct file * file)1051bd2a31d5SAl Viro static inline bool file_needs_f_pos_lock(struct file *file)
105299aea681SEric Biggers {
1053bd2a31d5SAl Viro return (file->f_mode & FMODE_ATOMIC_POS) &&
1054bd2a31d5SAl Viro (file_count(file) > 1 || file->f_op->iterate_shared);
105563b6df14SAl Viro }
105663b6df14SAl Viro
__fdget_pos(unsigned int fd)105763b6df14SAl Viro unsigned long __fdget_pos(unsigned int fd)
105863b6df14SAl Viro {
105963b6df14SAl Viro unsigned long v = __fdget(fd);
1060bd2a31d5SAl Viro struct file *file = (struct file *)(v & ~3);
1061bd2a31d5SAl Viro
1062bd2a31d5SAl Viro if (file && file_needs_f_pos_lock(file)) {
1063bd2a31d5SAl Viro v |= FDPUT_POS_UNLOCK;
1064bd2a31d5SAl Viro mutex_lock(&file->f_pos_lock);
1065fe17f22dSAl Viro }
1066fe17f22dSAl Viro return v;
1067fe17f22dSAl Viro }
1068fe17f22dSAl Viro
__f_unlock_pos(struct file * f)1069fe17f22dSAl Viro void __f_unlock_pos(struct file *f)
1070fe17f22dSAl Viro {
1071fe17f22dSAl Viro mutex_unlock(&f->f_pos_lock);
1072fe17f22dSAl Viro }
1073fe17f22dSAl Viro
1074fe17f22dSAl Viro /*
1075fe17f22dSAl Viro * We only lock f_pos if we have threads or if the file might be
1076fe17f22dSAl Viro * shared with another process. In both cases we'll have an elevated
1077fe17f22dSAl Viro * file count (done either by fdget() or by fork()).
1078fe17f22dSAl Viro */
1079fe17f22dSAl Viro
set_close_on_exec(unsigned int fd,int flag)1080fe17f22dSAl Viro void set_close_on_exec(unsigned int fd, int flag)
1081fe17f22dSAl Viro {
1082fe17f22dSAl Viro struct files_struct *files = current->files;
1083fe17f22dSAl Viro struct fdtable *fdt;
1084fe17f22dSAl Viro spin_lock(&files->file_lock);
1085fe17f22dSAl Viro fdt = files_fdtable(files);
1086fe17f22dSAl Viro if (flag)
1087fe17f22dSAl Viro __set_close_on_exec(fd, fdt);
1088fe17f22dSAl Viro else
1089fe17f22dSAl Viro __clear_close_on_exec(fd, fdt);
1090fe17f22dSAl Viro spin_unlock(&files->file_lock);
10918280d161SAl Viro }
10928280d161SAl Viro
get_close_on_exec(unsigned int fd)1093e983094dSAl Viro bool get_close_on_exec(unsigned int fd)
10948280d161SAl Viro {
10958280d161SAl Viro struct files_struct *files = current->files;
10968280d161SAl Viro struct fdtable *fdt;
10978280d161SAl Viro bool res;
10988280d161SAl Viro rcu_read_lock();
10998280d161SAl Viro fdt = files_fdtable(files);
11008280d161SAl Viro res = close_on_exec(fd, fdt);
11018280d161SAl Viro rcu_read_unlock();
11028280d161SAl Viro return res;
11038280d161SAl Viro }
11048280d161SAl Viro
do_dup2(struct files_struct * files,struct file * file,unsigned fd,unsigned flags)11058280d161SAl Viro static int do_dup2(struct files_struct *files,
11068280d161SAl Viro struct file *file, unsigned fd, unsigned flags)
11078280d161SAl Viro __releases(&files->file_lock)
11088280d161SAl Viro {
11098280d161SAl Viro struct file *tofree;
11108280d161SAl Viro struct fdtable *fdt;
11118280d161SAl Viro
11128280d161SAl Viro /*
11138280d161SAl Viro * We need to detect attempts to do dup2() over allocated but still
11148280d161SAl Viro * not finished descriptor. NB: OpenBSD avoids that at the price of
11158280d161SAl Viro * extra work in their equivalent of fget() - they insert struct
11168280d161SAl Viro * file immediately after grabbing descriptor, mark it larval if
11178280d161SAl Viro * more work (e.g. actual opening) is needed and make sure that
11188280d161SAl Viro * fget() treats larval files as absent. Potentially interesting,
11198280d161SAl Viro * but while extra work in fget() is trivial, locking implications
11208280d161SAl Viro * and amount of surgery on open()-related paths in VFS are not.
11218280d161SAl Viro * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
11228280d161SAl Viro * deadlocks in rather amusing ways, AFAICS. All of that is out of
11238280d161SAl Viro * scope of POSIX or SUS, since neither considers shared descriptor
11248280d161SAl Viro * tables and this condition does not arise without those.
11258280d161SAl Viro */
11268280d161SAl Viro fdt = files_fdtable(files);
11278280d161SAl Viro tofree = fdt->fd[fd];
11288280d161SAl Viro if (!tofree && fd_is_open(fd, fdt))
11298280d161SAl Viro goto Ebusy;
11308280d161SAl Viro get_file(file);
11318280d161SAl Viro rcu_assign_pointer(fdt->fd[fd], file);
11328280d161SAl Viro __set_open_fd(fd, fdt);
11338280d161SAl Viro if (flags & O_CLOEXEC)
11348280d161SAl Viro __set_close_on_exec(fd, fdt);
11358280d161SAl Viro else
11368280d161SAl Viro __clear_close_on_exec(fd, fdt);
11378280d161SAl Viro spin_unlock(&files->file_lock);
11388280d161SAl Viro
11398280d161SAl Viro if (tofree)
11408280d161SAl Viro filp_close(tofree, files);
11418760c909SEric W. Biederman
11428280d161SAl Viro return fd;
11438280d161SAl Viro
114408f05c49SAl Viro Ebusy:
11458280d161SAl Viro spin_unlock(&files->file_lock);
11468280d161SAl Viro return -EBUSY;
11478280d161SAl Viro }
11488280d161SAl Viro
replace_fd(unsigned fd,struct file * file,unsigned flags)11498280d161SAl Viro int replace_fd(unsigned fd, struct file *file, unsigned flags)
11508280d161SAl Viro {
11518280d161SAl Viro int err;
11528280d161SAl Viro struct files_struct *files = current->files;
11538280d161SAl Viro
11548280d161SAl Viro if (!file)
11558280d161SAl Viro return close_fd(fd);
11568280d161SAl Viro
115766590610SKees Cook if (fd >= rlimit(RLIMIT_NOFILE))
115866590610SKees Cook return -EBADF;
115966590610SKees Cook
116066590610SKees Cook spin_lock(&files->file_lock);
116166590610SKees Cook err = expand_files(files, fd);
116266590610SKees Cook if (unlikely(err < 0))
116366590610SKees Cook goto out_unlock;
1164deefa7f3SKees Cook return do_dup2(files, file, fd, flags);
1165deefa7f3SKees Cook
116666590610SKees Cook out_unlock:
116766590610SKees Cook spin_unlock(&files->file_lock);
116866590610SKees Cook return err;
116966590610SKees Cook }
1170deefa7f3SKees Cook
117166590610SKees Cook /**
117242eb0d54SChristoph Hellwig * __receive_fd() - Install received file into file descriptor table
117366590610SKees Cook * @file: struct file that was received from another process
117466590610SKees Cook * @ufd: __user pointer to write new fd number to
117566590610SKees Cook * @o_flags: the O_* flags to apply to the new fd entry
117666590610SKees Cook *
117766590610SKees Cook * Installs a received file into the file descriptor table, with appropriate
117866590610SKees Cook * checks and count updates. Optionally writes the fd number to userspace, if
117966590610SKees Cook * @ufd is non-NULL.
118066590610SKees Cook *
118166590610SKees Cook * This helper handles its own reference counting of the incoming
118266590610SKees Cook * struct file.
118366590610SKees Cook *
118466590610SKees Cook * Returns newly installed fd or -ve on error.
1185deefa7f3SKees Cook */
int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
{
	int fd;
	int err;

	err = security_file_receive(file);
	if (err)
		return err;

	fd = get_unused_fd_flags(o_flags);
	if (fd < 0)
		return fd;

	/*
	 * Report the number to userspace before the file is installed, so
	 * that a failed copy only has to give the unused descriptor back.
	 */
	err = ufd ? put_user(fd, ufd) : 0;
	if (err) {
		put_unused_fd(fd);
		return err;
	}

	/* The table gets its own reference; the caller keeps theirs. */
	fd_install(fd, get_file(file));
	__receive_sock(file);
	return fd;
}
121166590610SKees Cook
/**
 * receive_fd_replace() - Install a received file at a given descriptor
 * @new_fd: descriptor number that should refer to @file afterwards
 * @file: struct file that was received
 * @o_flags: the O_* flags to apply to the descriptor
 *
 * Runs the security_file_receive() check, places @file at @new_fd with
 * replace_fd() and then calls __receive_sock() on it.
 *
 * replace_fd() hands back the return value of do_dup2(), which is the
 * descriptor number on success, so only a negative value is a failure.
 * Treating every non-zero value as an error would return early for any
 * @new_fd other than 0 and skip __receive_sock().
 *
 * Returns @new_fd on success or a negative error code.
 */
int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
{
	int error;

	error = security_file_receive(file);
	if (error)
		return error;

	error = replace_fd(new_fd, file, o_flags);
	if (error < 0)
		return error;

	__receive_sock(file);
	return new_fd;
}
1225fe17f22dSAl Viro
receive_fd(struct file * file,unsigned int o_flags)1226fe17f22dSAl Viro int receive_fd(struct file *file, unsigned int o_flags)
1227aed97647SRichard W.M. Jones {
1228aed97647SRichard W.M. Jones return __receive_fd(file, NULL, o_flags);
1229aed97647SRichard W.M. Jones }
1230fe17f22dSAl Viro EXPORT_SYMBOL_GPL(receive_fd);
123108f05c49SAl Viro
ksys_dup3(unsigned int oldfd,unsigned int newfd,int flags)1232fe17f22dSAl Viro static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
1233fe17f22dSAl Viro {
1234fe17f22dSAl Viro int err = -EBADF;
1235120ce2b0SEric W. Biederman struct file *file;
1236fe17f22dSAl Viro struct files_struct *files = current->files;
1237fe17f22dSAl Viro
1238fe17f22dSAl Viro if ((flags & ~O_CLOEXEC) != 0)
1239fe17f22dSAl Viro return -EINVAL;
1240fe17f22dSAl Viro
1241fe17f22dSAl Viro if (unlikely(oldfd == newfd))
1242fe17f22dSAl Viro return -EINVAL;
12438280d161SAl Viro
1244fe17f22dSAl Viro if (newfd >= rlimit(RLIMIT_NOFILE))
1245fe17f22dSAl Viro return -EBADF;
1246fe17f22dSAl Viro
1247fe17f22dSAl Viro spin_lock(&files->file_lock);
1248fe17f22dSAl Viro err = expand_files(files, newfd);
1249fe17f22dSAl Viro file = files_lookup_fd_locked(files, oldfd);
1250fe17f22dSAl Viro if (unlikely(!file))
1251fe17f22dSAl Viro goto Ebadf;
1252c7248321SDominik Brodowski if (unlikely(err < 0)) {
1253c7248321SDominik Brodowski if (err == -EMFILE)
1254c7248321SDominik Brodowski goto Ebadf;
1255c7248321SDominik Brodowski goto out_unlock;
1256c7248321SDominik Brodowski }
1257fe17f22dSAl Viro return do_dup2(files, file, newfd, flags);
1258fe17f22dSAl Viro
1259fe17f22dSAl Viro Ebadf:
1260fe17f22dSAl Viro err = -EBADF;
1261fe17f22dSAl Viro out_unlock:
1262fe17f22dSAl Viro spin_unlock(&files->file_lock);
1263fe17f22dSAl Viro return err;
1264f36c2943SEric W. Biederman }
1265fe17f22dSAl Viro
/* dup3() system call: forwards its arguments to ksys_dup3(). */
SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
{
	int ret = ksys_dup3(oldfd, newfd, flags);

	return ret;
}
1270fe17f22dSAl Viro
/*
 * dup2() system call.  Identical descriptors are handled here: the call
 * returns @oldfd if it has a file and -EBADF otherwise.  Everything else
 * goes through ksys_dup3() with no flags.
 */
SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
	struct files_struct *files;
	int retval;

	if (likely(newfd != oldfd))
		return ksys_dup3(oldfd, newfd, 0);

	/* corner case */
	files = current->files;
	retval = oldfd;

	rcu_read_lock();
	if (!files_lookup_fd_rcu(files, oldfd))
		retval = -EBADF;
	rcu_read_unlock();

	return retval;
}
1285fe17f22dSAl Viro
/*
 * dup() system call entry point: install the file open at @fildes under a
 * newly allocated descriptor.
 *
 * Returns the new descriptor, -EBADF if @fildes does not resolve to a
 * file, or the error from get_unused_fd_flags() if no descriptor could be
 * allocated.
 */
SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
	struct file *file = fget_raw(fildes);
	int fd;

	if (!file)
		return -EBADF;

	fd = get_unused_fd_flags(0);
	if (fd < 0) {
		/* No slot available: release what fget_raw() gave us. */
		fput(file);
		return fd;
	}

	/* The file obtained above is handed over to the new descriptor. */
	fd_install(fd, file);
	return fd;
}
1300c3c073f8SAl Viro
f_dupfd(unsigned int from,struct file * file,unsigned flags)1301c3c073f8SAl Viro int f_dupfd(unsigned int from, struct file *file, unsigned flags)
1302c3c073f8SAl Viro {
1303c3c073f8SAl Viro unsigned long nofile = rlimit(RLIMIT_NOFILE);
1304c3c073f8SAl Viro int err;
1305c3c073f8SAl Viro if (from >= nofile)
1306c3c073f8SAl Viro return -EINVAL;
1307c3c073f8SAl Viro err = alloc_fd(from, nofile, flags);
1308c3c073f8SAl Viro if (err >= 0) {
1309c3c073f8SAl Viro get_file(file);
1310a77cfcb4SAl Viro fd_install(err, file);
1311a77cfcb4SAl Viro }
1312a77cfcb4SAl Viro return err;
1313a77cfcb4SAl Viro }
1314a77cfcb4SAl Viro
iterate_fd(struct files_struct * files,unsigned n,int (* f)(const void *,struct file *,unsigned),const void * p)1315c3c073f8SAl Viro int iterate_fd(struct files_struct *files, unsigned n,
1316a77cfcb4SAl Viro int (*f)(const void *, struct file *, unsigned),
1317a77cfcb4SAl Viro const void *p)
1318c3c073f8SAl Viro {
1319c3c073f8SAl Viro struct fdtable *fdt;
1320c3c073f8SAl Viro int res = 0;
1321c3c073f8SAl Viro if (!files)
1322c3c073f8SAl Viro return 0;
1323 spin_lock(&files->file_lock);
1324 for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
1325 struct file *file;
1326 file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
1327 if (!file)
1328 continue;
1329 res = f(p, file, n);
1330 if (res)
1331 break;
1332 }
1333 spin_unlock(&files->file_lock);
1334 return res;
1335 }
1336 EXPORT_SYMBOL(iterate_fd);
1337