xref: /openbmc/linux/fs/file.c (revision aad29a73199b7fbccfbabea3f1ee627ad1924f52)
1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0
21da177e4SLinus Torvalds /*
31da177e4SLinus Torvalds  *  linux/fs/file.c
41da177e4SLinus Torvalds  *
51da177e4SLinus Torvalds  *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
61da177e4SLinus Torvalds  *
71da177e4SLinus Torvalds  *  Manage the dynamic fd arrays in the process files_struct.
81da177e4SLinus Torvalds  */
91da177e4SLinus Torvalds 
10fe17f22dSAl Viro #include <linux/syscalls.h>
11630d9c47SPaul Gortmaker #include <linux/export.h>
121da177e4SLinus Torvalds #include <linux/fs.h>
13278a5fbaSChristian Brauner #include <linux/kernel.h>
141da177e4SLinus Torvalds #include <linux/mm.h>
153f07c014SIngo Molnar #include <linux/sched/signal.h>
161da177e4SLinus Torvalds #include <linux/slab.h>
171da177e4SLinus Torvalds #include <linux/file.h>
189f3acc31SAl Viro #include <linux/fdtable.h>
191da177e4SLinus Torvalds #include <linux/bitops.h>
20ab2af1f5SDipankar Sarma #include <linux/spinlock.h>
21ab2af1f5SDipankar Sarma #include <linux/rcupdate.h>
2260997c3dSChristian Brauner #include <linux/close_range.h>
2366590610SKees Cook #include <net/sock.h>
24*62861a5dSZhang Kunbo #include <linux/init_task.h>
25ab2af1f5SDipankar Sarma 
2653dec2eaSJens Axboe #include "internal.h"
2753dec2eaSJens Axboe 
289b80a184SAlexey Dobriyan unsigned int sysctl_nr_open __read_mostly = 1024*1024;
299b80a184SAlexey Dobriyan unsigned int sysctl_nr_open_min = BITS_PER_LONG;
30752343beSRasmus Villemoes /* our min() is unusable in constant expressions ;-/ */
31752343beSRasmus Villemoes #define __const_min(x, y) ((x) < (y) ? (x) : (y))
329b80a184SAlexey Dobriyan unsigned int sysctl_nr_open_max =
339b80a184SAlexey Dobriyan 	__const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;
349cfe015aSEric Dumazet 
__free_fdtable(struct fdtable * fdt)35a892e2d7SChangli Gao static void __free_fdtable(struct fdtable *fdt)
361da177e4SLinus Torvalds {
37f6c0a192SAl Viro 	kvfree(fdt->fd);
38f6c0a192SAl Viro 	kvfree(fdt->open_fds);
39a892e2d7SChangli Gao 	kfree(fdt);
40ab2af1f5SDipankar Sarma }
41ab2af1f5SDipankar Sarma 
free_fdtable_rcu(struct rcu_head * rcu)427cf4dc3cSAl Viro static void free_fdtable_rcu(struct rcu_head *rcu)
43ab2af1f5SDipankar Sarma {
44ac3e3c5bSAl Viro 	__free_fdtable(container_of(rcu, struct fdtable, rcu));
45ab2af1f5SDipankar Sarma }
46ab2af1f5SDipankar Sarma 
/* Longs needed for a bitmap with one bit per long of an nr-bit bitmap. */
#define BITBIT_NR(nr)	BITS_TO_LONGS(BITS_TO_LONGS(nr))
/* The same amount, in bytes. */
#define BITBIT_SIZE(nr)	(sizeof(long) * BITBIT_NR(nr))

/* Number of unsigned long words making up ->open_fds. */
#define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG)
511da177e4SLinus Torvalds /*
52ea5c58e7SEric Biggers  * Copy 'count' fd bits from the old table to the new table and clear the extra
53ea5c58e7SEric Biggers  * space if any.  This does not copy the file pointers.  Called with the files
54ea5c58e7SEric Biggers  * spinlock held for write.
55ea5c58e7SEric Biggers  */
copy_fd_bitmaps(struct fdtable * nfdt,struct fdtable * ofdt,unsigned int copy_words)56dd72ae8bSAl Viro static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
57dd72ae8bSAl Viro 			    unsigned int copy_words)
58ea5c58e7SEric Biggers {
59dd72ae8bSAl Viro 	unsigned int nwords = fdt_words(nfdt);
60ea5c58e7SEric Biggers 
61dd72ae8bSAl Viro 	bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds,
62dd72ae8bSAl Viro 			copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
63dd72ae8bSAl Viro 	bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec,
64dd72ae8bSAl Viro 			copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
65dd72ae8bSAl Viro 	bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits,
66dd72ae8bSAl Viro 			copy_words, nwords);
67ea5c58e7SEric Biggers }
68ea5c58e7SEric Biggers 
69ea5c58e7SEric Biggers /*
70ea5c58e7SEric Biggers  * Copy all file descriptors from the old table to the new, expanded table and
71ea5c58e7SEric Biggers  * clear the extra space.  Called with the files spinlock held for write.
721da177e4SLinus Torvalds  */
copy_fdtable(struct fdtable * nfdt,struct fdtable * ofdt)735466b456SVadim Lobanov static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
74ab2af1f5SDipankar Sarma {
754e89b721SAl Viro 	size_t cpy, set;
761da177e4SLinus Torvalds 
775466b456SVadim Lobanov 	BUG_ON(nfdt->max_fds < ofdt->max_fds);
785466b456SVadim Lobanov 
795466b456SVadim Lobanov 	cpy = ofdt->max_fds * sizeof(struct file *);
805466b456SVadim Lobanov 	set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
815466b456SVadim Lobanov 	memcpy(nfdt->fd, ofdt->fd, cpy);
82ea5c58e7SEric Biggers 	memset((char *)nfdt->fd + cpy, 0, set);
835466b456SVadim Lobanov 
84dd72ae8bSAl Viro 	copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt));
851da177e4SLinus Torvalds }
861da177e4SLinus Torvalds 
871c24a186SLinus Torvalds /*
881c24a186SLinus Torvalds  * Note how the fdtable bitmap allocations very much have to be a multiple of
891c24a186SLinus Torvalds  * BITS_PER_LONG. This is not only because we walk those things in chunks of
901c24a186SLinus Torvalds  * 'unsigned long' in some places, but simply because that is how the Linux
911c24a186SLinus Torvalds  * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
921c24a186SLinus Torvalds  * they are very much "bits in an array of unsigned long".
931c24a186SLinus Torvalds  *
941c24a186SLinus Torvalds  * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
951c24a186SLinus Torvalds  * by that "1024/sizeof(ptr)" before, we already know there are sufficient
961c24a186SLinus Torvalds  * clear low bits. Clang seems to realize that, gcc ends up being confused.
971c24a186SLinus Torvalds  *
981c24a186SLinus Torvalds  * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
991c24a186SLinus Torvalds  * let's consider it documentation (and maybe a test-case for gcc to improve
1001c24a186SLinus Torvalds  * its code generation ;)
1011c24a186SLinus Torvalds  */
alloc_fdtable(unsigned int nr)1025466b456SVadim Lobanov static struct fdtable * alloc_fdtable(unsigned int nr)
1031da177e4SLinus Torvalds {
1045466b456SVadim Lobanov 	struct fdtable *fdt;
1051fd36adcSDavid Howells 	void *data;
1061da177e4SLinus Torvalds 
1075466b456SVadim Lobanov 	/*
1085466b456SVadim Lobanov 	 * Figure out how many fds we actually want to support in this fdtable.
1095466b456SVadim Lobanov 	 * Allocation steps are keyed to the size of the fdarray, since it
1105466b456SVadim Lobanov 	 * grows far faster than any of the other dynamic data. We try to fit
1115466b456SVadim Lobanov 	 * the fdarray into comfortable page-tuned chunks: starting at 1024B
1125466b456SVadim Lobanov 	 * and growing in powers of two from there on.
1135466b456SVadim Lobanov 	 */
1145466b456SVadim Lobanov 	nr /= (1024 / sizeof(struct file *));
1155466b456SVadim Lobanov 	nr = roundup_pow_of_two(nr + 1);
1165466b456SVadim Lobanov 	nr *= (1024 / sizeof(struct file *));
1171c24a186SLinus Torvalds 	nr = ALIGN(nr, BITS_PER_LONG);
1185c598b34SAl Viro 	/*
1195c598b34SAl Viro 	 * Note that this can drive nr *below* what we had passed if sysctl_nr_open
1205c598b34SAl Viro 	 * had been set lower between the check in expand_files() and here.  Deal
1215c598b34SAl Viro 	 * with that in caller, it's cheaper that way.
1225c598b34SAl Viro 	 *
1235c598b34SAl Viro 	 * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
1245c598b34SAl Viro 	 * bitmaps handling below becomes unpleasant, to put it mildly...
1255c598b34SAl Viro 	 */
1265c598b34SAl Viro 	if (unlikely(nr > sysctl_nr_open))
1275c598b34SAl Viro 		nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
1285466b456SVadim Lobanov 
1295d097056SVladimir Davydov 	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
130ab2af1f5SDipankar Sarma 	if (!fdt)
1311da177e4SLinus Torvalds 		goto out;
1325466b456SVadim Lobanov 	fdt->max_fds = nr;
133c823bd92SMichal Hocko 	data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT);
1345466b456SVadim Lobanov 	if (!data)
1355466b456SVadim Lobanov 		goto out_fdt;
1361fd36adcSDavid Howells 	fdt->fd = data;
1371fd36adcSDavid Howells 
138c823bd92SMichal Hocko 	data = kvmalloc(max_t(size_t,
139c823bd92SMichal Hocko 				 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
140c823bd92SMichal Hocko 				 GFP_KERNEL_ACCOUNT);
1415466b456SVadim Lobanov 	if (!data)
1425466b456SVadim Lobanov 		goto out_arr;
1431fd36adcSDavid Howells 	fdt->open_fds = data;
1445466b456SVadim Lobanov 	data += nr / BITS_PER_BYTE;
1451fd36adcSDavid Howells 	fdt->close_on_exec = data;
146f3f86e33SLinus Torvalds 	data += nr / BITS_PER_BYTE;
147f3f86e33SLinus Torvalds 	fdt->full_fds_bits = data;
1481da177e4SLinus Torvalds 
149ab2af1f5SDipankar Sarma 	return fdt;
1505466b456SVadim Lobanov 
1515466b456SVadim Lobanov out_arr:
152f6c0a192SAl Viro 	kvfree(fdt->fd);
1535466b456SVadim Lobanov out_fdt:
154ab2af1f5SDipankar Sarma 	kfree(fdt);
1555466b456SVadim Lobanov out:
156ab2af1f5SDipankar Sarma 	return NULL;
157ab2af1f5SDipankar Sarma }
158ab2af1f5SDipankar Sarma 
159ab2af1f5SDipankar Sarma /*
16074d392aaSVadim Lobanov  * Expand the file descriptor table.
16174d392aaSVadim Lobanov  * This function will allocate a new fdtable and both fd array and fdset, of
16274d392aaSVadim Lobanov  * the given size.
16374d392aaSVadim Lobanov  * Return <0 error code on error; 1 on successful completion.
16474d392aaSVadim Lobanov  * The files->file_lock should be held on entry, and will be held on exit.
165ab2af1f5SDipankar Sarma  */
expand_fdtable(struct files_struct * files,unsigned int nr)1669b80a184SAlexey Dobriyan static int expand_fdtable(struct files_struct *files, unsigned int nr)
167ab2af1f5SDipankar Sarma 	__releases(files->file_lock)
168ab2af1f5SDipankar Sarma 	__acquires(files->file_lock)
169ab2af1f5SDipankar Sarma {
17074d392aaSVadim Lobanov 	struct fdtable *new_fdt, *cur_fdt;
171ab2af1f5SDipankar Sarma 
172ab2af1f5SDipankar Sarma 	spin_unlock(&files->file_lock);
17374d392aaSVadim Lobanov 	new_fdt = alloc_fdtable(nr);
1748a81252bSEric Dumazet 
175d74ba04dSEric W. Biederman 	/* make sure all fd_install() have seen resize_in_progress
1768a81252bSEric Dumazet 	 * or have finished their rcu_read_lock_sched() section.
1778a81252bSEric Dumazet 	 */
1788a81252bSEric Dumazet 	if (atomic_read(&files->count) > 1)
179c93ffc15SPaul E. McKenney 		synchronize_rcu();
1808a81252bSEric Dumazet 
1811da177e4SLinus Torvalds 	spin_lock(&files->file_lock);
18274d392aaSVadim Lobanov 	if (!new_fdt)
18374d392aaSVadim Lobanov 		return -ENOMEM;
184ab2af1f5SDipankar Sarma 	/*
1855c598b34SAl Viro 	 * extremely unlikely race - sysctl_nr_open decreased between the check in
1865c598b34SAl Viro 	 * caller and alloc_fdtable().  Cheaper to catch it here...
1875c598b34SAl Viro 	 */
1885c598b34SAl Viro 	if (unlikely(new_fdt->max_fds <= nr)) {
189a892e2d7SChangli Gao 		__free_fdtable(new_fdt);
1905c598b34SAl Viro 		return -EMFILE;
1915c598b34SAl Viro 	}
19274d392aaSVadim Lobanov 	cur_fdt = files_fdtable(files);
1938a81252bSEric Dumazet 	BUG_ON(nr < cur_fdt->max_fds);
19474d392aaSVadim Lobanov 	copy_fdtable(new_fdt, cur_fdt);
19574d392aaSVadim Lobanov 	rcu_assign_pointer(files->fdt, new_fdt);
196ac3e3c5bSAl Viro 	if (cur_fdt != &files->fdtab)
1971983e781SAl Viro 		call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
198d74ba04dSEric W. Biederman 	/* coupled with smp_rmb() in fd_install() */
1998a81252bSEric Dumazet 	smp_wmb();
20074d392aaSVadim Lobanov 	return 1;
2011da177e4SLinus Torvalds }
2021da177e4SLinus Torvalds 
2031da177e4SLinus Torvalds /*
2041da177e4SLinus Torvalds  * Expand files.
20574d392aaSVadim Lobanov  * This function will expand the file structures, if the requested size exceeds
20674d392aaSVadim Lobanov  * the current capacity and there is room for expansion.
20774d392aaSVadim Lobanov  * Return <0 error code on error; 0 when nothing done; 1 when files were
20874d392aaSVadim Lobanov  * expanded and execution may have blocked.
20974d392aaSVadim Lobanov  * The files->file_lock should be held on entry, and will be held on exit.
2101da177e4SLinus Torvalds  */
expand_files(struct files_struct * files,unsigned int nr)2119b80a184SAlexey Dobriyan static int expand_files(struct files_struct *files, unsigned int nr)
2128a81252bSEric Dumazet 	__releases(files->file_lock)
2138a81252bSEric Dumazet 	__acquires(files->file_lock)
2141da177e4SLinus Torvalds {
215badf1662SDipankar Sarma 	struct fdtable *fdt;
2168a81252bSEric Dumazet 	int expanded = 0;
2171da177e4SLinus Torvalds 
2188a81252bSEric Dumazet repeat:
219badf1662SDipankar Sarma 	fdt = files_fdtable(files);
2204e1e018eSAl Viro 
22174d392aaSVadim Lobanov 	/* Do we need to expand? */
222bbea9f69SVadim Lobanov 	if (nr < fdt->max_fds)
2238a81252bSEric Dumazet 		return expanded;
2244e1e018eSAl Viro 
22574d392aaSVadim Lobanov 	/* Can we expand? */
2269cfe015aSEric Dumazet 	if (nr >= sysctl_nr_open)
22774d392aaSVadim Lobanov 		return -EMFILE;
22874d392aaSVadim Lobanov 
2298a81252bSEric Dumazet 	if (unlikely(files->resize_in_progress)) {
2308a81252bSEric Dumazet 		spin_unlock(&files->file_lock);
2318a81252bSEric Dumazet 		expanded = 1;
2328a81252bSEric Dumazet 		wait_event(files->resize_wait, !files->resize_in_progress);
2338a81252bSEric Dumazet 		spin_lock(&files->file_lock);
2348a81252bSEric Dumazet 		goto repeat;
2358a81252bSEric Dumazet 	}
2368a81252bSEric Dumazet 
23774d392aaSVadim Lobanov 	/* All good, so we try */
2388a81252bSEric Dumazet 	files->resize_in_progress = true;
2398a81252bSEric Dumazet 	expanded = expand_fdtable(files, nr);
2408a81252bSEric Dumazet 	files->resize_in_progress = false;
2418a81252bSEric Dumazet 
2428a81252bSEric Dumazet 	wake_up_all(&files->resize_wait);
2438a81252bSEric Dumazet 	return expanded;
2441da177e4SLinus Torvalds }
245ab2af1f5SDipankar Sarma 
__set_close_on_exec(unsigned int fd,struct fdtable * fdt)2469b80a184SAlexey Dobriyan static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
247b8318b01SAl Viro {
248b8318b01SAl Viro 	__set_bit(fd, fdt->close_on_exec);
249b8318b01SAl Viro }
250b8318b01SAl Viro 
__clear_close_on_exec(unsigned int fd,struct fdtable * fdt)2519b80a184SAlexey Dobriyan static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
252b8318b01SAl Viro {
253fc90888dSLinus Torvalds 	if (test_bit(fd, fdt->close_on_exec))
254b8318b01SAl Viro 		__clear_bit(fd, fdt->close_on_exec);
255b8318b01SAl Viro }
256b8318b01SAl Viro 
__set_open_fd(unsigned int fd,struct fdtable * fdt)257f3f86e33SLinus Torvalds static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
258b8318b01SAl Viro {
259b8318b01SAl Viro 	__set_bit(fd, fdt->open_fds);
260f3f86e33SLinus Torvalds 	fd /= BITS_PER_LONG;
261f3f86e33SLinus Torvalds 	if (!~fdt->open_fds[fd])
262f3f86e33SLinus Torvalds 		__set_bit(fd, fdt->full_fds_bits);
263b8318b01SAl Viro }
264b8318b01SAl Viro 
__clear_open_fd(unsigned int fd,struct fdtable * fdt)265f3f86e33SLinus Torvalds static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
266b8318b01SAl Viro {
267b8318b01SAl Viro 	__clear_bit(fd, fdt->open_fds);
268f3f86e33SLinus Torvalds 	__clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
269b8318b01SAl Viro }
270b8318b01SAl Viro 
2711c24a186SLinus Torvalds /*
2721c24a186SLinus Torvalds  * Note that a sane fdtable size always has to be a multiple of
2731c24a186SLinus Torvalds  * BITS_PER_LONG, since we have bitmaps that are sized by this.
2741c24a186SLinus Torvalds  *
275a8023f8bSAl Viro  * punch_hole is optional - when close_range() is asked to unshare
276a8023f8bSAl Viro  * and close, we don't need to copy descriptors in that range, so
277a8023f8bSAl Viro  * a smaller cloned descriptor table might suffice if the last
278a8023f8bSAl Viro  * currently opened descriptor falls into that range.
2791c24a186SLinus Torvalds  */
sane_fdtable_size(struct fdtable * fdt,struct fd_range * punch_hole)280a8023f8bSAl Viro static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole)
28160997c3dSChristian Brauner {
282a8023f8bSAl Viro 	unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds);
28360997c3dSChristian Brauner 
284a8023f8bSAl Viro 	if (last == fdt->max_fds)
285a8023f8bSAl Viro 		return NR_OPEN_DEFAULT;
286a8023f8bSAl Viro 	if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) {
287a8023f8bSAl Viro 		last = find_last_bit(fdt->open_fds, punch_hole->from);
288a8023f8bSAl Viro 		if (last == punch_hole->from)
289a8023f8bSAl Viro 			return NR_OPEN_DEFAULT;
290a8023f8bSAl Viro 	}
291a8023f8bSAl Viro 	return ALIGN(last + 1, BITS_PER_LONG);
29260997c3dSChristian Brauner }
29360997c3dSChristian Brauner 
29402afc626SAl Viro /*
295a8023f8bSAl Viro  * Allocate a new descriptor table and copy contents from the passed in
296a8023f8bSAl Viro  * instance.  Returns a pointer to cloned table on success, ERR_PTR()
297a8023f8bSAl Viro  * on failure.  For 'punch_hole' see sane_fdtable_size().
29802afc626SAl Viro  */
dup_fd(struct files_struct * oldf,struct fd_range * punch_hole)299a8023f8bSAl Viro struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole)
30002afc626SAl Viro {
30102afc626SAl Viro 	struct files_struct *newf;
30202afc626SAl Viro 	struct file **old_fds, **new_fds;
3039b80a184SAlexey Dobriyan 	unsigned int open_files, i;
30402afc626SAl Viro 	struct fdtable *old_fdt, *new_fdt;
305a8023f8bSAl Viro 	int error;
30602afc626SAl Viro 
307afbec7ffSAl Viro 	newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
30802afc626SAl Viro 	if (!newf)
309a8023f8bSAl Viro 		return ERR_PTR(-ENOMEM);
31002afc626SAl Viro 
311afbec7ffSAl Viro 	atomic_set(&newf->count, 1);
312afbec7ffSAl Viro 
313afbec7ffSAl Viro 	spin_lock_init(&newf->file_lock);
3148a81252bSEric Dumazet 	newf->resize_in_progress = false;
3158a81252bSEric Dumazet 	init_waitqueue_head(&newf->resize_wait);
316afbec7ffSAl Viro 	newf->next_fd = 0;
317afbec7ffSAl Viro 	new_fdt = &newf->fdtab;
318afbec7ffSAl Viro 	new_fdt->max_fds = NR_OPEN_DEFAULT;
3191fd36adcSDavid Howells 	new_fdt->close_on_exec = newf->close_on_exec_init;
3201fd36adcSDavid Howells 	new_fdt->open_fds = newf->open_fds_init;
321f3f86e33SLinus Torvalds 	new_fdt->full_fds_bits = newf->full_fds_bits_init;
322afbec7ffSAl Viro 	new_fdt->fd = &newf->fd_array[0];
323afbec7ffSAl Viro 
32402afc626SAl Viro 	spin_lock(&oldf->file_lock);
32502afc626SAl Viro 	old_fdt = files_fdtable(oldf);
326a8023f8bSAl Viro 	open_files = sane_fdtable_size(old_fdt, punch_hole);
32702afc626SAl Viro 
32802afc626SAl Viro 	/*
32902afc626SAl Viro 	 * Check whether we need to allocate a larger fd array and fd set.
33002afc626SAl Viro 	 */
331adbecb12SAl Viro 	while (unlikely(open_files > new_fdt->max_fds)) {
33202afc626SAl Viro 		spin_unlock(&oldf->file_lock);
3339dec3c4dSAl Viro 
334a892e2d7SChangli Gao 		if (new_fdt != &newf->fdtab)
335a892e2d7SChangli Gao 			__free_fdtable(new_fdt);
336adbecb12SAl Viro 
3379dec3c4dSAl Viro 		new_fdt = alloc_fdtable(open_files - 1);
3389dec3c4dSAl Viro 		if (!new_fdt) {
339a8023f8bSAl Viro 			error = -ENOMEM;
34002afc626SAl Viro 			goto out_release;
3419dec3c4dSAl Viro 		}
3429dec3c4dSAl Viro 
3439dec3c4dSAl Viro 		/* beyond sysctl_nr_open; nothing to do */
3449dec3c4dSAl Viro 		if (unlikely(new_fdt->max_fds < open_files)) {
345a892e2d7SChangli Gao 			__free_fdtable(new_fdt);
346a8023f8bSAl Viro 			error = -EMFILE;
3479dec3c4dSAl Viro 			goto out_release;
3489dec3c4dSAl Viro 		}
3499dec3c4dSAl Viro 
35002afc626SAl Viro 		/*
35102afc626SAl Viro 		 * Reacquire the oldf lock and a pointer to its fd table
35202afc626SAl Viro 		 * who knows it may have a new bigger fd table. We need
35302afc626SAl Viro 		 * the latest pointer.
35402afc626SAl Viro 		 */
35502afc626SAl Viro 		spin_lock(&oldf->file_lock);
35602afc626SAl Viro 		old_fdt = files_fdtable(oldf);
357a8023f8bSAl Viro 		open_files = sane_fdtable_size(old_fdt, punch_hole);
35802afc626SAl Viro 	}
35902afc626SAl Viro 
360dd72ae8bSAl Viro 	copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG);
361ea5c58e7SEric Biggers 
36202afc626SAl Viro 	old_fds = old_fdt->fd;
36302afc626SAl Viro 	new_fds = new_fdt->fd;
36402afc626SAl Viro 
36502afc626SAl Viro 	for (i = open_files; i != 0; i--) {
36602afc626SAl Viro 		struct file *f = *old_fds++;
36702afc626SAl Viro 		if (f) {
36802afc626SAl Viro 			get_file(f);
36902afc626SAl Viro 		} else {
37002afc626SAl Viro 			/*
37102afc626SAl Viro 			 * The fd may be claimed in the fd bitmap but not yet
37202afc626SAl Viro 			 * instantiated in the files array if a sibling thread
37302afc626SAl Viro 			 * is partway through open().  So make sure that this
37402afc626SAl Viro 			 * fd is available to the new process.
37502afc626SAl Viro 			 */
3761dce27c5SDavid Howells 			__clear_open_fd(open_files - i, new_fdt);
37702afc626SAl Viro 		}
37802afc626SAl Viro 		rcu_assign_pointer(*new_fds++, f);
37902afc626SAl Viro 	}
38002afc626SAl Viro 	spin_unlock(&oldf->file_lock);
38102afc626SAl Viro 
382ea5c58e7SEric Biggers 	/* clear the remainder */
383ea5c58e7SEric Biggers 	memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));
38402afc626SAl Viro 
385afbec7ffSAl Viro 	rcu_assign_pointer(newf->fdt, new_fdt);
386afbec7ffSAl Viro 
38702afc626SAl Viro 	return newf;
38802afc626SAl Viro 
38902afc626SAl Viro out_release:
39002afc626SAl Viro 	kmem_cache_free(files_cachep, newf);
391a8023f8bSAl Viro 	return ERR_PTR(error);
39202afc626SAl Viro }
39302afc626SAl Viro 
close_files(struct files_struct * files)394ce08b62dSOleg Nesterov static struct fdtable *close_files(struct files_struct * files)
3957cf4dc3cSAl Viro {
3967cf4dc3cSAl Viro 	/*
3977cf4dc3cSAl Viro 	 * It is safe to dereference the fd table without RCU or
3987cf4dc3cSAl Viro 	 * ->file_lock because this is the last reference to the
399ce08b62dSOleg Nesterov 	 * files structure.
4007cf4dc3cSAl Viro 	 */
401ce08b62dSOleg Nesterov 	struct fdtable *fdt = rcu_dereference_raw(files->fdt);
4029b80a184SAlexey Dobriyan 	unsigned int i, j = 0;
403ce08b62dSOleg Nesterov 
4047cf4dc3cSAl Viro 	for (;;) {
4057cf4dc3cSAl Viro 		unsigned long set;
4067cf4dc3cSAl Viro 		i = j * BITS_PER_LONG;
4077cf4dc3cSAl Viro 		if (i >= fdt->max_fds)
4087cf4dc3cSAl Viro 			break;
4097cf4dc3cSAl Viro 		set = fdt->open_fds[j++];
4107cf4dc3cSAl Viro 		while (set) {
4117cf4dc3cSAl Viro 			if (set & 1) {
4127cf4dc3cSAl Viro 				struct file * file = xchg(&fdt->fd[i], NULL);
4137cf4dc3cSAl Viro 				if (file) {
4147cf4dc3cSAl Viro 					filp_close(file, files);
415388a4c88SPaul E. McKenney 					cond_resched();
4167cf4dc3cSAl Viro 				}
4177cf4dc3cSAl Viro 			}
4187cf4dc3cSAl Viro 			i++;
4197cf4dc3cSAl Viro 			set >>= 1;
4207cf4dc3cSAl Viro 		}
4217cf4dc3cSAl Viro 	}
422ce08b62dSOleg Nesterov 
423ce08b62dSOleg Nesterov 	return fdt;
4247cf4dc3cSAl Viro }
4257cf4dc3cSAl Viro 
put_files_struct(struct files_struct * files)4267cf4dc3cSAl Viro void put_files_struct(struct files_struct *files)
4277cf4dc3cSAl Viro {
4287cf4dc3cSAl Viro 	if (atomic_dec_and_test(&files->count)) {
429ce08b62dSOleg Nesterov 		struct fdtable *fdt = close_files(files);
430ce08b62dSOleg Nesterov 
431b9e02af0SAl Viro 		/* free the arrays if they are not embedded */
432b9e02af0SAl Viro 		if (fdt != &files->fdtab)
433b9e02af0SAl Viro 			__free_fdtable(fdt);
434b9e02af0SAl Viro 		kmem_cache_free(files_cachep, files);
4357cf4dc3cSAl Viro 	}
4367cf4dc3cSAl Viro }
4377cf4dc3cSAl Viro 
exit_files(struct task_struct * tsk)4387cf4dc3cSAl Viro void exit_files(struct task_struct *tsk)
4397cf4dc3cSAl Viro {
4407cf4dc3cSAl Viro 	struct files_struct * files = tsk->files;
4417cf4dc3cSAl Viro 
4427cf4dc3cSAl Viro 	if (files) {
4437cf4dc3cSAl Viro 		task_lock(tsk);
4447cf4dc3cSAl Viro 		tsk->files = NULL;
4457cf4dc3cSAl Viro 		task_unlock(tsk);
4467cf4dc3cSAl Viro 		put_files_struct(files);
4477cf4dc3cSAl Viro 	}
4487cf4dc3cSAl Viro }
4497cf4dc3cSAl Viro 
450f52111b1SAl Viro struct files_struct init_files = {
451f52111b1SAl Viro 	.count		= ATOMIC_INIT(1),
452f52111b1SAl Viro 	.fdt		= &init_files.fdtab,
453f52111b1SAl Viro 	.fdtab		= {
454f52111b1SAl Viro 		.max_fds	= NR_OPEN_DEFAULT,
455f52111b1SAl Viro 		.fd		= &init_files.fd_array[0],
4561fd36adcSDavid Howells 		.close_on_exec	= init_files.close_on_exec_init,
4571fd36adcSDavid Howells 		.open_fds	= init_files.open_fds_init,
458f3f86e33SLinus Torvalds 		.full_fds_bits	= init_files.full_fds_bits_init,
459f52111b1SAl Viro 	},
460eece09ecSThomas Gleixner 	.file_lock	= __SPIN_LOCK_UNLOCKED(init_files.file_lock),
4615704a068SShuriyc Chu 	.resize_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
462f52111b1SAl Viro };
4631027abe8SAl Viro 
find_next_fd(struct fdtable * fdt,unsigned int start)4649b80a184SAlexey Dobriyan static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
465f3f86e33SLinus Torvalds {
466bd56b910SYuntao Wang 	unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
4679b80a184SAlexey Dobriyan 	unsigned int maxbit = maxfd / BITS_PER_LONG;
4689b80a184SAlexey Dobriyan 	unsigned int bitbit = start / BITS_PER_LONG;
469f3f86e33SLinus Torvalds 
470f3f86e33SLinus Torvalds 	bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
471bd56b910SYuntao Wang 	if (bitbit >= maxfd)
472f3f86e33SLinus Torvalds 		return maxfd;
473f3f86e33SLinus Torvalds 	if (bitbit > start)
474f3f86e33SLinus Torvalds 		start = bitbit;
475f3f86e33SLinus Torvalds 	return find_next_zero_bit(fdt->open_fds, maxfd, start);
476f3f86e33SLinus Torvalds }
477f3f86e33SLinus Torvalds 
4781027abe8SAl Viro /*
4791027abe8SAl Viro  * allocate a file descriptor, mark it busy.
4801027abe8SAl Viro  */
alloc_fd(unsigned start,unsigned end,unsigned flags)481aa384d10SEric W. Biederman static int alloc_fd(unsigned start, unsigned end, unsigned flags)
4821027abe8SAl Viro {
483aa384d10SEric W. Biederman 	struct files_struct *files = current->files;
4841027abe8SAl Viro 	unsigned int fd;
4851027abe8SAl Viro 	int error;
4861027abe8SAl Viro 	struct fdtable *fdt;
4871027abe8SAl Viro 
4881027abe8SAl Viro 	spin_lock(&files->file_lock);
4891027abe8SAl Viro repeat:
4901027abe8SAl Viro 	fdt = files_fdtable(files);
4911027abe8SAl Viro 	fd = start;
4921027abe8SAl Viro 	if (fd < files->next_fd)
4931027abe8SAl Viro 		fd = files->next_fd;
4941027abe8SAl Viro 
4951027abe8SAl Viro 	if (fd < fdt->max_fds)
496f3f86e33SLinus Torvalds 		fd = find_next_fd(fdt, fd);
4971027abe8SAl Viro 
498f33ff992SAl Viro 	/*
499f33ff992SAl Viro 	 * N.B. For clone tasks sharing a files structure, this test
500f33ff992SAl Viro 	 * will limit the total number of files that can be opened.
501f33ff992SAl Viro 	 */
502f33ff992SAl Viro 	error = -EMFILE;
503f33ff992SAl Viro 	if (fd >= end)
504f33ff992SAl Viro 		goto out;
505f33ff992SAl Viro 
5061027abe8SAl Viro 	error = expand_files(files, fd);
5071027abe8SAl Viro 	if (error < 0)
5081027abe8SAl Viro 		goto out;
5091027abe8SAl Viro 
5101027abe8SAl Viro 	/*
5111027abe8SAl Viro 	 * If we needed to expand the fs array we
5121027abe8SAl Viro 	 * might have blocked - try again.
5131027abe8SAl Viro 	 */
5141027abe8SAl Viro 	if (error)
5151027abe8SAl Viro 		goto repeat;
5161027abe8SAl Viro 
5171027abe8SAl Viro 	if (start <= files->next_fd)
5181027abe8SAl Viro 		files->next_fd = fd + 1;
5191027abe8SAl Viro 
5201dce27c5SDavid Howells 	__set_open_fd(fd, fdt);
5211027abe8SAl Viro 	if (flags & O_CLOEXEC)
5221dce27c5SDavid Howells 		__set_close_on_exec(fd, fdt);
5231027abe8SAl Viro 	else
5241dce27c5SDavid Howells 		__clear_close_on_exec(fd, fdt);
5251027abe8SAl Viro 	error = fd;
5261027abe8SAl Viro #if 1
5271027abe8SAl Viro 	/* Sanity check */
528add1f099SPaul E. McKenney 	if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
5291027abe8SAl Viro 		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
5301027abe8SAl Viro 		rcu_assign_pointer(fdt->fd[fd], NULL);
5311027abe8SAl Viro 	}
5321027abe8SAl Viro #endif
5331027abe8SAl Viro 
5341027abe8SAl Viro out:
5351027abe8SAl Viro 	spin_unlock(&files->file_lock);
5361027abe8SAl Viro 	return error;
5371027abe8SAl Viro }
5381027abe8SAl Viro 
/*
 * Allocate the lowest free descriptor below @nofile for the current task.
 * @flags is handed to alloc_fd() unchanged.  Returns what alloc_fd()
 * returns.
 */
int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
{
	const unsigned int lowest = 0;

	return alloc_fd(lowest, nofile, flags);
}
5434022e7afSJens Axboe 
get_unused_fd_flags(unsigned flags)5441a7bd226SAl Viro int get_unused_fd_flags(unsigned flags)
5451027abe8SAl Viro {
5464022e7afSJens Axboe 	return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE));
5471027abe8SAl Viro }
5481a7bd226SAl Viro EXPORT_SYMBOL(get_unused_fd_flags);
54956007caeSAl Viro 
__put_unused_fd(struct files_struct * files,unsigned int fd)55056007caeSAl Viro static void __put_unused_fd(struct files_struct *files, unsigned int fd)
55156007caeSAl Viro {
55256007caeSAl Viro 	struct fdtable *fdt = files_fdtable(files);
55356007caeSAl Viro 	__clear_open_fd(fd, fdt);
55456007caeSAl Viro 	if (fd < files->next_fd)
55556007caeSAl Viro 		files->next_fd = fd;
55656007caeSAl Viro }
55756007caeSAl Viro 
put_unused_fd(unsigned int fd)55856007caeSAl Viro void put_unused_fd(unsigned int fd)
55956007caeSAl Viro {
56056007caeSAl Viro 	struct files_struct *files = current->files;
56156007caeSAl Viro 	spin_lock(&files->file_lock);
56256007caeSAl Viro 	__put_unused_fd(files, fd);
56356007caeSAl Viro 	spin_unlock(&files->file_lock);
56456007caeSAl Viro }
56556007caeSAl Viro 
56656007caeSAl Viro EXPORT_SYMBOL(put_unused_fd);
56756007caeSAl Viro 
56856007caeSAl Viro /*
56956007caeSAl Viro  * Install a file pointer in the fd array.
57056007caeSAl Viro  *
57156007caeSAl Viro  * The VFS is full of places where we drop the files lock between
57256007caeSAl Viro  * setting the open_fds bitmap and installing the file in the file
57356007caeSAl Viro  * array.  At any such point, we are vulnerable to a dup2() race
57456007caeSAl Viro  * installing a file in the array before us.  We need to detect this and
57556007caeSAl Viro  * fput() the struct file we are about to overwrite in this case.
57656007caeSAl Viro  *
57756007caeSAl Viro  * It should never happen - if we allow dup2() do it, _really_ bad things
57856007caeSAl Viro  * will follow.
579f869e8a7SAl Viro  *
580d74ba04dSEric W. Biederman  * This consumes the "file" refcount, so callers should treat it
581d74ba04dSEric W. Biederman  * as if they had called fput(file).
58256007caeSAl Viro  */
58356007caeSAl Viro 
fd_install(unsigned int fd,struct file * file)584d74ba04dSEric W. Biederman void fd_install(unsigned int fd, struct file *file)
58556007caeSAl Viro {
586d74ba04dSEric W. Biederman 	struct files_struct *files = current->files;
58756007caeSAl Viro 	struct fdtable *fdt;
5888a81252bSEric Dumazet 
5898a81252bSEric Dumazet 	rcu_read_lock_sched();
5908a81252bSEric Dumazet 
591c02b1a9bSMateusz Guzik 	if (unlikely(files->resize_in_progress)) {
5928a81252bSEric Dumazet 		rcu_read_unlock_sched();
593c02b1a9bSMateusz Guzik 		spin_lock(&files->file_lock);
594c02b1a9bSMateusz Guzik 		fdt = files_fdtable(files);
595c02b1a9bSMateusz Guzik 		BUG_ON(fdt->fd[fd] != NULL);
596c02b1a9bSMateusz Guzik 		rcu_assign_pointer(fdt->fd[fd], file);
597c02b1a9bSMateusz Guzik 		spin_unlock(&files->file_lock);
598c02b1a9bSMateusz Guzik 		return;
5998a81252bSEric Dumazet 	}
6008a81252bSEric Dumazet 	/* coupled with smp_wmb() in expand_fdtable() */
6018a81252bSEric Dumazet 	smp_rmb();
6028a81252bSEric Dumazet 	fdt = rcu_dereference_sched(files->fdt);
60356007caeSAl Viro 	BUG_ON(fdt->fd[fd] != NULL);
60456007caeSAl Viro 	rcu_assign_pointer(fdt->fd[fd], file);
6058a81252bSEric Dumazet 	rcu_read_unlock_sched();
60656007caeSAl Viro }
60756007caeSAl Viro 
60856007caeSAl Viro EXPORT_SYMBOL(fd_install);
6090ee8cdfeSAl Viro 
610f49fd6d3SChristian Brauner /**
611f49fd6d3SChristian Brauner  * pick_file - return file associatd with fd
612f49fd6d3SChristian Brauner  * @files: file struct to retrieve file from
613f49fd6d3SChristian Brauner  * @fd: file descriptor to retrieve file for
614f49fd6d3SChristian Brauner  *
6156319194eSAl Viro  * Context: files_lock must be held.
616f49fd6d3SChristian Brauner  *
6176319194eSAl Viro  * Returns: The file associated with @fd (NULL if @fd is not open)
618f49fd6d3SChristian Brauner  */
pick_file(struct files_struct * files,unsigned fd)619278a5fbaSChristian Brauner static struct file *pick_file(struct files_struct *files, unsigned fd)
620483ce1d4SAl Viro {
6216319194eSAl Viro 	struct fdtable *fdt = files_fdtable(files);
622f49fd6d3SChristian Brauner 	struct file *file;
623483ce1d4SAl Viro 
6246319194eSAl Viro 	if (fd >= fdt->max_fds)
6256319194eSAl Viro 		return NULL;
6266319194eSAl Viro 
627609d5444STheodore Ts'o 	fd = array_index_nospec(fd, fdt->max_fds);
628483ce1d4SAl Viro 	file = fdt->fd[fd];
6296319194eSAl Viro 	if (file) {
630483ce1d4SAl Viro 		rcu_assign_pointer(fdt->fd[fd], NULL);
631483ce1d4SAl Viro 		__put_unused_fd(files, fd);
6326319194eSAl Viro 	}
633278a5fbaSChristian Brauner 	return file;
634278a5fbaSChristian Brauner }
635278a5fbaSChristian Brauner 
close_fd(unsigned fd)6368760c909SEric W. Biederman int close_fd(unsigned fd)
637278a5fbaSChristian Brauner {
6388760c909SEric W. Biederman 	struct files_struct *files = current->files;
639278a5fbaSChristian Brauner 	struct file *file;
640278a5fbaSChristian Brauner 
6416319194eSAl Viro 	spin_lock(&files->file_lock);
642278a5fbaSChristian Brauner 	file = pick_file(files, fd);
6436319194eSAl Viro 	spin_unlock(&files->file_lock);
6446319194eSAl Viro 	if (!file)
645483ce1d4SAl Viro 		return -EBADF;
646278a5fbaSChristian Brauner 
647278a5fbaSChristian Brauner 	return filp_close(file, files);
648483ce1d4SAl Viro }
6498760c909SEric W. Biederman EXPORT_SYMBOL(close_fd); /* for ksys_close() */
650483ce1d4SAl Viro 
6519b5b8722SChristian Brauner /**
6529b5b8722SChristian Brauner  * last_fd - return last valid index into fd table
65335931eb3SMatthew Wilcox (Oracle)  * @fdt: File descriptor table.
6549b5b8722SChristian Brauner  *
6559b5b8722SChristian Brauner  * Context: Either rcu read lock or files_lock must be held.
6569b5b8722SChristian Brauner  *
6579b5b8722SChristian Brauner  * Returns: Last valid index into fdtable.
6589b5b8722SChristian Brauner  */
last_fd(struct fdtable * fdt)6599b5b8722SChristian Brauner static inline unsigned last_fd(struct fdtable *fdt)
6609b5b8722SChristian Brauner {
6619b5b8722SChristian Brauner 	return fdt->max_fds - 1;
6629b5b8722SChristian Brauner }
6639b5b8722SChristian Brauner 
__range_cloexec(struct files_struct * cur_fds,unsigned int fd,unsigned int max_fd)664582f1fb6SGiuseppe Scrivano static inline void __range_cloexec(struct files_struct *cur_fds,
665582f1fb6SGiuseppe Scrivano 				   unsigned int fd, unsigned int max_fd)
666582f1fb6SGiuseppe Scrivano {
667582f1fb6SGiuseppe Scrivano 	struct fdtable *fdt;
668582f1fb6SGiuseppe Scrivano 
6699b5b8722SChristian Brauner 	/* make sure we're using the correct maximum value */
670582f1fb6SGiuseppe Scrivano 	spin_lock(&cur_fds->file_lock);
671582f1fb6SGiuseppe Scrivano 	fdt = files_fdtable(cur_fds);
6729b5b8722SChristian Brauner 	max_fd = min(last_fd(fdt), max_fd);
6739b5b8722SChristian Brauner 	if (fd <= max_fd)
674582f1fb6SGiuseppe Scrivano 		bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);
675582f1fb6SGiuseppe Scrivano 	spin_unlock(&cur_fds->file_lock);
676582f1fb6SGiuseppe Scrivano }
677582f1fb6SGiuseppe Scrivano 
__range_close(struct files_struct * files,unsigned int fd,unsigned int max_fd)678ed192c59SMateusz Guzik static inline void __range_close(struct files_struct *files, unsigned int fd,
679582f1fb6SGiuseppe Scrivano 				 unsigned int max_fd)
680582f1fb6SGiuseppe Scrivano {
681ed192c59SMateusz Guzik 	struct file *file;
6826319194eSAl Viro 	unsigned n;
6836319194eSAl Viro 
684ed192c59SMateusz Guzik 	spin_lock(&files->file_lock);
685ed192c59SMateusz Guzik 	n = last_fd(files_fdtable(files));
6866319194eSAl Viro 	max_fd = min(max_fd, n);
6876319194eSAl Viro 
688ed192c59SMateusz Guzik 	for (; fd <= max_fd; fd++) {
689ed192c59SMateusz Guzik 		file = pick_file(files, fd);
6906319194eSAl Viro 		if (file) {
691ed192c59SMateusz Guzik 			spin_unlock(&files->file_lock);
692ed192c59SMateusz Guzik 			filp_close(file, files);
693582f1fb6SGiuseppe Scrivano 			cond_resched();
694ed192c59SMateusz Guzik 			spin_lock(&files->file_lock);
695ed192c59SMateusz Guzik 		} else if (need_resched()) {
696ed192c59SMateusz Guzik 			spin_unlock(&files->file_lock);
697ed192c59SMateusz Guzik 			cond_resched();
698ed192c59SMateusz Guzik 			spin_lock(&files->file_lock);
699f49fd6d3SChristian Brauner 		}
700582f1fb6SGiuseppe Scrivano 	}
701ed192c59SMateusz Guzik 	spin_unlock(&files->file_lock);
702582f1fb6SGiuseppe Scrivano }
703582f1fb6SGiuseppe Scrivano 
704278a5fbaSChristian Brauner /**
705278a5fbaSChristian Brauner  * __close_range() - Close all file descriptors in a given range.
706278a5fbaSChristian Brauner  *
707278a5fbaSChristian Brauner  * @fd:     starting file descriptor to close
708278a5fbaSChristian Brauner  * @max_fd: last file descriptor to close
70935931eb3SMatthew Wilcox (Oracle)  * @flags:  CLOSE_RANGE flags.
710278a5fbaSChristian Brauner  *
711278a5fbaSChristian Brauner  * This closes a range of file descriptors. All file descriptors
712278a5fbaSChristian Brauner  * from @fd up to and including @max_fd are closed.
713278a5fbaSChristian Brauner  */
__close_range(unsigned fd,unsigned max_fd,unsigned int flags)71460997c3dSChristian Brauner int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
715278a5fbaSChristian Brauner {
71660997c3dSChristian Brauner 	struct task_struct *me = current;
71760997c3dSChristian Brauner 	struct files_struct *cur_fds = me->files, *fds = NULL;
71860997c3dSChristian Brauner 
719582f1fb6SGiuseppe Scrivano 	if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
72060997c3dSChristian Brauner 		return -EINVAL;
721278a5fbaSChristian Brauner 
722278a5fbaSChristian Brauner 	if (fd > max_fd)
723278a5fbaSChristian Brauner 		return -EINVAL;
724278a5fbaSChristian Brauner 
725a8023f8bSAl Viro 	if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) {
726a8023f8bSAl Viro 		struct fd_range range = {fd, max_fd}, *punch_hole = &range;
72760997c3dSChristian Brauner 
72860997c3dSChristian Brauner 		/*
72903ba0fe4SChristian Brauner 		 * If the caller requested all fds to be made cloexec we always
73003ba0fe4SChristian Brauner 		 * copy all of the file descriptors since they still want to
73103ba0fe4SChristian Brauner 		 * use them.
73260997c3dSChristian Brauner 		 */
733a8023f8bSAl Viro 		if (flags & CLOSE_RANGE_CLOEXEC)
734a8023f8bSAl Viro 			punch_hole = NULL;
73560997c3dSChristian Brauner 
736a8023f8bSAl Viro 		fds = dup_fd(cur_fds, punch_hole);
737a8023f8bSAl Viro 		if (IS_ERR(fds))
738a8023f8bSAl Viro 			return PTR_ERR(fds);
73960997c3dSChristian Brauner 		/*
74060997c3dSChristian Brauner 		 * We used to share our file descriptor table, and have now
74160997c3dSChristian Brauner 		 * created a private one, make sure we're using it below.
74260997c3dSChristian Brauner 		 */
74360997c3dSChristian Brauner 		swap(cur_fds, fds);
74460997c3dSChristian Brauner 	}
74560997c3dSChristian Brauner 
746582f1fb6SGiuseppe Scrivano 	if (flags & CLOSE_RANGE_CLOEXEC)
747582f1fb6SGiuseppe Scrivano 		__range_cloexec(cur_fds, fd, max_fd);
748582f1fb6SGiuseppe Scrivano 	else
749582f1fb6SGiuseppe Scrivano 		__range_close(cur_fds, fd, max_fd);
750278a5fbaSChristian Brauner 
75160997c3dSChristian Brauner 	if (fds) {
75260997c3dSChristian Brauner 		/*
75360997c3dSChristian Brauner 		 * We're done closing the files we were supposed to. Time to install
75460997c3dSChristian Brauner 		 * the new file descriptor table and drop the old one.
75560997c3dSChristian Brauner 		 */
75660997c3dSChristian Brauner 		task_lock(me);
75760997c3dSChristian Brauner 		me->files = cur_fds;
75860997c3dSChristian Brauner 		task_unlock(me);
75960997c3dSChristian Brauner 		put_files_struct(fds);
76060997c3dSChristian Brauner 	}
76160997c3dSChristian Brauner 
762278a5fbaSChristian Brauner 	return 0;
763278a5fbaSChristian Brauner }
764278a5fbaSChristian Brauner 
76580cd7956STodd Kjos /*
76653dec2eaSJens Axboe  * See close_fd_get_file() below, this variant assumes current->files->file_lock
76753dec2eaSJens Axboe  * is held.
76853dec2eaSJens Axboe  */
__close_fd_get_file(unsigned int fd)7696319194eSAl Viro struct file *__close_fd_get_file(unsigned int fd)
77053dec2eaSJens Axboe {
7716319194eSAl Viro 	return pick_file(current->files, fd);
77253dec2eaSJens Axboe }
77353dec2eaSJens Axboe 
77453dec2eaSJens Axboe /*
7759fe83c43SEric W. Biederman  * variant of close_fd that gets a ref on the file for later fput.
77640a19260SAl Viro  * The caller must ensure that filp_close() called on the file.
77780cd7956STodd Kjos  */
close_fd_get_file(unsigned int fd)7786319194eSAl Viro struct file *close_fd_get_file(unsigned int fd)
77980cd7956STodd Kjos {
78080cd7956STodd Kjos 	struct files_struct *files = current->files;
7816319194eSAl Viro 	struct file *file;
78280cd7956STodd Kjos 
78380cd7956STodd Kjos 	spin_lock(&files->file_lock);
7846319194eSAl Viro 	file = pick_file(files, fd);
78580cd7956STodd Kjos 	spin_unlock(&files->file_lock);
78680cd7956STodd Kjos 
7876319194eSAl Viro 	return file;
78880cd7956STodd Kjos }
78980cd7956STodd Kjos 
do_close_on_exec(struct files_struct * files)7906a6d27deSAl Viro void do_close_on_exec(struct files_struct *files)
7916a6d27deSAl Viro {
7926a6d27deSAl Viro 	unsigned i;
7936a6d27deSAl Viro 	struct fdtable *fdt;
7946a6d27deSAl Viro 
7956a6d27deSAl Viro 	/* exec unshares first */
7966a6d27deSAl Viro 	spin_lock(&files->file_lock);
7976a6d27deSAl Viro 	for (i = 0; ; i++) {
7986a6d27deSAl Viro 		unsigned long set;
7996a6d27deSAl Viro 		unsigned fd = i * BITS_PER_LONG;
8006a6d27deSAl Viro 		fdt = files_fdtable(files);
8016a6d27deSAl Viro 		if (fd >= fdt->max_fds)
8026a6d27deSAl Viro 			break;
8036a6d27deSAl Viro 		set = fdt->close_on_exec[i];
8046a6d27deSAl Viro 		if (!set)
8056a6d27deSAl Viro 			continue;
8066a6d27deSAl Viro 		fdt->close_on_exec[i] = 0;
8076a6d27deSAl Viro 		for ( ; set ; fd++, set >>= 1) {
8086a6d27deSAl Viro 			struct file *file;
8096a6d27deSAl Viro 			if (!(set & 1))
8106a6d27deSAl Viro 				continue;
8116a6d27deSAl Viro 			file = fdt->fd[fd];
8126a6d27deSAl Viro 			if (!file)
8136a6d27deSAl Viro 				continue;
8146a6d27deSAl Viro 			rcu_assign_pointer(fdt->fd[fd], NULL);
8156a6d27deSAl Viro 			__put_unused_fd(files, fd);
8166a6d27deSAl Viro 			spin_unlock(&files->file_lock);
8176a6d27deSAl Viro 			filp_close(file, files);
8186a6d27deSAl Viro 			cond_resched();
8196a6d27deSAl Viro 			spin_lock(&files->file_lock);
8206a6d27deSAl Viro 		}
8216a6d27deSAl Viro 
8226a6d27deSAl Viro 	}
8236a6d27deSAl Viro 	spin_unlock(&files->file_lock);
8246a6d27deSAl Viro }
8256a6d27deSAl Viro 
/*
 * Lockless lookup of @fd in @files, returning a referenced file or NULL.
 *
 * NULL is returned when @fd is beyond the table, when the slot is empty,
 * or when the file has any of the @mask bits set in ->f_mode.  The loop
 * retries whenever the lookup raced with a close or a table change.
 */
static inline struct file *__fget_files_rcu(struct files_struct *files,
	unsigned int fd, fmode_t mask)
{
	for (;;) {
		struct file *file;
		struct fdtable *fdt = rcu_dereference_raw(files->fdt);
		struct file __rcu **slot;

		if (unlikely(fd >= fdt->max_fds))
			return NULL;

		slot = fdt->fd + array_index_nospec(fd, fdt->max_fds);
		file = rcu_dereference_raw(*slot);
		if (unlikely(!file))
			return NULL;

		if (unlikely(file->f_mode & mask))
			return NULL;

		/*
		 * Ok, we have a file pointer. However, because we do
		 * this all locklessly under RCU, we may be racing with
		 * that file being closed.
		 *
		 * Such a race can take two forms:
		 *
		 *  (a) the file ref already went down to zero,
		 *      and get_file_rcu() fails. Just try again:
		 */
		if (unlikely(!get_file_rcu(file)))
			continue;

		/*
		 *  (b) the file table entry has changed under us.
		 *       Note that we don't need to re-check the 'fdt->fd'
		 *       pointer having changed, because it always goes
		 *       hand-in-hand with 'fdt'.
		 *
		 * If so, we need to put our ref and try again.
		 */
		if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
		    unlikely(rcu_dereference_raw(*slot) != file)) {
			fput(file);
			continue;
		}

		/*
		 * Ok, we have a ref to the file, and checked that it
		 * still exists.
		 */
		return file;
	}
}
879e386dfc5SLinus Torvalds 
/*
 * Wrap __fget_files_rcu() in an RCU read-side critical section.
 * Returns a referenced file, or NULL if the lookup yields nothing.
 */
static struct file *__fget_files(struct files_struct *files, unsigned int fd,
				 fmode_t mask)
{
	struct file *found;

	rcu_read_lock();
	found = __fget_files_rcu(files, fd, mask);
	rcu_read_unlock();

	return found;
}
8910ee8cdfeSAl Viro 
/* Look up @fd in the current task's table; see __fget_files(). */
static inline struct file *__fget(unsigned int fd, fmode_t mask)
{
	struct files_struct *files = current->files;

	return __fget_files(files, fd, mask);
}
896091141a4SJens Axboe 
fget(unsigned int fd)8971deb46e2SOleg Nesterov struct file *fget(unsigned int fd)
8981deb46e2SOleg Nesterov {
89981132a39SGou Hao 	return __fget(fd, FMODE_PATH);
9001deb46e2SOleg Nesterov }
9010ee8cdfeSAl Viro EXPORT_SYMBOL(fget);
9020ee8cdfeSAl Viro 
fget_raw(unsigned int fd)9030ee8cdfeSAl Viro struct file *fget_raw(unsigned int fd)
9040ee8cdfeSAl Viro {
90581132a39SGou Hao 	return __fget(fd, 0);
9060ee8cdfeSAl Viro }
9070ee8cdfeSAl Viro EXPORT_SYMBOL(fget_raw);
9080ee8cdfeSAl Viro 
fget_task(struct task_struct * task,unsigned int fd)9095e876fb4SSargun Dhillon struct file *fget_task(struct task_struct *task, unsigned int fd)
9105e876fb4SSargun Dhillon {
9115e876fb4SSargun Dhillon 	struct file *file = NULL;
9125e876fb4SSargun Dhillon 
9135e876fb4SSargun Dhillon 	task_lock(task);
9145e876fb4SSargun Dhillon 	if (task->files)
91581132a39SGou Hao 		file = __fget_files(task->files, fd, 0);
9165e876fb4SSargun Dhillon 	task_unlock(task);
9175e876fb4SSargun Dhillon 
9185e876fb4SSargun Dhillon 	return file;
9195e876fb4SSargun Dhillon }
9205e876fb4SSargun Dhillon 
task_lookup_fd_rcu(struct task_struct * task,unsigned int fd)9213a879fb3SEric W. Biederman struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
9223a879fb3SEric W. Biederman {
9233a879fb3SEric W. Biederman 	/* Must be called with rcu_read_lock held */
9243a879fb3SEric W. Biederman 	struct files_struct *files;
9253a879fb3SEric W. Biederman 	struct file *file = NULL;
9263a879fb3SEric W. Biederman 
9273a879fb3SEric W. Biederman 	task_lock(task);
9283a879fb3SEric W. Biederman 	files = task->files;
9293a879fb3SEric W. Biederman 	if (files)
9303a879fb3SEric W. Biederman 		file = files_lookup_fd_rcu(files, fd);
9313a879fb3SEric W. Biederman 	task_unlock(task);
9323a879fb3SEric W. Biederman 
9333a879fb3SEric W. Biederman 	return file;
9343a879fb3SEric W. Biederman }
9353a879fb3SEric W. Biederman 
task_lookup_next_fd_rcu(struct task_struct * task,unsigned int * ret_fd)936e9a53aebSEric W. Biederman struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd)
937e9a53aebSEric W. Biederman {
938e9a53aebSEric W. Biederman 	/* Must be called with rcu_read_lock held */
939e9a53aebSEric W. Biederman 	struct files_struct *files;
940e9a53aebSEric W. Biederman 	unsigned int fd = *ret_fd;
941e9a53aebSEric W. Biederman 	struct file *file = NULL;
942e9a53aebSEric W. Biederman 
943e9a53aebSEric W. Biederman 	task_lock(task);
944e9a53aebSEric W. Biederman 	files = task->files;
945e9a53aebSEric W. Biederman 	if (files) {
946e9a53aebSEric W. Biederman 		for (; fd < files_fdtable(files)->max_fds; fd++) {
947e9a53aebSEric W. Biederman 			file = files_lookup_fd_rcu(files, fd);
948e9a53aebSEric W. Biederman 			if (file)
949e9a53aebSEric W. Biederman 				break;
950e9a53aebSEric W. Biederman 		}
951e9a53aebSEric W. Biederman 	}
952e9a53aebSEric W. Biederman 	task_unlock(task);
953e9a53aebSEric W. Biederman 	*ret_fd = fd;
954e9a53aebSEric W. Biederman 	return file;
955e9a53aebSEric W. Biederman }
9564480c27cSAndreas Gruenbacher EXPORT_SYMBOL(task_lookup_next_fd_rcu);
957e9a53aebSEric W. Biederman 
9580ee8cdfeSAl Viro /*
9590ee8cdfeSAl Viro  * Lightweight file lookup - no refcnt increment if fd table isn't shared.
9600ee8cdfeSAl Viro  *
9610ee8cdfeSAl Viro  * You can use this instead of fget if you satisfy all of the following
9620ee8cdfeSAl Viro  * conditions:
9630ee8cdfeSAl Viro  * 1) You must call fput_light before exiting the syscall and returning control
9640ee8cdfeSAl Viro  *    to userspace (i.e. you cannot remember the returned struct file * after
9650ee8cdfeSAl Viro  *    returning to userspace).
9660ee8cdfeSAl Viro  * 2) You must not call filp_close on the returned struct file * in between
9670ee8cdfeSAl Viro  *    calls to fget_light and fput_light.
9680ee8cdfeSAl Viro  * 3) You must not clone the current task in between the calls to fget_light
9690ee8cdfeSAl Viro  *    and fput_light.
9700ee8cdfeSAl Viro  *
9710ee8cdfeSAl Viro  * The fput_needed flag returned by fget_light should be passed to the
9720ee8cdfeSAl Viro  * corresponding fput_light.
9730ee8cdfeSAl Viro  */
__fget_light(unsigned int fd,fmode_t mask)974bd2a31d5SAl Viro static unsigned long __fget_light(unsigned int fd, fmode_t mask)
9750ee8cdfeSAl Viro {
9760ee8cdfeSAl Viro 	struct files_struct *files = current->files;
977ad461834SOleg Nesterov 	struct file *file;
9780ee8cdfeSAl Viro 
9797ee47dcfSJann Horn 	/*
9807ee47dcfSJann Horn 	 * If another thread is concurrently calling close_fd() followed
9817ee47dcfSJann Horn 	 * by put_files_struct(), we must not observe the old table
9827ee47dcfSJann Horn 	 * entry combined with the new refcount - otherwise we could
9837ee47dcfSJann Horn 	 * return a file that is concurrently being freed.
9847ee47dcfSJann Horn 	 *
9857ee47dcfSJann Horn 	 * atomic_read_acquire() pairs with atomic_dec_and_test() in
9867ee47dcfSJann Horn 	 * put_files_struct().
9877ee47dcfSJann Horn 	 */
9887ee47dcfSJann Horn 	if (atomic_read_acquire(&files->count) == 1) {
989bebf684bSEric W. Biederman 		file = files_lookup_fd_raw(files, fd);
990bd2a31d5SAl Viro 		if (!file || unlikely(file->f_mode & mask))
991bd2a31d5SAl Viro 			return 0;
992bd2a31d5SAl Viro 		return (unsigned long)file;
9930ee8cdfeSAl Viro 	} else {
99481132a39SGou Hao 		file = __fget(fd, mask);
995bd2a31d5SAl Viro 		if (!file)
996bd2a31d5SAl Viro 			return 0;
997bd2a31d5SAl Viro 		return FDPUT_FPUT | (unsigned long)file;
998bd2a31d5SAl Viro 	}
999bd2a31d5SAl Viro }
__fdget(unsigned int fd)1000bd2a31d5SAl Viro unsigned long __fdget(unsigned int fd)
1001bd2a31d5SAl Viro {
1002bd2a31d5SAl Viro 	return __fget_light(fd, FMODE_PATH);
1003bd2a31d5SAl Viro }
1004bd2a31d5SAl Viro EXPORT_SYMBOL(__fdget);
1005bd2a31d5SAl Viro 
__fdget_raw(unsigned int fd)1006bd2a31d5SAl Viro unsigned long __fdget_raw(unsigned int fd)
1007bd2a31d5SAl Viro {
1008bd2a31d5SAl Viro 	return __fget_light(fd, 0);
10090ee8cdfeSAl Viro }
10100ee8cdfeSAl Viro 
101179796425SLinus Torvalds /*
101279796425SLinus Torvalds  * Try to avoid f_pos locking. We only need it if the
101379796425SLinus Torvalds  * file is marked for FMODE_ATOMIC_POS, and it can be
101479796425SLinus Torvalds  * accessed multiple ways.
101579796425SLinus Torvalds  *
101679796425SLinus Torvalds  * Always do it for directories, because pidfd_getfd()
101779796425SLinus Torvalds  * can make a file accessible even if it otherwise would
101879796425SLinus Torvalds  * not be, and for directories this is a correctness
101979796425SLinus Torvalds  * issue, not a "POSIX requirement".
102079796425SLinus Torvalds  */
file_needs_f_pos_lock(struct file * file)102179796425SLinus Torvalds static inline bool file_needs_f_pos_lock(struct file *file)
102279796425SLinus Torvalds {
102379796425SLinus Torvalds 	return (file->f_mode & FMODE_ATOMIC_POS) &&
10247d84d1b9SChristian Brauner 		(file_count(file) > 1 || file->f_op->iterate_shared);
102579796425SLinus Torvalds }
102679796425SLinus Torvalds 
__fdget_pos(unsigned int fd)1027bd2a31d5SAl Viro unsigned long __fdget_pos(unsigned int fd)
1028ad461834SOleg Nesterov {
102999aea681SEric Biggers 	unsigned long v = __fdget(fd);
103099aea681SEric Biggers 	struct file *file = (struct file *)(v & ~3);
10310ee8cdfeSAl Viro 
103279796425SLinus Torvalds 	if (file && file_needs_f_pos_lock(file)) {
1033bd2a31d5SAl Viro 		v |= FDPUT_POS_UNLOCK;
1034bd2a31d5SAl Viro 		mutex_lock(&file->f_pos_lock);
1035bd2a31d5SAl Viro 	}
103699aea681SEric Biggers 	return v;
1037bd2a31d5SAl Viro }
1038bd2a31d5SAl Viro 
__f_unlock_pos(struct file * f)103963b6df14SAl Viro void __f_unlock_pos(struct file *f)
104063b6df14SAl Viro {
104163b6df14SAl Viro 	mutex_unlock(&f->f_pos_lock);
104263b6df14SAl Viro }
104363b6df14SAl Viro 
1044bd2a31d5SAl Viro /*
1045bd2a31d5SAl Viro  * We only lock f_pos if we have threads or if the file might be
1046bd2a31d5SAl Viro  * shared with another process. In both cases we'll have an elevated
1047bd2a31d5SAl Viro  * file count (done either by fdget() or by fork()).
1048bd2a31d5SAl Viro  */
1049fe17f22dSAl Viro 
set_close_on_exec(unsigned int fd,int flag)1050fe17f22dSAl Viro void set_close_on_exec(unsigned int fd, int flag)
1051fe17f22dSAl Viro {
1052fe17f22dSAl Viro 	struct files_struct *files = current->files;
1053fe17f22dSAl Viro 	struct fdtable *fdt;
1054fe17f22dSAl Viro 	spin_lock(&files->file_lock);
1055fe17f22dSAl Viro 	fdt = files_fdtable(files);
1056fe17f22dSAl Viro 	if (flag)
1057fe17f22dSAl Viro 		__set_close_on_exec(fd, fdt);
1058fe17f22dSAl Viro 	else
1059fe17f22dSAl Viro 		__clear_close_on_exec(fd, fdt);
1060fe17f22dSAl Viro 	spin_unlock(&files->file_lock);
1061fe17f22dSAl Viro }
1062fe17f22dSAl Viro 
get_close_on_exec(unsigned int fd)1063fe17f22dSAl Viro bool get_close_on_exec(unsigned int fd)
1064fe17f22dSAl Viro {
1065fe17f22dSAl Viro 	struct files_struct *files = current->files;
1066fe17f22dSAl Viro 	struct fdtable *fdt;
1067fe17f22dSAl Viro 	bool res;
1068fe17f22dSAl Viro 	rcu_read_lock();
1069fe17f22dSAl Viro 	fdt = files_fdtable(files);
1070fe17f22dSAl Viro 	res = close_on_exec(fd, fdt);
1071fe17f22dSAl Viro 	rcu_read_unlock();
1072fe17f22dSAl Viro 	return res;
1073fe17f22dSAl Viro }
1074fe17f22dSAl Viro 
do_dup2(struct files_struct * files,struct file * file,unsigned fd,unsigned flags)10758280d161SAl Viro static int do_dup2(struct files_struct *files,
10768280d161SAl Viro 	struct file *file, unsigned fd, unsigned flags)
1077e983094dSAl Viro __releases(&files->file_lock)
10788280d161SAl Viro {
10798280d161SAl Viro 	struct file *tofree;
10808280d161SAl Viro 	struct fdtable *fdt;
10818280d161SAl Viro 
10828280d161SAl Viro 	/*
10838280d161SAl Viro 	 * We need to detect attempts to do dup2() over allocated but still
10848280d161SAl Viro 	 * not finished descriptor.  NB: OpenBSD avoids that at the price of
10858280d161SAl Viro 	 * extra work in their equivalent of fget() - they insert struct
10868280d161SAl Viro 	 * file immediately after grabbing descriptor, mark it larval if
10878280d161SAl Viro 	 * more work (e.g. actual opening) is needed and make sure that
10888280d161SAl Viro 	 * fget() treats larval files as absent.  Potentially interesting,
10898280d161SAl Viro 	 * but while extra work in fget() is trivial, locking implications
10908280d161SAl Viro 	 * and amount of surgery on open()-related paths in VFS are not.
10918280d161SAl Viro 	 * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
10928280d161SAl Viro 	 * deadlocks in rather amusing ways, AFAICS.  All of that is out of
10938280d161SAl Viro 	 * scope of POSIX or SUS, since neither considers shared descriptor
10948280d161SAl Viro 	 * tables and this condition does not arise without those.
10958280d161SAl Viro 	 */
10968280d161SAl Viro 	fdt = files_fdtable(files);
1097da72e783SAl Viro 	fd = array_index_nospec(fd, fdt->max_fds);
10988280d161SAl Viro 	tofree = fdt->fd[fd];
10998280d161SAl Viro 	if (!tofree && fd_is_open(fd, fdt))
11008280d161SAl Viro 		goto Ebusy;
11018280d161SAl Viro 	get_file(file);
11028280d161SAl Viro 	rcu_assign_pointer(fdt->fd[fd], file);
11038280d161SAl Viro 	__set_open_fd(fd, fdt);
11048280d161SAl Viro 	if (flags & O_CLOEXEC)
11058280d161SAl Viro 		__set_close_on_exec(fd, fdt);
11068280d161SAl Viro 	else
11078280d161SAl Viro 		__clear_close_on_exec(fd, fdt);
11088280d161SAl Viro 	spin_unlock(&files->file_lock);
11098280d161SAl Viro 
11108280d161SAl Viro 	if (tofree)
11118280d161SAl Viro 		filp_close(tofree, files);
11128280d161SAl Viro 
11138280d161SAl Viro 	return fd;
11148280d161SAl Viro 
11158280d161SAl Viro Ebusy:
11168280d161SAl Viro 	spin_unlock(&files->file_lock);
11178280d161SAl Viro 	return -EBUSY;
11188280d161SAl Viro }
11198280d161SAl Viro 
replace_fd(unsigned fd,struct file * file,unsigned flags)11208280d161SAl Viro int replace_fd(unsigned fd, struct file *file, unsigned flags)
11218280d161SAl Viro {
11228280d161SAl Viro 	int err;
11238280d161SAl Viro 	struct files_struct *files = current->files;
11248280d161SAl Viro 
11258280d161SAl Viro 	if (!file)
11268760c909SEric W. Biederman 		return close_fd(fd);
11278280d161SAl Viro 
11288280d161SAl Viro 	if (fd >= rlimit(RLIMIT_NOFILE))
112908f05c49SAl Viro 		return -EBADF;
11308280d161SAl Viro 
11318280d161SAl Viro 	spin_lock(&files->file_lock);
11328280d161SAl Viro 	err = expand_files(files, fd);
11338280d161SAl Viro 	if (unlikely(err < 0))
11348280d161SAl Viro 		goto out_unlock;
11358280d161SAl Viro 	return do_dup2(files, file, fd, flags);
11368280d161SAl Viro 
11378280d161SAl Viro out_unlock:
11388280d161SAl Viro 	spin_unlock(&files->file_lock);
11398280d161SAl Viro 	return err;
11408280d161SAl Viro }
11418280d161SAl Viro 
114266590610SKees Cook /**
114366590610SKees Cook  * __receive_fd() - Install received file into file descriptor table
114466590610SKees Cook  * @file: struct file that was received from another process
114566590610SKees Cook  * @ufd: __user pointer to write new fd number to
114666590610SKees Cook  * @o_flags: the O_* flags to apply to the new fd entry
114766590610SKees Cook  *
114866590610SKees Cook  * Installs a received file into the file descriptor table, with appropriate
1149deefa7f3SKees Cook  * checks and count updates. Optionally writes the fd number to userspace, if
1150deefa7f3SKees Cook  * @ufd is non-NULL.
115166590610SKees Cook  *
115266590610SKees Cook  * This helper handles its own reference counting of the incoming
115366590610SKees Cook  * struct file.
115466590610SKees Cook  *
1155deefa7f3SKees Cook  * Returns newly install fd or -ve on error.
115666590610SKees Cook  */
__receive_fd(struct file * file,int __user * ufd,unsigned int o_flags)115742eb0d54SChristoph Hellwig int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
115866590610SKees Cook {
115966590610SKees Cook 	int new_fd;
116066590610SKees Cook 	int error;
116166590610SKees Cook 
116266590610SKees Cook 	error = security_file_receive(file);
116366590610SKees Cook 	if (error)
116466590610SKees Cook 		return error;
116566590610SKees Cook 
116666590610SKees Cook 	new_fd = get_unused_fd_flags(o_flags);
116766590610SKees Cook 	if (new_fd < 0)
116866590610SKees Cook 		return new_fd;
116966590610SKees Cook 
1170deefa7f3SKees Cook 	if (ufd) {
117166590610SKees Cook 		error = put_user(new_fd, ufd);
117266590610SKees Cook 		if (error) {
117366590610SKees Cook 			put_unused_fd(new_fd);
117466590610SKees Cook 			return error;
117566590610SKees Cook 		}
1176deefa7f3SKees Cook 	}
117766590610SKees Cook 
117817381715SKees Cook 	fd_install(new_fd, get_file(file));
117942eb0d54SChristoph Hellwig 	__receive_sock(file);
118042eb0d54SChristoph Hellwig 	return new_fd;
118142eb0d54SChristoph Hellwig }
118242eb0d54SChristoph Hellwig 
/*
 * Install a received @file at the specific descriptor @new_fd, replacing
 * whatever is installed there.
 *
 * @new_fd:  descriptor number to (re)use
 * @file:    struct file that was received
 * @o_flags: the O_* flags to apply to the fd entry
 *
 * Returns @new_fd on success, or a negative error code from
 * security_file_receive() or replace_fd().
 */
int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
{
	int error;

	error = security_file_receive(file);
	if (error)
		return error;
	error = replace_fd(new_fd, file, o_flags);
	/*
	 * replace_fd() may return the (positive) descriptor number on
	 * success, so only negative values are errors.  Testing for any
	 * non-zero value would skip __receive_sock() for every new_fd > 0.
	 */
	if (error < 0)
		return error;
	__receive_sock(file);
	return new_fd;
}
119666590610SKees Cook 
receive_fd(struct file * file,unsigned int o_flags)11979c930054SXie Yongji int receive_fd(struct file *file, unsigned int o_flags)
11989c930054SXie Yongji {
11999c930054SXie Yongji 	return __receive_fd(file, NULL, o_flags);
12009c930054SXie Yongji }
12019c930054SXie Yongji EXPORT_SYMBOL_GPL(receive_fd);
12029c930054SXie Yongji 
ksys_dup3(unsigned int oldfd,unsigned int newfd,int flags)1203c7248321SDominik Brodowski static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
1204fe17f22dSAl Viro {
1205fe17f22dSAl Viro 	int err = -EBADF;
12068280d161SAl Viro 	struct file *file;
1207fe17f22dSAl Viro 	struct files_struct *files = current->files;
1208fe17f22dSAl Viro 
1209fe17f22dSAl Viro 	if ((flags & ~O_CLOEXEC) != 0)
1210fe17f22dSAl Viro 		return -EINVAL;
1211fe17f22dSAl Viro 
1212aed97647SRichard W.M. Jones 	if (unlikely(oldfd == newfd))
1213aed97647SRichard W.M. Jones 		return -EINVAL;
1214aed97647SRichard W.M. Jones 
1215fe17f22dSAl Viro 	if (newfd >= rlimit(RLIMIT_NOFILE))
121608f05c49SAl Viro 		return -EBADF;
1217fe17f22dSAl Viro 
1218fe17f22dSAl Viro 	spin_lock(&files->file_lock);
1219fe17f22dSAl Viro 	err = expand_files(files, newfd);
1220120ce2b0SEric W. Biederman 	file = files_lookup_fd_locked(files, oldfd);
1221fe17f22dSAl Viro 	if (unlikely(!file))
1222fe17f22dSAl Viro 		goto Ebadf;
1223fe17f22dSAl Viro 	if (unlikely(err < 0)) {
1224fe17f22dSAl Viro 		if (err == -EMFILE)
1225fe17f22dSAl Viro 			goto Ebadf;
1226fe17f22dSAl Viro 		goto out_unlock;
1227fe17f22dSAl Viro 	}
12288280d161SAl Viro 	return do_dup2(files, file, newfd, flags);
1229fe17f22dSAl Viro 
1230fe17f22dSAl Viro Ebadf:
1231fe17f22dSAl Viro 	err = -EBADF;
1232fe17f22dSAl Viro out_unlock:
1233fe17f22dSAl Viro 	spin_unlock(&files->file_lock);
1234fe17f22dSAl Viro 	return err;
1235fe17f22dSAl Viro }
1236fe17f22dSAl Viro 
/* dup3() system call: thin wrapper around ksys_dup3(). */
SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
{
	int ret = ksys_dup3(oldfd, newfd, flags);

	return ret;
}
1241c7248321SDominik Brodowski 
/*
 * dup2() system call.  Unlike dup3(), @newfd == @oldfd is allowed: it
 * returns @oldfd if that descriptor is open and -EBADF otherwise.
 */
SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
	struct files_struct *files;
	int retval;

	if (likely(newfd != oldfd))
		return ksys_dup3(oldfd, newfd, 0);

	/* corner case */
	files = current->files;
	rcu_read_lock();
	if (files_lookup_fd_rcu(files, oldfd))
		retval = oldfd;
	else
		retval = -EBADF;
	rcu_read_unlock();
	return retval;
}
1256fe17f22dSAl Viro 
/*
 * dup() system call entry point: install the file behind @fildes under a
 * newly allocated descriptor number.
 *
 * Returns the new descriptor, -EBADF if fget_raw() finds no file for
 * @fildes, or the negative value from get_unused_fd_flags() if no
 * descriptor could be allocated.
 */
SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
	struct file *file = fget_raw(fildes);
	int fd;

	if (!file)
		return -EBADF;

	fd = get_unused_fd_flags(0);
	if (fd < 0) {
		/* Nothing was installed; give back what fget_raw() returned. */
		fput(file);
		return fd;
	}

	fd_install(fd, file);
	return fd;
}
1271fe17f22dSAl Viro 
f_dupfd(unsigned int from,struct file * file,unsigned flags)1272fe17f22dSAl Viro int f_dupfd(unsigned int from, struct file *file, unsigned flags)
1273fe17f22dSAl Viro {
1274e06b53c2SEric W. Biederman 	unsigned long nofile = rlimit(RLIMIT_NOFILE);
1275fe17f22dSAl Viro 	int err;
1276e06b53c2SEric W. Biederman 	if (from >= nofile)
1277fe17f22dSAl Viro 		return -EINVAL;
1278e06b53c2SEric W. Biederman 	err = alloc_fd(from, nofile, flags);
1279fe17f22dSAl Viro 	if (err >= 0) {
1280fe17f22dSAl Viro 		get_file(file);
1281fe17f22dSAl Viro 		fd_install(err, file);
1282fe17f22dSAl Viro 	}
1283fe17f22dSAl Viro 	return err;
1284fe17f22dSAl Viro }
1285c3c073f8SAl Viro 
iterate_fd(struct files_struct * files,unsigned n,int (* f)(const void *,struct file *,unsigned),const void * p)1286c3c073f8SAl Viro int iterate_fd(struct files_struct *files, unsigned n,
1287c3c073f8SAl Viro 		int (*f)(const void *, struct file *, unsigned),
1288c3c073f8SAl Viro 		const void *p)
1289c3c073f8SAl Viro {
1290c3c073f8SAl Viro 	struct fdtable *fdt;
1291c3c073f8SAl Viro 	int res = 0;
1292c3c073f8SAl Viro 	if (!files)
1293c3c073f8SAl Viro 		return 0;
1294c3c073f8SAl Viro 	spin_lock(&files->file_lock);
1295a77cfcb4SAl Viro 	for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
1296a77cfcb4SAl Viro 		struct file *file;
1297a77cfcb4SAl Viro 		file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
1298a77cfcb4SAl Viro 		if (!file)
1299a77cfcb4SAl Viro 			continue;
1300c3c073f8SAl Viro 		res = f(p, file, n);
1301a77cfcb4SAl Viro 		if (res)
1302a77cfcb4SAl Viro 			break;
1303c3c073f8SAl Viro 	}
1304c3c073f8SAl Viro 	spin_unlock(&files->file_lock);
1305c3c073f8SAl Viro 	return res;
1306c3c073f8SAl Viro }
1307c3c073f8SAl Viro EXPORT_SYMBOL(iterate_fd);
1308