xref: /openbmc/linux/fs/file.c (revision 35931eb3)
1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0
21da177e4SLinus Torvalds /*
31da177e4SLinus Torvalds  *  linux/fs/file.c
41da177e4SLinus Torvalds  *
51da177e4SLinus Torvalds  *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
61da177e4SLinus Torvalds  *
71da177e4SLinus Torvalds  *  Manage the dynamic fd arrays in the process files_struct.
81da177e4SLinus Torvalds  */
91da177e4SLinus Torvalds 
10fe17f22dSAl Viro #include <linux/syscalls.h>
11630d9c47SPaul Gortmaker #include <linux/export.h>
121da177e4SLinus Torvalds #include <linux/fs.h>
13278a5fbaSChristian Brauner #include <linux/kernel.h>
141da177e4SLinus Torvalds #include <linux/mm.h>
153f07c014SIngo Molnar #include <linux/sched/signal.h>
161da177e4SLinus Torvalds #include <linux/slab.h>
171da177e4SLinus Torvalds #include <linux/file.h>
189f3acc31SAl Viro #include <linux/fdtable.h>
191da177e4SLinus Torvalds #include <linux/bitops.h>
20ab2af1f5SDipankar Sarma #include <linux/spinlock.h>
21ab2af1f5SDipankar Sarma #include <linux/rcupdate.h>
2260997c3dSChristian Brauner #include <linux/close_range.h>
2366590610SKees Cook #include <net/sock.h>
24ab2af1f5SDipankar Sarma 
2553dec2eaSJens Axboe #include "internal.h"
2653dec2eaSJens Axboe 
279b80a184SAlexey Dobriyan unsigned int sysctl_nr_open __read_mostly = 1024*1024;
289b80a184SAlexey Dobriyan unsigned int sysctl_nr_open_min = BITS_PER_LONG;
29752343beSRasmus Villemoes /* our min() is unusable in constant expressions ;-/ */
30752343beSRasmus Villemoes #define __const_min(x, y) ((x) < (y) ? (x) : (y))
319b80a184SAlexey Dobriyan unsigned int sysctl_nr_open_max =
329b80a184SAlexey Dobriyan 	__const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;
339cfe015aSEric Dumazet 
__free_fdtable(struct fdtable * fdt)34a892e2d7SChangli Gao static void __free_fdtable(struct fdtable *fdt)
351da177e4SLinus Torvalds {
36f6c0a192SAl Viro 	kvfree(fdt->fd);
37f6c0a192SAl Viro 	kvfree(fdt->open_fds);
38a892e2d7SChangli Gao 	kfree(fdt);
39ab2af1f5SDipankar Sarma }
40ab2af1f5SDipankar Sarma 
free_fdtable_rcu(struct rcu_head * rcu)417cf4dc3cSAl Viro static void free_fdtable_rcu(struct rcu_head *rcu)
42ab2af1f5SDipankar Sarma {
43ac3e3c5bSAl Viro 	__free_fdtable(container_of(rcu, struct fdtable, rcu));
44ab2af1f5SDipankar Sarma }
45ab2af1f5SDipankar Sarma 
46f3f86e33SLinus Torvalds #define BITBIT_NR(nr)	BITS_TO_LONGS(BITS_TO_LONGS(nr))
47f3f86e33SLinus Torvalds #define BITBIT_SIZE(nr)	(BITBIT_NR(nr) * sizeof(long))
48f3f86e33SLinus Torvalds 
491da177e4SLinus Torvalds /*
50ea5c58e7SEric Biggers  * Copy 'count' fd bits from the old table to the new table and clear the extra
51ea5c58e7SEric Biggers  * space if any.  This does not copy the file pointers.  Called with the files
52ea5c58e7SEric Biggers  * spinlock held for write.
53ea5c58e7SEric Biggers  */
copy_fd_bitmaps(struct fdtable * nfdt,struct fdtable * ofdt,unsigned int count)54ea5c58e7SEric Biggers static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
55ea5c58e7SEric Biggers 			    unsigned int count)
56ea5c58e7SEric Biggers {
57ea5c58e7SEric Biggers 	unsigned int cpy, set;
58ea5c58e7SEric Biggers 
59ea5c58e7SEric Biggers 	cpy = count / BITS_PER_BYTE;
60ea5c58e7SEric Biggers 	set = (nfdt->max_fds - count) / BITS_PER_BYTE;
61ea5c58e7SEric Biggers 	memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
62ea5c58e7SEric Biggers 	memset((char *)nfdt->open_fds + cpy, 0, set);
63ea5c58e7SEric Biggers 	memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
64ea5c58e7SEric Biggers 	memset((char *)nfdt->close_on_exec + cpy, 0, set);
65ea5c58e7SEric Biggers 
66ea5c58e7SEric Biggers 	cpy = BITBIT_SIZE(count);
67ea5c58e7SEric Biggers 	set = BITBIT_SIZE(nfdt->max_fds) - cpy;
68ea5c58e7SEric Biggers 	memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy);
69ea5c58e7SEric Biggers 	memset((char *)nfdt->full_fds_bits + cpy, 0, set);
70ea5c58e7SEric Biggers }
71ea5c58e7SEric Biggers 
72ea5c58e7SEric Biggers /*
73ea5c58e7SEric Biggers  * Copy all file descriptors from the old table to the new, expanded table and
74ea5c58e7SEric Biggers  * clear the extra space.  Called with the files spinlock held for write.
751da177e4SLinus Torvalds  */
copy_fdtable(struct fdtable * nfdt,struct fdtable * ofdt)765466b456SVadim Lobanov static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
77ab2af1f5SDipankar Sarma {
784e89b721SAl Viro 	size_t cpy, set;
791da177e4SLinus Torvalds 
805466b456SVadim Lobanov 	BUG_ON(nfdt->max_fds < ofdt->max_fds);
815466b456SVadim Lobanov 
825466b456SVadim Lobanov 	cpy = ofdt->max_fds * sizeof(struct file *);
835466b456SVadim Lobanov 	set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
845466b456SVadim Lobanov 	memcpy(nfdt->fd, ofdt->fd, cpy);
85ea5c58e7SEric Biggers 	memset((char *)nfdt->fd + cpy, 0, set);
865466b456SVadim Lobanov 
87ea5c58e7SEric Biggers 	copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
881da177e4SLinus Torvalds }
891da177e4SLinus Torvalds 
901c24a186SLinus Torvalds /*
911c24a186SLinus Torvalds  * Note how the fdtable bitmap allocations very much have to be a multiple of
921c24a186SLinus Torvalds  * BITS_PER_LONG. This is not only because we walk those things in chunks of
931c24a186SLinus Torvalds  * 'unsigned long' in some places, but simply because that is how the Linux
941c24a186SLinus Torvalds  * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
951c24a186SLinus Torvalds  * they are very much "bits in an array of unsigned long".
961c24a186SLinus Torvalds  *
971c24a186SLinus Torvalds  * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
981c24a186SLinus Torvalds  * by that "1024/sizeof(ptr)" before, we already know there are sufficient
991c24a186SLinus Torvalds  * clear low bits. Clang seems to realize that, gcc ends up being confused.
1001c24a186SLinus Torvalds  *
1011c24a186SLinus Torvalds  * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
1021c24a186SLinus Torvalds  * let's consider it documentation (and maybe a test-case for gcc to improve
1031c24a186SLinus Torvalds  * its code generation ;)
1041c24a186SLinus Torvalds  */
alloc_fdtable(unsigned int nr)1055466b456SVadim Lobanov static struct fdtable * alloc_fdtable(unsigned int nr)
1061da177e4SLinus Torvalds {
1075466b456SVadim Lobanov 	struct fdtable *fdt;
1081fd36adcSDavid Howells 	void *data;
1091da177e4SLinus Torvalds 
1105466b456SVadim Lobanov 	/*
1115466b456SVadim Lobanov 	 * Figure out how many fds we actually want to support in this fdtable.
1125466b456SVadim Lobanov 	 * Allocation steps are keyed to the size of the fdarray, since it
1135466b456SVadim Lobanov 	 * grows far faster than any of the other dynamic data. We try to fit
1145466b456SVadim Lobanov 	 * the fdarray into comfortable page-tuned chunks: starting at 1024B
1155466b456SVadim Lobanov 	 * and growing in powers of two from there on.
1165466b456SVadim Lobanov 	 */
1175466b456SVadim Lobanov 	nr /= (1024 / sizeof(struct file *));
1185466b456SVadim Lobanov 	nr = roundup_pow_of_two(nr + 1);
1195466b456SVadim Lobanov 	nr *= (1024 / sizeof(struct file *));
1201c24a186SLinus Torvalds 	nr = ALIGN(nr, BITS_PER_LONG);
1215c598b34SAl Viro 	/*
1225c598b34SAl Viro 	 * Note that this can drive nr *below* what we had passed if sysctl_nr_open
1235c598b34SAl Viro 	 * had been set lower between the check in expand_files() and here.  Deal
1245c598b34SAl Viro 	 * with that in caller, it's cheaper that way.
1255c598b34SAl Viro 	 *
1265c598b34SAl Viro 	 * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
1275c598b34SAl Viro 	 * bitmaps handling below becomes unpleasant, to put it mildly...
1285c598b34SAl Viro 	 */
1295c598b34SAl Viro 	if (unlikely(nr > sysctl_nr_open))
1305c598b34SAl Viro 		nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
1315466b456SVadim Lobanov 
1325d097056SVladimir Davydov 	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
133ab2af1f5SDipankar Sarma 	if (!fdt)
1341da177e4SLinus Torvalds 		goto out;
1355466b456SVadim Lobanov 	fdt->max_fds = nr;
136c823bd92SMichal Hocko 	data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT);
1375466b456SVadim Lobanov 	if (!data)
1385466b456SVadim Lobanov 		goto out_fdt;
1391fd36adcSDavid Howells 	fdt->fd = data;
1401fd36adcSDavid Howells 
141c823bd92SMichal Hocko 	data = kvmalloc(max_t(size_t,
142c823bd92SMichal Hocko 				 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
143c823bd92SMichal Hocko 				 GFP_KERNEL_ACCOUNT);
1445466b456SVadim Lobanov 	if (!data)
1455466b456SVadim Lobanov 		goto out_arr;
1461fd36adcSDavid Howells 	fdt->open_fds = data;
1475466b456SVadim Lobanov 	data += nr / BITS_PER_BYTE;
1481fd36adcSDavid Howells 	fdt->close_on_exec = data;
149f3f86e33SLinus Torvalds 	data += nr / BITS_PER_BYTE;
150f3f86e33SLinus Torvalds 	fdt->full_fds_bits = data;
1511da177e4SLinus Torvalds 
152ab2af1f5SDipankar Sarma 	return fdt;
1535466b456SVadim Lobanov 
1545466b456SVadim Lobanov out_arr:
155f6c0a192SAl Viro 	kvfree(fdt->fd);
1565466b456SVadim Lobanov out_fdt:
157ab2af1f5SDipankar Sarma 	kfree(fdt);
1585466b456SVadim Lobanov out:
159ab2af1f5SDipankar Sarma 	return NULL;
160ab2af1f5SDipankar Sarma }
161ab2af1f5SDipankar Sarma 
162ab2af1f5SDipankar Sarma /*
16374d392aaSVadim Lobanov  * Expand the file descriptor table.
16474d392aaSVadim Lobanov  * This function will allocate a new fdtable and both fd array and fdset, of
16574d392aaSVadim Lobanov  * the given size.
16674d392aaSVadim Lobanov  * Return <0 error code on error; 1 on successful completion.
16774d392aaSVadim Lobanov  * The files->file_lock should be held on entry, and will be held on exit.
168ab2af1f5SDipankar Sarma  */
expand_fdtable(struct files_struct * files,unsigned int nr)1699b80a184SAlexey Dobriyan static int expand_fdtable(struct files_struct *files, unsigned int nr)
170ab2af1f5SDipankar Sarma 	__releases(files->file_lock)
171ab2af1f5SDipankar Sarma 	__acquires(files->file_lock)
172ab2af1f5SDipankar Sarma {
17374d392aaSVadim Lobanov 	struct fdtable *new_fdt, *cur_fdt;
174ab2af1f5SDipankar Sarma 
175ab2af1f5SDipankar Sarma 	spin_unlock(&files->file_lock);
17674d392aaSVadim Lobanov 	new_fdt = alloc_fdtable(nr);
1778a81252bSEric Dumazet 
178d74ba04dSEric W. Biederman 	/* make sure all fd_install() have seen resize_in_progress
1798a81252bSEric Dumazet 	 * or have finished their rcu_read_lock_sched() section.
1808a81252bSEric Dumazet 	 */
1818a81252bSEric Dumazet 	if (atomic_read(&files->count) > 1)
182c93ffc15SPaul E. McKenney 		synchronize_rcu();
1838a81252bSEric Dumazet 
1841da177e4SLinus Torvalds 	spin_lock(&files->file_lock);
18574d392aaSVadim Lobanov 	if (!new_fdt)
18674d392aaSVadim Lobanov 		return -ENOMEM;
187ab2af1f5SDipankar Sarma 	/*
1885c598b34SAl Viro 	 * extremely unlikely race - sysctl_nr_open decreased between the check in
1895c598b34SAl Viro 	 * caller and alloc_fdtable().  Cheaper to catch it here...
1905c598b34SAl Viro 	 */
1915c598b34SAl Viro 	if (unlikely(new_fdt->max_fds <= nr)) {
192a892e2d7SChangli Gao 		__free_fdtable(new_fdt);
1935c598b34SAl Viro 		return -EMFILE;
1945c598b34SAl Viro 	}
19574d392aaSVadim Lobanov 	cur_fdt = files_fdtable(files);
1968a81252bSEric Dumazet 	BUG_ON(nr < cur_fdt->max_fds);
19774d392aaSVadim Lobanov 	copy_fdtable(new_fdt, cur_fdt);
19874d392aaSVadim Lobanov 	rcu_assign_pointer(files->fdt, new_fdt);
199ac3e3c5bSAl Viro 	if (cur_fdt != &files->fdtab)
2001983e781SAl Viro 		call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
201d74ba04dSEric W. Biederman 	/* coupled with smp_rmb() in fd_install() */
2028a81252bSEric Dumazet 	smp_wmb();
20374d392aaSVadim Lobanov 	return 1;
2041da177e4SLinus Torvalds }
2051da177e4SLinus Torvalds 
2061da177e4SLinus Torvalds /*
2071da177e4SLinus Torvalds  * Expand files.
20874d392aaSVadim Lobanov  * This function will expand the file structures, if the requested size exceeds
20974d392aaSVadim Lobanov  * the current capacity and there is room for expansion.
21074d392aaSVadim Lobanov  * Return <0 error code on error; 0 when nothing done; 1 when files were
21174d392aaSVadim Lobanov  * expanded and execution may have blocked.
21274d392aaSVadim Lobanov  * The files->file_lock should be held on entry, and will be held on exit.
2131da177e4SLinus Torvalds  */
expand_files(struct files_struct * files,unsigned int nr)2149b80a184SAlexey Dobriyan static int expand_files(struct files_struct *files, unsigned int nr)
2158a81252bSEric Dumazet 	__releases(files->file_lock)
2168a81252bSEric Dumazet 	__acquires(files->file_lock)
2171da177e4SLinus Torvalds {
218badf1662SDipankar Sarma 	struct fdtable *fdt;
2198a81252bSEric Dumazet 	int expanded = 0;
2201da177e4SLinus Torvalds 
2218a81252bSEric Dumazet repeat:
222badf1662SDipankar Sarma 	fdt = files_fdtable(files);
2234e1e018eSAl Viro 
22474d392aaSVadim Lobanov 	/* Do we need to expand? */
225bbea9f69SVadim Lobanov 	if (nr < fdt->max_fds)
2268a81252bSEric Dumazet 		return expanded;
2274e1e018eSAl Viro 
22874d392aaSVadim Lobanov 	/* Can we expand? */
2299cfe015aSEric Dumazet 	if (nr >= sysctl_nr_open)
23074d392aaSVadim Lobanov 		return -EMFILE;
23174d392aaSVadim Lobanov 
2328a81252bSEric Dumazet 	if (unlikely(files->resize_in_progress)) {
2338a81252bSEric Dumazet 		spin_unlock(&files->file_lock);
2348a81252bSEric Dumazet 		expanded = 1;
2358a81252bSEric Dumazet 		wait_event(files->resize_wait, !files->resize_in_progress);
2368a81252bSEric Dumazet 		spin_lock(&files->file_lock);
2378a81252bSEric Dumazet 		goto repeat;
2388a81252bSEric Dumazet 	}
2398a81252bSEric Dumazet 
24074d392aaSVadim Lobanov 	/* All good, so we try */
2418a81252bSEric Dumazet 	files->resize_in_progress = true;
2428a81252bSEric Dumazet 	expanded = expand_fdtable(files, nr);
2438a81252bSEric Dumazet 	files->resize_in_progress = false;
2448a81252bSEric Dumazet 
2458a81252bSEric Dumazet 	wake_up_all(&files->resize_wait);
2468a81252bSEric Dumazet 	return expanded;
2471da177e4SLinus Torvalds }
248ab2af1f5SDipankar Sarma 
__set_close_on_exec(unsigned int fd,struct fdtable * fdt)2499b80a184SAlexey Dobriyan static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
250b8318b01SAl Viro {
251b8318b01SAl Viro 	__set_bit(fd, fdt->close_on_exec);
252b8318b01SAl Viro }
253b8318b01SAl Viro 
__clear_close_on_exec(unsigned int fd,struct fdtable * fdt)2549b80a184SAlexey Dobriyan static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
255b8318b01SAl Viro {
256fc90888dSLinus Torvalds 	if (test_bit(fd, fdt->close_on_exec))
257b8318b01SAl Viro 		__clear_bit(fd, fdt->close_on_exec);
258b8318b01SAl Viro }
259b8318b01SAl Viro 
__set_open_fd(unsigned int fd,struct fdtable * fdt)260f3f86e33SLinus Torvalds static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
261b8318b01SAl Viro {
262b8318b01SAl Viro 	__set_bit(fd, fdt->open_fds);
263f3f86e33SLinus Torvalds 	fd /= BITS_PER_LONG;
264f3f86e33SLinus Torvalds 	if (!~fdt->open_fds[fd])
265f3f86e33SLinus Torvalds 		__set_bit(fd, fdt->full_fds_bits);
266b8318b01SAl Viro }
267b8318b01SAl Viro 
__clear_open_fd(unsigned int fd,struct fdtable * fdt)268f3f86e33SLinus Torvalds static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
269b8318b01SAl Viro {
270b8318b01SAl Viro 	__clear_bit(fd, fdt->open_fds);
271f3f86e33SLinus Torvalds 	__clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
272b8318b01SAl Viro }
273b8318b01SAl Viro 
count_open_files(struct fdtable * fdt)2749b80a184SAlexey Dobriyan static unsigned int count_open_files(struct fdtable *fdt)
27502afc626SAl Viro {
2769b80a184SAlexey Dobriyan 	unsigned int size = fdt->max_fds;
2779b80a184SAlexey Dobriyan 	unsigned int i;
27802afc626SAl Viro 
27902afc626SAl Viro 	/* Find the last open fd */
2801fd36adcSDavid Howells 	for (i = size / BITS_PER_LONG; i > 0; ) {
2811fd36adcSDavid Howells 		if (fdt->open_fds[--i])
28202afc626SAl Viro 			break;
28302afc626SAl Viro 	}
2841fd36adcSDavid Howells 	i = (i + 1) * BITS_PER_LONG;
28502afc626SAl Viro 	return i;
28602afc626SAl Viro }
28702afc626SAl Viro 
2881c24a186SLinus Torvalds /*
2891c24a186SLinus Torvalds  * Note that a sane fdtable size always has to be a multiple of
2901c24a186SLinus Torvalds  * BITS_PER_LONG, since we have bitmaps that are sized by this.
2911c24a186SLinus Torvalds  *
2921c24a186SLinus Torvalds  * 'max_fds' will normally already be properly aligned, but it
2931c24a186SLinus Torvalds  * turns out that in the close_range() -> __close_range() ->
2941c24a186SLinus Torvalds  * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end
2951c24a186SLinus Torvalds  * up having a 'max_fds' value that isn't already aligned.
2961c24a186SLinus Torvalds  *
2971c24a186SLinus Torvalds  * Rather than make close_range() have to worry about this,
2981c24a186SLinus Torvalds  * just make that BITS_PER_LONG alignment be part of a sane
2991c24a186SLinus Torvalds  * fdtable size. Becuase that's really what it is.
3001c24a186SLinus Torvalds  */
sane_fdtable_size(struct fdtable * fdt,unsigned int max_fds)30160997c3dSChristian Brauner static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
30260997c3dSChristian Brauner {
30360997c3dSChristian Brauner 	unsigned int count;
30460997c3dSChristian Brauner 
30560997c3dSChristian Brauner 	count = count_open_files(fdt);
30660997c3dSChristian Brauner 	if (max_fds < NR_OPEN_DEFAULT)
30760997c3dSChristian Brauner 		max_fds = NR_OPEN_DEFAULT;
308d888c83fSLinus Torvalds 	return ALIGN(min(count, max_fds), BITS_PER_LONG);
30960997c3dSChristian Brauner }
31060997c3dSChristian Brauner 
31102afc626SAl Viro /*
31202afc626SAl Viro  * Allocate a new files structure and copy contents from the
31302afc626SAl Viro  * passed in files structure.
31402afc626SAl Viro  * errorp will be valid only when the returned files_struct is NULL.
31502afc626SAl Viro  */
dup_fd(struct files_struct * oldf,unsigned int max_fds,int * errorp)31660997c3dSChristian Brauner struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
31702afc626SAl Viro {
31802afc626SAl Viro 	struct files_struct *newf;
31902afc626SAl Viro 	struct file **old_fds, **new_fds;
3209b80a184SAlexey Dobriyan 	unsigned int open_files, i;
32102afc626SAl Viro 	struct fdtable *old_fdt, *new_fdt;
32202afc626SAl Viro 
32302afc626SAl Viro 	*errorp = -ENOMEM;
324afbec7ffSAl Viro 	newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
32502afc626SAl Viro 	if (!newf)
32602afc626SAl Viro 		goto out;
32702afc626SAl Viro 
328afbec7ffSAl Viro 	atomic_set(&newf->count, 1);
329afbec7ffSAl Viro 
330afbec7ffSAl Viro 	spin_lock_init(&newf->file_lock);
3318a81252bSEric Dumazet 	newf->resize_in_progress = false;
3328a81252bSEric Dumazet 	init_waitqueue_head(&newf->resize_wait);
333afbec7ffSAl Viro 	newf->next_fd = 0;
334afbec7ffSAl Viro 	new_fdt = &newf->fdtab;
335afbec7ffSAl Viro 	new_fdt->max_fds = NR_OPEN_DEFAULT;
3361fd36adcSDavid Howells 	new_fdt->close_on_exec = newf->close_on_exec_init;
3371fd36adcSDavid Howells 	new_fdt->open_fds = newf->open_fds_init;
338f3f86e33SLinus Torvalds 	new_fdt->full_fds_bits = newf->full_fds_bits_init;
339afbec7ffSAl Viro 	new_fdt->fd = &newf->fd_array[0];
340afbec7ffSAl Viro 
34102afc626SAl Viro 	spin_lock(&oldf->file_lock);
34202afc626SAl Viro 	old_fdt = files_fdtable(oldf);
34360997c3dSChristian Brauner 	open_files = sane_fdtable_size(old_fdt, max_fds);
34402afc626SAl Viro 
34502afc626SAl Viro 	/*
34602afc626SAl Viro 	 * Check whether we need to allocate a larger fd array and fd set.
34702afc626SAl Viro 	 */
348adbecb12SAl Viro 	while (unlikely(open_files > new_fdt->max_fds)) {
34902afc626SAl Viro 		spin_unlock(&oldf->file_lock);
3509dec3c4dSAl Viro 
351a892e2d7SChangli Gao 		if (new_fdt != &newf->fdtab)
352a892e2d7SChangli Gao 			__free_fdtable(new_fdt);
353adbecb12SAl Viro 
3549dec3c4dSAl Viro 		new_fdt = alloc_fdtable(open_files - 1);
3559dec3c4dSAl Viro 		if (!new_fdt) {
3569dec3c4dSAl Viro 			*errorp = -ENOMEM;
35702afc626SAl Viro 			goto out_release;
3589dec3c4dSAl Viro 		}
3599dec3c4dSAl Viro 
3609dec3c4dSAl Viro 		/* beyond sysctl_nr_open; nothing to do */
3619dec3c4dSAl Viro 		if (unlikely(new_fdt->max_fds < open_files)) {
362a892e2d7SChangli Gao 			__free_fdtable(new_fdt);
3639dec3c4dSAl Viro 			*errorp = -EMFILE;
3649dec3c4dSAl Viro 			goto out_release;
3659dec3c4dSAl Viro 		}
3669dec3c4dSAl Viro 
36702afc626SAl Viro 		/*
36802afc626SAl Viro 		 * Reacquire the oldf lock and a pointer to its fd table
36902afc626SAl Viro 		 * who knows it may have a new bigger fd table. We need
37002afc626SAl Viro 		 * the latest pointer.
37102afc626SAl Viro 		 */
37202afc626SAl Viro 		spin_lock(&oldf->file_lock);
37302afc626SAl Viro 		old_fdt = files_fdtable(oldf);
37460997c3dSChristian Brauner 		open_files = sane_fdtable_size(old_fdt, max_fds);
37502afc626SAl Viro 	}
37602afc626SAl Viro 
377ea5c58e7SEric Biggers 	copy_fd_bitmaps(new_fdt, old_fdt, open_files);
378ea5c58e7SEric Biggers 
37902afc626SAl Viro 	old_fds = old_fdt->fd;
38002afc626SAl Viro 	new_fds = new_fdt->fd;
38102afc626SAl Viro 
38202afc626SAl Viro 	for (i = open_files; i != 0; i--) {
38302afc626SAl Viro 		struct file *f = *old_fds++;
38402afc626SAl Viro 		if (f) {
38502afc626SAl Viro 			get_file(f);
38602afc626SAl Viro 		} else {
38702afc626SAl Viro 			/*
38802afc626SAl Viro 			 * The fd may be claimed in the fd bitmap but not yet
38902afc626SAl Viro 			 * instantiated in the files array if a sibling thread
39002afc626SAl Viro 			 * is partway through open().  So make sure that this
39102afc626SAl Viro 			 * fd is available to the new process.
39202afc626SAl Viro 			 */
3931dce27c5SDavid Howells 			__clear_open_fd(open_files - i, new_fdt);
39402afc626SAl Viro 		}
39502afc626SAl Viro 		rcu_assign_pointer(*new_fds++, f);
39602afc626SAl Viro 	}
39702afc626SAl Viro 	spin_unlock(&oldf->file_lock);
39802afc626SAl Viro 
399ea5c58e7SEric Biggers 	/* clear the remainder */
400ea5c58e7SEric Biggers 	memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));
40102afc626SAl Viro 
402afbec7ffSAl Viro 	rcu_assign_pointer(newf->fdt, new_fdt);
403afbec7ffSAl Viro 
40402afc626SAl Viro 	return newf;
40502afc626SAl Viro 
40602afc626SAl Viro out_release:
40702afc626SAl Viro 	kmem_cache_free(files_cachep, newf);
40802afc626SAl Viro out:
40902afc626SAl Viro 	return NULL;
41002afc626SAl Viro }
41102afc626SAl Viro 
close_files(struct files_struct * files)412ce08b62dSOleg Nesterov static struct fdtable *close_files(struct files_struct * files)
4137cf4dc3cSAl Viro {
4147cf4dc3cSAl Viro 	/*
4157cf4dc3cSAl Viro 	 * It is safe to dereference the fd table without RCU or
4167cf4dc3cSAl Viro 	 * ->file_lock because this is the last reference to the
417ce08b62dSOleg Nesterov 	 * files structure.
4187cf4dc3cSAl Viro 	 */
419ce08b62dSOleg Nesterov 	struct fdtable *fdt = rcu_dereference_raw(files->fdt);
4209b80a184SAlexey Dobriyan 	unsigned int i, j = 0;
421ce08b62dSOleg Nesterov 
4227cf4dc3cSAl Viro 	for (;;) {
4237cf4dc3cSAl Viro 		unsigned long set;
4247cf4dc3cSAl Viro 		i = j * BITS_PER_LONG;
4257cf4dc3cSAl Viro 		if (i >= fdt->max_fds)
4267cf4dc3cSAl Viro 			break;
4277cf4dc3cSAl Viro 		set = fdt->open_fds[j++];
4287cf4dc3cSAl Viro 		while (set) {
4297cf4dc3cSAl Viro 			if (set & 1) {
4307cf4dc3cSAl Viro 				struct file * file = xchg(&fdt->fd[i], NULL);
4317cf4dc3cSAl Viro 				if (file) {
4327cf4dc3cSAl Viro 					filp_close(file, files);
433388a4c88SPaul E. McKenney 					cond_resched();
4347cf4dc3cSAl Viro 				}
4357cf4dc3cSAl Viro 			}
4367cf4dc3cSAl Viro 			i++;
4377cf4dc3cSAl Viro 			set >>= 1;
4387cf4dc3cSAl Viro 		}
4397cf4dc3cSAl Viro 	}
440ce08b62dSOleg Nesterov 
441ce08b62dSOleg Nesterov 	return fdt;
4427cf4dc3cSAl Viro }
4437cf4dc3cSAl Viro 
put_files_struct(struct files_struct * files)4447cf4dc3cSAl Viro void put_files_struct(struct files_struct *files)
4457cf4dc3cSAl Viro {
4467cf4dc3cSAl Viro 	if (atomic_dec_and_test(&files->count)) {
447ce08b62dSOleg Nesterov 		struct fdtable *fdt = close_files(files);
448ce08b62dSOleg Nesterov 
449b9e02af0SAl Viro 		/* free the arrays if they are not embedded */
450b9e02af0SAl Viro 		if (fdt != &files->fdtab)
451b9e02af0SAl Viro 			__free_fdtable(fdt);
452b9e02af0SAl Viro 		kmem_cache_free(files_cachep, files);
4537cf4dc3cSAl Viro 	}
4547cf4dc3cSAl Viro }
4557cf4dc3cSAl Viro 
exit_files(struct task_struct * tsk)4567cf4dc3cSAl Viro void exit_files(struct task_struct *tsk)
4577cf4dc3cSAl Viro {
4587cf4dc3cSAl Viro 	struct files_struct * files = tsk->files;
4597cf4dc3cSAl Viro 
4607cf4dc3cSAl Viro 	if (files) {
4617cf4dc3cSAl Viro 		task_lock(tsk);
4627cf4dc3cSAl Viro 		tsk->files = NULL;
4637cf4dc3cSAl Viro 		task_unlock(tsk);
4647cf4dc3cSAl Viro 		put_files_struct(files);
4657cf4dc3cSAl Viro 	}
4667cf4dc3cSAl Viro }
4677cf4dc3cSAl Viro 
468f52111b1SAl Viro struct files_struct init_files = {
469f52111b1SAl Viro 	.count		= ATOMIC_INIT(1),
470f52111b1SAl Viro 	.fdt		= &init_files.fdtab,
471f52111b1SAl Viro 	.fdtab		= {
472f52111b1SAl Viro 		.max_fds	= NR_OPEN_DEFAULT,
473f52111b1SAl Viro 		.fd		= &init_files.fd_array[0],
4741fd36adcSDavid Howells 		.close_on_exec	= init_files.close_on_exec_init,
4751fd36adcSDavid Howells 		.open_fds	= init_files.open_fds_init,
476f3f86e33SLinus Torvalds 		.full_fds_bits	= init_files.full_fds_bits_init,
477f52111b1SAl Viro 	},
478eece09ecSThomas Gleixner 	.file_lock	= __SPIN_LOCK_UNLOCKED(init_files.file_lock),
4795704a068SShuriyc Chu 	.resize_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
480f52111b1SAl Viro };
4811027abe8SAl Viro 
find_next_fd(struct fdtable * fdt,unsigned int start)4829b80a184SAlexey Dobriyan static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
483f3f86e33SLinus Torvalds {
4849b80a184SAlexey Dobriyan 	unsigned int maxfd = fdt->max_fds;
4859b80a184SAlexey Dobriyan 	unsigned int maxbit = maxfd / BITS_PER_LONG;
4869b80a184SAlexey Dobriyan 	unsigned int bitbit = start / BITS_PER_LONG;
487f3f86e33SLinus Torvalds 
488f3f86e33SLinus Torvalds 	bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
489f3f86e33SLinus Torvalds 	if (bitbit > maxfd)
490f3f86e33SLinus Torvalds 		return maxfd;
491f3f86e33SLinus Torvalds 	if (bitbit > start)
492f3f86e33SLinus Torvalds 		start = bitbit;
493f3f86e33SLinus Torvalds 	return find_next_zero_bit(fdt->open_fds, maxfd, start);
494f3f86e33SLinus Torvalds }
495f3f86e33SLinus Torvalds 
4961027abe8SAl Viro /*
4971027abe8SAl Viro  * allocate a file descriptor, mark it busy.
4981027abe8SAl Viro  */
alloc_fd(unsigned start,unsigned end,unsigned flags)499aa384d10SEric W. Biederman static int alloc_fd(unsigned start, unsigned end, unsigned flags)
5001027abe8SAl Viro {
501aa384d10SEric W. Biederman 	struct files_struct *files = current->files;
5021027abe8SAl Viro 	unsigned int fd;
5031027abe8SAl Viro 	int error;
5041027abe8SAl Viro 	struct fdtable *fdt;
5051027abe8SAl Viro 
5061027abe8SAl Viro 	spin_lock(&files->file_lock);
5071027abe8SAl Viro repeat:
5081027abe8SAl Viro 	fdt = files_fdtable(files);
5091027abe8SAl Viro 	fd = start;
5101027abe8SAl Viro 	if (fd < files->next_fd)
5111027abe8SAl Viro 		fd = files->next_fd;
5121027abe8SAl Viro 
5131027abe8SAl Viro 	if (fd < fdt->max_fds)
514f3f86e33SLinus Torvalds 		fd = find_next_fd(fdt, fd);
5151027abe8SAl Viro 
516f33ff992SAl Viro 	/*
517f33ff992SAl Viro 	 * N.B. For clone tasks sharing a files structure, this test
518f33ff992SAl Viro 	 * will limit the total number of files that can be opened.
519f33ff992SAl Viro 	 */
520f33ff992SAl Viro 	error = -EMFILE;
521f33ff992SAl Viro 	if (fd >= end)
522f33ff992SAl Viro 		goto out;
523f33ff992SAl Viro 
5241027abe8SAl Viro 	error = expand_files(files, fd);
5251027abe8SAl Viro 	if (error < 0)
5261027abe8SAl Viro 		goto out;
5271027abe8SAl Viro 
5281027abe8SAl Viro 	/*
5291027abe8SAl Viro 	 * If we needed to expand the fs array we
5301027abe8SAl Viro 	 * might have blocked - try again.
5311027abe8SAl Viro 	 */
5321027abe8SAl Viro 	if (error)
5331027abe8SAl Viro 		goto repeat;
5341027abe8SAl Viro 
5351027abe8SAl Viro 	if (start <= files->next_fd)
5361027abe8SAl Viro 		files->next_fd = fd + 1;
5371027abe8SAl Viro 
5381dce27c5SDavid Howells 	__set_open_fd(fd, fdt);
5391027abe8SAl Viro 	if (flags & O_CLOEXEC)
5401dce27c5SDavid Howells 		__set_close_on_exec(fd, fdt);
5411027abe8SAl Viro 	else
5421dce27c5SDavid Howells 		__clear_close_on_exec(fd, fdt);
5431027abe8SAl Viro 	error = fd;
5441027abe8SAl Viro #if 1
5451027abe8SAl Viro 	/* Sanity check */
546add1f099SPaul E. McKenney 	if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
5471027abe8SAl Viro 		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
5481027abe8SAl Viro 		rcu_assign_pointer(fdt->fd[fd], NULL);
5491027abe8SAl Viro 	}
5501027abe8SAl Viro #endif
5511027abe8SAl Viro 
5521027abe8SAl Viro out:
5531027abe8SAl Viro 	spin_unlock(&files->file_lock);
5541027abe8SAl Viro 	return error;
5551027abe8SAl Viro }
5561027abe8SAl Viro 
/*
 * Allocate the lowest free descriptor below 'nofile' for the current task.
 * 'flags' is handed to alloc_fd() (O_CLOEXEC is honoured there).  Note that
 * 'nofile' is narrowed to unsigned when passed on.
 * Returns the descriptor or a negative error code.
 */
int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
{
	int fd = alloc_fd(0, nofile, flags);

	return fd;
}
5614022e7afSJens Axboe 
get_unused_fd_flags(unsigned flags)5621a7bd226SAl Viro int get_unused_fd_flags(unsigned flags)
5631027abe8SAl Viro {
5644022e7afSJens Axboe 	return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE));
5651027abe8SAl Viro }
5661a7bd226SAl Viro EXPORT_SYMBOL(get_unused_fd_flags);
56756007caeSAl Viro 
__put_unused_fd(struct files_struct * files,unsigned int fd)56856007caeSAl Viro static void __put_unused_fd(struct files_struct *files, unsigned int fd)
56956007caeSAl Viro {
57056007caeSAl Viro 	struct fdtable *fdt = files_fdtable(files);
57156007caeSAl Viro 	__clear_open_fd(fd, fdt);
57256007caeSAl Viro 	if (fd < files->next_fd)
57356007caeSAl Viro 		files->next_fd = fd;
57456007caeSAl Viro }
57556007caeSAl Viro 
put_unused_fd(unsigned int fd)57656007caeSAl Viro void put_unused_fd(unsigned int fd)
57756007caeSAl Viro {
57856007caeSAl Viro 	struct files_struct *files = current->files;
57956007caeSAl Viro 	spin_lock(&files->file_lock);
58056007caeSAl Viro 	__put_unused_fd(files, fd);
58156007caeSAl Viro 	spin_unlock(&files->file_lock);
58256007caeSAl Viro }
58356007caeSAl Viro 
58456007caeSAl Viro EXPORT_SYMBOL(put_unused_fd);
58556007caeSAl Viro 
58656007caeSAl Viro /*
58756007caeSAl Viro  * Install a file pointer in the fd array.
58856007caeSAl Viro  *
58956007caeSAl Viro  * The VFS is full of places where we drop the files lock between
59056007caeSAl Viro  * setting the open_fds bitmap and installing the file in the file
59156007caeSAl Viro  * array.  At any such point, we are vulnerable to a dup2() race
59256007caeSAl Viro  * installing a file in the array before us.  We need to detect this and
59356007caeSAl Viro  * fput() the struct file we are about to overwrite in this case.
59456007caeSAl Viro  *
59556007caeSAl Viro  * It should never happen - if we allow dup2() do it, _really_ bad things
59656007caeSAl Viro  * will follow.
597f869e8a7SAl Viro  *
598d74ba04dSEric W. Biederman  * This consumes the "file" refcount, so callers should treat it
599d74ba04dSEric W. Biederman  * as if they had called fput(file).
60056007caeSAl Viro  */
60156007caeSAl Viro 
fd_install(unsigned int fd,struct file * file)602d74ba04dSEric W. Biederman void fd_install(unsigned int fd, struct file *file)
60356007caeSAl Viro {
604d74ba04dSEric W. Biederman 	struct files_struct *files = current->files;
60556007caeSAl Viro 	struct fdtable *fdt;
6068a81252bSEric Dumazet 
6078a81252bSEric Dumazet 	rcu_read_lock_sched();
6088a81252bSEric Dumazet 
609c02b1a9bSMateusz Guzik 	if (unlikely(files->resize_in_progress)) {
6108a81252bSEric Dumazet 		rcu_read_unlock_sched();
611c02b1a9bSMateusz Guzik 		spin_lock(&files->file_lock);
612c02b1a9bSMateusz Guzik 		fdt = files_fdtable(files);
613c02b1a9bSMateusz Guzik 		BUG_ON(fdt->fd[fd] != NULL);
614c02b1a9bSMateusz Guzik 		rcu_assign_pointer(fdt->fd[fd], file);
615c02b1a9bSMateusz Guzik 		spin_unlock(&files->file_lock);
616c02b1a9bSMateusz Guzik 		return;
6178a81252bSEric Dumazet 	}
6188a81252bSEric Dumazet 	/* coupled with smp_wmb() in expand_fdtable() */
6198a81252bSEric Dumazet 	smp_rmb();
6208a81252bSEric Dumazet 	fdt = rcu_dereference_sched(files->fdt);
62156007caeSAl Viro 	BUG_ON(fdt->fd[fd] != NULL);
62256007caeSAl Viro 	rcu_assign_pointer(fdt->fd[fd], file);
6238a81252bSEric Dumazet 	rcu_read_unlock_sched();
62456007caeSAl Viro }
62556007caeSAl Viro 
62656007caeSAl Viro EXPORT_SYMBOL(fd_install);
6270ee8cdfeSAl Viro 
628f49fd6d3SChristian Brauner /**
629f49fd6d3SChristian Brauner  * pick_file - return file associatd with fd
630f49fd6d3SChristian Brauner  * @files: file struct to retrieve file from
631f49fd6d3SChristian Brauner  * @fd: file descriptor to retrieve file for
632f49fd6d3SChristian Brauner  *
6336319194eSAl Viro  * Context: files_lock must be held.
634f49fd6d3SChristian Brauner  *
6356319194eSAl Viro  * Returns: The file associated with @fd (NULL if @fd is not open)
636f49fd6d3SChristian Brauner  */
pick_file(struct files_struct * files,unsigned fd)637278a5fbaSChristian Brauner static struct file *pick_file(struct files_struct *files, unsigned fd)
638483ce1d4SAl Viro {
6396319194eSAl Viro 	struct fdtable *fdt = files_fdtable(files);
640f49fd6d3SChristian Brauner 	struct file *file;
641483ce1d4SAl Viro 
6426319194eSAl Viro 	if (fd >= fdt->max_fds)
6436319194eSAl Viro 		return NULL;
6446319194eSAl Viro 
645609d5444STheodore Ts'o 	fd = array_index_nospec(fd, fdt->max_fds);
646483ce1d4SAl Viro 	file = fdt->fd[fd];
6476319194eSAl Viro 	if (file) {
648483ce1d4SAl Viro 		rcu_assign_pointer(fdt->fd[fd], NULL);
649483ce1d4SAl Viro 		__put_unused_fd(files, fd);
6506319194eSAl Viro 	}
651278a5fbaSChristian Brauner 	return file;
652278a5fbaSChristian Brauner }
653278a5fbaSChristian Brauner 
close_fd(unsigned fd)6548760c909SEric W. Biederman int close_fd(unsigned fd)
655278a5fbaSChristian Brauner {
6568760c909SEric W. Biederman 	struct files_struct *files = current->files;
657278a5fbaSChristian Brauner 	struct file *file;
658278a5fbaSChristian Brauner 
6596319194eSAl Viro 	spin_lock(&files->file_lock);
660278a5fbaSChristian Brauner 	file = pick_file(files, fd);
6616319194eSAl Viro 	spin_unlock(&files->file_lock);
6626319194eSAl Viro 	if (!file)
663483ce1d4SAl Viro 		return -EBADF;
664278a5fbaSChristian Brauner 
665278a5fbaSChristian Brauner 	return filp_close(file, files);
666483ce1d4SAl Viro }
6678760c909SEric W. Biederman EXPORT_SYMBOL(close_fd); /* for ksys_close() */
668483ce1d4SAl Viro 
6699b5b8722SChristian Brauner /**
6709b5b8722SChristian Brauner  * last_fd - return last valid index into fd table
671*35931eb3SMatthew Wilcox (Oracle)  * @fdt: File descriptor table.
6729b5b8722SChristian Brauner  *
6739b5b8722SChristian Brauner  * Context: Either rcu read lock or files_lock must be held.
6749b5b8722SChristian Brauner  *
6759b5b8722SChristian Brauner  * Returns: Last valid index into fdtable.
6769b5b8722SChristian Brauner  */
last_fd(struct fdtable * fdt)6779b5b8722SChristian Brauner static inline unsigned last_fd(struct fdtable *fdt)
6789b5b8722SChristian Brauner {
6799b5b8722SChristian Brauner 	return fdt->max_fds - 1;
6809b5b8722SChristian Brauner }
6819b5b8722SChristian Brauner 
__range_cloexec(struct files_struct * cur_fds,unsigned int fd,unsigned int max_fd)682582f1fb6SGiuseppe Scrivano static inline void __range_cloexec(struct files_struct *cur_fds,
683582f1fb6SGiuseppe Scrivano 				   unsigned int fd, unsigned int max_fd)
684582f1fb6SGiuseppe Scrivano {
685582f1fb6SGiuseppe Scrivano 	struct fdtable *fdt;
686582f1fb6SGiuseppe Scrivano 
6879b5b8722SChristian Brauner 	/* make sure we're using the correct maximum value */
688582f1fb6SGiuseppe Scrivano 	spin_lock(&cur_fds->file_lock);
689582f1fb6SGiuseppe Scrivano 	fdt = files_fdtable(cur_fds);
6909b5b8722SChristian Brauner 	max_fd = min(last_fd(fdt), max_fd);
6919b5b8722SChristian Brauner 	if (fd <= max_fd)
692582f1fb6SGiuseppe Scrivano 		bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);
693582f1fb6SGiuseppe Scrivano 	spin_unlock(&cur_fds->file_lock);
694582f1fb6SGiuseppe Scrivano }
695582f1fb6SGiuseppe Scrivano 
__range_close(struct files_struct * files,unsigned int fd,unsigned int max_fd)696ed192c59SMateusz Guzik static inline void __range_close(struct files_struct *files, unsigned int fd,
697582f1fb6SGiuseppe Scrivano 				 unsigned int max_fd)
698582f1fb6SGiuseppe Scrivano {
699ed192c59SMateusz Guzik 	struct file *file;
7006319194eSAl Viro 	unsigned n;
7016319194eSAl Viro 
702ed192c59SMateusz Guzik 	spin_lock(&files->file_lock);
703ed192c59SMateusz Guzik 	n = last_fd(files_fdtable(files));
7046319194eSAl Viro 	max_fd = min(max_fd, n);
7056319194eSAl Viro 
706ed192c59SMateusz Guzik 	for (; fd <= max_fd; fd++) {
707ed192c59SMateusz Guzik 		file = pick_file(files, fd);
7086319194eSAl Viro 		if (file) {
709ed192c59SMateusz Guzik 			spin_unlock(&files->file_lock);
710ed192c59SMateusz Guzik 			filp_close(file, files);
711582f1fb6SGiuseppe Scrivano 			cond_resched();
712ed192c59SMateusz Guzik 			spin_lock(&files->file_lock);
713ed192c59SMateusz Guzik 		} else if (need_resched()) {
714ed192c59SMateusz Guzik 			spin_unlock(&files->file_lock);
715ed192c59SMateusz Guzik 			cond_resched();
716ed192c59SMateusz Guzik 			spin_lock(&files->file_lock);
717f49fd6d3SChristian Brauner 		}
718582f1fb6SGiuseppe Scrivano 	}
719ed192c59SMateusz Guzik 	spin_unlock(&files->file_lock);
720582f1fb6SGiuseppe Scrivano }
721582f1fb6SGiuseppe Scrivano 
722278a5fbaSChristian Brauner /**
723278a5fbaSChristian Brauner  * __close_range() - Close all file descriptors in a given range.
724278a5fbaSChristian Brauner  *
725278a5fbaSChristian Brauner  * @fd:     starting file descriptor to close
726278a5fbaSChristian Brauner  * @max_fd: last file descriptor to close
727*35931eb3SMatthew Wilcox (Oracle)  * @flags:  CLOSE_RANGE flags.
728278a5fbaSChristian Brauner  *
729278a5fbaSChristian Brauner  * This closes a range of file descriptors. All file descriptors
730278a5fbaSChristian Brauner  * from @fd up to and including @max_fd are closed.
731278a5fbaSChristian Brauner  */
__close_range(unsigned fd,unsigned max_fd,unsigned int flags)73260997c3dSChristian Brauner int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
733278a5fbaSChristian Brauner {
73460997c3dSChristian Brauner 	struct task_struct *me = current;
73560997c3dSChristian Brauner 	struct files_struct *cur_fds = me->files, *fds = NULL;
73660997c3dSChristian Brauner 
737582f1fb6SGiuseppe Scrivano 	if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
73860997c3dSChristian Brauner 		return -EINVAL;
739278a5fbaSChristian Brauner 
740278a5fbaSChristian Brauner 	if (fd > max_fd)
741278a5fbaSChristian Brauner 		return -EINVAL;
742278a5fbaSChristian Brauner 
74360997c3dSChristian Brauner 	if (flags & CLOSE_RANGE_UNSHARE) {
74460997c3dSChristian Brauner 		int ret;
74560997c3dSChristian Brauner 		unsigned int max_unshare_fds = NR_OPEN_MAX;
74660997c3dSChristian Brauner 
74760997c3dSChristian Brauner 		/*
74803ba0fe4SChristian Brauner 		 * If the caller requested all fds to be made cloexec we always
74903ba0fe4SChristian Brauner 		 * copy all of the file descriptors since they still want to
75003ba0fe4SChristian Brauner 		 * use them.
75160997c3dSChristian Brauner 		 */
75203ba0fe4SChristian Brauner 		if (!(flags & CLOSE_RANGE_CLOEXEC)) {
75303ba0fe4SChristian Brauner 			/*
75403ba0fe4SChristian Brauner 			 * If the requested range is greater than the current
75503ba0fe4SChristian Brauner 			 * maximum, we're closing everything so only copy all
75603ba0fe4SChristian Brauner 			 * file descriptors beneath the lowest file descriptor.
75703ba0fe4SChristian Brauner 			 */
75803ba0fe4SChristian Brauner 			rcu_read_lock();
75903ba0fe4SChristian Brauner 			if (max_fd >= last_fd(files_fdtable(cur_fds)))
76060997c3dSChristian Brauner 				max_unshare_fds = fd;
76103ba0fe4SChristian Brauner 			rcu_read_unlock();
76203ba0fe4SChristian Brauner 		}
76360997c3dSChristian Brauner 
76460997c3dSChristian Brauner 		ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
76560997c3dSChristian Brauner 		if (ret)
76660997c3dSChristian Brauner 			return ret;
76760997c3dSChristian Brauner 
76860997c3dSChristian Brauner 		/*
76960997c3dSChristian Brauner 		 * We used to share our file descriptor table, and have now
77060997c3dSChristian Brauner 		 * created a private one, make sure we're using it below.
77160997c3dSChristian Brauner 		 */
77260997c3dSChristian Brauner 		if (fds)
77360997c3dSChristian Brauner 			swap(cur_fds, fds);
77460997c3dSChristian Brauner 	}
77560997c3dSChristian Brauner 
776582f1fb6SGiuseppe Scrivano 	if (flags & CLOSE_RANGE_CLOEXEC)
777582f1fb6SGiuseppe Scrivano 		__range_cloexec(cur_fds, fd, max_fd);
778582f1fb6SGiuseppe Scrivano 	else
779582f1fb6SGiuseppe Scrivano 		__range_close(cur_fds, fd, max_fd);
780278a5fbaSChristian Brauner 
78160997c3dSChristian Brauner 	if (fds) {
78260997c3dSChristian Brauner 		/*
78360997c3dSChristian Brauner 		 * We're done closing the files we were supposed to. Time to install
78460997c3dSChristian Brauner 		 * the new file descriptor table and drop the old one.
78560997c3dSChristian Brauner 		 */
78660997c3dSChristian Brauner 		task_lock(me);
78760997c3dSChristian Brauner 		me->files = cur_fds;
78860997c3dSChristian Brauner 		task_unlock(me);
78960997c3dSChristian Brauner 		put_files_struct(fds);
79060997c3dSChristian Brauner 	}
79160997c3dSChristian Brauner 
792278a5fbaSChristian Brauner 	return 0;
793278a5fbaSChristian Brauner }
794278a5fbaSChristian Brauner 
79580cd7956STodd Kjos /*
79653dec2eaSJens Axboe  * See close_fd_get_file() below, this variant assumes current->files->file_lock
79753dec2eaSJens Axboe  * is held.
79853dec2eaSJens Axboe  */
__close_fd_get_file(unsigned int fd)7996319194eSAl Viro struct file *__close_fd_get_file(unsigned int fd)
80053dec2eaSJens Axboe {
8016319194eSAl Viro 	return pick_file(current->files, fd);
80253dec2eaSJens Axboe }
80353dec2eaSJens Axboe 
80453dec2eaSJens Axboe /*
8059fe83c43SEric W. Biederman  * variant of close_fd that gets a ref on the file for later fput.
80640a19260SAl Viro  * The caller must ensure that filp_close() called on the file.
80780cd7956STodd Kjos  */
close_fd_get_file(unsigned int fd)8086319194eSAl Viro struct file *close_fd_get_file(unsigned int fd)
80980cd7956STodd Kjos {
81080cd7956STodd Kjos 	struct files_struct *files = current->files;
8116319194eSAl Viro 	struct file *file;
81280cd7956STodd Kjos 
81380cd7956STodd Kjos 	spin_lock(&files->file_lock);
8146319194eSAl Viro 	file = pick_file(files, fd);
81580cd7956STodd Kjos 	spin_unlock(&files->file_lock);
81680cd7956STodd Kjos 
8176319194eSAl Viro 	return file;
81880cd7956STodd Kjos }
81980cd7956STodd Kjos 
do_close_on_exec(struct files_struct * files)8206a6d27deSAl Viro void do_close_on_exec(struct files_struct *files)
8216a6d27deSAl Viro {
8226a6d27deSAl Viro 	unsigned i;
8236a6d27deSAl Viro 	struct fdtable *fdt;
8246a6d27deSAl Viro 
8256a6d27deSAl Viro 	/* exec unshares first */
8266a6d27deSAl Viro 	spin_lock(&files->file_lock);
8276a6d27deSAl Viro 	for (i = 0; ; i++) {
8286a6d27deSAl Viro 		unsigned long set;
8296a6d27deSAl Viro 		unsigned fd = i * BITS_PER_LONG;
8306a6d27deSAl Viro 		fdt = files_fdtable(files);
8316a6d27deSAl Viro 		if (fd >= fdt->max_fds)
8326a6d27deSAl Viro 			break;
8336a6d27deSAl Viro 		set = fdt->close_on_exec[i];
8346a6d27deSAl Viro 		if (!set)
8356a6d27deSAl Viro 			continue;
8366a6d27deSAl Viro 		fdt->close_on_exec[i] = 0;
8376a6d27deSAl Viro 		for ( ; set ; fd++, set >>= 1) {
8386a6d27deSAl Viro 			struct file *file;
8396a6d27deSAl Viro 			if (!(set & 1))
8406a6d27deSAl Viro 				continue;
8416a6d27deSAl Viro 			file = fdt->fd[fd];
8426a6d27deSAl Viro 			if (!file)
8436a6d27deSAl Viro 				continue;
8446a6d27deSAl Viro 			rcu_assign_pointer(fdt->fd[fd], NULL);
8456a6d27deSAl Viro 			__put_unused_fd(files, fd);
8466a6d27deSAl Viro 			spin_unlock(&files->file_lock);
8476a6d27deSAl Viro 			filp_close(file, files);
8486a6d27deSAl Viro 			cond_resched();
8496a6d27deSAl Viro 			spin_lock(&files->file_lock);
8506a6d27deSAl Viro 		}
8516a6d27deSAl Viro 
8526a6d27deSAl Viro 	}
8536a6d27deSAl Viro 	spin_unlock(&files->file_lock);
8546a6d27deSAl Viro }
8556a6d27deSAl Viro 
/*
 * Lockless lookup of @fd in @files that returns the file with a reference
 * taken, or NULL when @fd is out of range, has no file installed, or the
 * file's f_mode has any bit of @mask set.  The lookup is retried until a
 * stable result is obtained.
 */
static inline struct file *__fget_files_rcu(struct files_struct *files,
	unsigned int fd, fmode_t mask)
{
	for (;;) {
		struct fdtable *fdt = rcu_dereference_raw(files->fdt);
		struct file __rcu **slot;
		struct file *file;

		if (unlikely(fd >= fdt->max_fds))
			return NULL;

		slot = fdt->fd + array_index_nospec(fd, fdt->max_fds);
		file = rcu_dereference_raw(*slot);
		if (unlikely(!file) || unlikely(file->f_mode & mask))
			return NULL;

		/*
		 * Ok, we have a file pointer. However, because we do
		 * this all locklessly under RCU, we may be racing with
		 * that file being closed.
		 *
		 * Such a race can take two forms:
		 *
		 *  (a) the file ref already went down to zero,
		 *      and get_file_rcu() fails. Just try again:
		 */
		if (unlikely(!get_file_rcu(file)))
			continue;

		/*
		 *  (b) the file table entry has changed under us.
		 *       Note that we don't need to re-check the 'fdt->fd'
		 *       pointer having changed, because it always goes
		 *       hand-in-hand with 'fdt'.
		 *
		 * If neither the table nor the slot moved, we hold a ref to
		 * a file that is still installed: done.
		 */
		if (likely(rcu_dereference_raw(files->fdt) == fdt) &&
		    likely(rcu_dereference_raw(*slot) == file))
			return file;

		/* Something changed: put our ref and try again. */
		fput(file);
	}
}
909e386dfc5SLinus Torvalds 
/*
 * Look up @fd in @files inside an RCU read-side section and return the
 * file with a reference taken, or NULL (see __fget_files_rcu()).
 */
static struct file *__fget_files(struct files_struct *files, unsigned int fd,
				 fmode_t mask)
{
	struct file *found;

	rcu_read_lock();
	found = __fget_files_rcu(files, fd, mask);
	rcu_read_unlock();

	return found;
}
9210ee8cdfeSAl Viro 
/* __fget_files() applied to the current task's descriptor table. */
static inline struct file *__fget(unsigned int fd, fmode_t mask)
{
	struct files_struct *files = current->files;

	return __fget_files(files, fd, mask);
}
926091141a4SJens Axboe 
fget(unsigned int fd)9271deb46e2SOleg Nesterov struct file *fget(unsigned int fd)
9281deb46e2SOleg Nesterov {
92981132a39SGou Hao 	return __fget(fd, FMODE_PATH);
9301deb46e2SOleg Nesterov }
9310ee8cdfeSAl Viro EXPORT_SYMBOL(fget);
9320ee8cdfeSAl Viro 
fget_raw(unsigned int fd)9330ee8cdfeSAl Viro struct file *fget_raw(unsigned int fd)
9340ee8cdfeSAl Viro {
93581132a39SGou Hao 	return __fget(fd, 0);
9360ee8cdfeSAl Viro }
9370ee8cdfeSAl Viro EXPORT_SYMBOL(fget_raw);
9380ee8cdfeSAl Viro 
fget_task(struct task_struct * task,unsigned int fd)9395e876fb4SSargun Dhillon struct file *fget_task(struct task_struct *task, unsigned int fd)
9405e876fb4SSargun Dhillon {
9415e876fb4SSargun Dhillon 	struct file *file = NULL;
9425e876fb4SSargun Dhillon 
9435e876fb4SSargun Dhillon 	task_lock(task);
9445e876fb4SSargun Dhillon 	if (task->files)
94581132a39SGou Hao 		file = __fget_files(task->files, fd, 0);
9465e876fb4SSargun Dhillon 	task_unlock(task);
9475e876fb4SSargun Dhillon 
9485e876fb4SSargun Dhillon 	return file;
9495e876fb4SSargun Dhillon }
9505e876fb4SSargun Dhillon 
task_lookup_fd_rcu(struct task_struct * task,unsigned int fd)9513a879fb3SEric W. Biederman struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
9523a879fb3SEric W. Biederman {
9533a879fb3SEric W. Biederman 	/* Must be called with rcu_read_lock held */
9543a879fb3SEric W. Biederman 	struct files_struct *files;
9553a879fb3SEric W. Biederman 	struct file *file = NULL;
9563a879fb3SEric W. Biederman 
9573a879fb3SEric W. Biederman 	task_lock(task);
9583a879fb3SEric W. Biederman 	files = task->files;
9593a879fb3SEric W. Biederman 	if (files)
9603a879fb3SEric W. Biederman 		file = files_lookup_fd_rcu(files, fd);
9613a879fb3SEric W. Biederman 	task_unlock(task);
9623a879fb3SEric W. Biederman 
9633a879fb3SEric W. Biederman 	return file;
9643a879fb3SEric W. Biederman }
9653a879fb3SEric W. Biederman 
task_lookup_next_fd_rcu(struct task_struct * task,unsigned int * ret_fd)966e9a53aebSEric W. Biederman struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd)
967e9a53aebSEric W. Biederman {
968e9a53aebSEric W. Biederman 	/* Must be called with rcu_read_lock held */
969e9a53aebSEric W. Biederman 	struct files_struct *files;
970e9a53aebSEric W. Biederman 	unsigned int fd = *ret_fd;
971e9a53aebSEric W. Biederman 	struct file *file = NULL;
972e9a53aebSEric W. Biederman 
973e9a53aebSEric W. Biederman 	task_lock(task);
974e9a53aebSEric W. Biederman 	files = task->files;
975e9a53aebSEric W. Biederman 	if (files) {
976e9a53aebSEric W. Biederman 		for (; fd < files_fdtable(files)->max_fds; fd++) {
977e9a53aebSEric W. Biederman 			file = files_lookup_fd_rcu(files, fd);
978e9a53aebSEric W. Biederman 			if (file)
979e9a53aebSEric W. Biederman 				break;
980e9a53aebSEric W. Biederman 		}
981e9a53aebSEric W. Biederman 	}
982e9a53aebSEric W. Biederman 	task_unlock(task);
983e9a53aebSEric W. Biederman 	*ret_fd = fd;
984e9a53aebSEric W. Biederman 	return file;
985e9a53aebSEric W. Biederman }
9864480c27cSAndreas Gruenbacher EXPORT_SYMBOL(task_lookup_next_fd_rcu);
987e9a53aebSEric W. Biederman 
9880ee8cdfeSAl Viro /*
9890ee8cdfeSAl Viro  * Lightweight file lookup - no refcnt increment if fd table isn't shared.
9900ee8cdfeSAl Viro  *
9910ee8cdfeSAl Viro  * You can use this instead of fget if you satisfy all of the following
9920ee8cdfeSAl Viro  * conditions:
9930ee8cdfeSAl Viro  * 1) You must call fput_light before exiting the syscall and returning control
9940ee8cdfeSAl Viro  *    to userspace (i.e. you cannot remember the returned struct file * after
9950ee8cdfeSAl Viro  *    returning to userspace).
9960ee8cdfeSAl Viro  * 2) You must not call filp_close on the returned struct file * in between
9970ee8cdfeSAl Viro  *    calls to fget_light and fput_light.
9980ee8cdfeSAl Viro  * 3) You must not clone the current task in between the calls to fget_light
9990ee8cdfeSAl Viro  *    and fput_light.
10000ee8cdfeSAl Viro  *
10010ee8cdfeSAl Viro  * The fput_needed flag returned by fget_light should be passed to the
10020ee8cdfeSAl Viro  * corresponding fput_light.
10030ee8cdfeSAl Viro  */
__fget_light(unsigned int fd,fmode_t mask)1004bd2a31d5SAl Viro static unsigned long __fget_light(unsigned int fd, fmode_t mask)
10050ee8cdfeSAl Viro {
10060ee8cdfeSAl Viro 	struct files_struct *files = current->files;
1007ad461834SOleg Nesterov 	struct file *file;
10080ee8cdfeSAl Viro 
10097ee47dcfSJann Horn 	/*
10107ee47dcfSJann Horn 	 * If another thread is concurrently calling close_fd() followed
10117ee47dcfSJann Horn 	 * by put_files_struct(), we must not observe the old table
10127ee47dcfSJann Horn 	 * entry combined with the new refcount - otherwise we could
10137ee47dcfSJann Horn 	 * return a file that is concurrently being freed.
10147ee47dcfSJann Horn 	 *
10157ee47dcfSJann Horn 	 * atomic_read_acquire() pairs with atomic_dec_and_test() in
10167ee47dcfSJann Horn 	 * put_files_struct().
10177ee47dcfSJann Horn 	 */
10187ee47dcfSJann Horn 	if (atomic_read_acquire(&files->count) == 1) {
1019bebf684bSEric W. Biederman 		file = files_lookup_fd_raw(files, fd);
1020bd2a31d5SAl Viro 		if (!file || unlikely(file->f_mode & mask))
1021bd2a31d5SAl Viro 			return 0;
1022bd2a31d5SAl Viro 		return (unsigned long)file;
10230ee8cdfeSAl Viro 	} else {
102481132a39SGou Hao 		file = __fget(fd, mask);
1025bd2a31d5SAl Viro 		if (!file)
1026bd2a31d5SAl Viro 			return 0;
1027bd2a31d5SAl Viro 		return FDPUT_FPUT | (unsigned long)file;
1028bd2a31d5SAl Viro 	}
1029bd2a31d5SAl Viro }
__fdget(unsigned int fd)1030bd2a31d5SAl Viro unsigned long __fdget(unsigned int fd)
1031bd2a31d5SAl Viro {
1032bd2a31d5SAl Viro 	return __fget_light(fd, FMODE_PATH);
1033bd2a31d5SAl Viro }
1034bd2a31d5SAl Viro EXPORT_SYMBOL(__fdget);
1035bd2a31d5SAl Viro 
__fdget_raw(unsigned int fd)1036bd2a31d5SAl Viro unsigned long __fdget_raw(unsigned int fd)
1037bd2a31d5SAl Viro {
1038bd2a31d5SAl Viro 	return __fget_light(fd, 0);
10390ee8cdfeSAl Viro }
10400ee8cdfeSAl Viro 
1041bd2a31d5SAl Viro /*
1042ad461834SOleg Nesterov  * Try to avoid f_pos locking. We only need it if the
104399aea681SEric Biggers  * file is marked for FMODE_ATOMIC_POS, and it can be
104499aea681SEric Biggers  * accessed multiple ways.
10450ee8cdfeSAl Viro  *
10462be7d348SLinus Torvalds  * Always do it for directories, because pidfd_getfd()
1047bd2a31d5SAl Viro  * can make a file accessible even if it otherwise would
1048bd2a31d5SAl Viro  * not be, and for directories this is a correctness
1049bd2a31d5SAl Viro  * issue, not a "POSIX requirement".
1050bd2a31d5SAl Viro  */
file_needs_f_pos_lock(struct file * file)1051bd2a31d5SAl Viro static inline bool file_needs_f_pos_lock(struct file *file)
105299aea681SEric Biggers {
1053bd2a31d5SAl Viro 	return (file->f_mode & FMODE_ATOMIC_POS) &&
1054bd2a31d5SAl Viro 		(file_count(file) > 1 || file->f_op->iterate_shared);
105563b6df14SAl Viro }
105663b6df14SAl Viro 
__fdget_pos(unsigned int fd)105763b6df14SAl Viro unsigned long __fdget_pos(unsigned int fd)
105863b6df14SAl Viro {
105963b6df14SAl Viro 	unsigned long v = __fdget(fd);
1060bd2a31d5SAl Viro 	struct file *file = (struct file *)(v & ~3);
1061bd2a31d5SAl Viro 
1062bd2a31d5SAl Viro 	if (file && file_needs_f_pos_lock(file)) {
1063bd2a31d5SAl Viro 		v |= FDPUT_POS_UNLOCK;
1064bd2a31d5SAl Viro 		mutex_lock(&file->f_pos_lock);
1065fe17f22dSAl Viro 	}
1066fe17f22dSAl Viro 	return v;
1067fe17f22dSAl Viro }
1068fe17f22dSAl Viro 
__f_unlock_pos(struct file * f)1069fe17f22dSAl Viro void __f_unlock_pos(struct file *f)
1070fe17f22dSAl Viro {
1071fe17f22dSAl Viro 	mutex_unlock(&f->f_pos_lock);
1072fe17f22dSAl Viro }
1073fe17f22dSAl Viro 
/*
 * Note on f_pos locking (see __fdget_pos() above): we only lock f_pos if
 * we have threads or if the file might be shared with another process.
 * In both cases we'll have an elevated file count (done either by
 * fdget() or by fork()).
 */
1079fe17f22dSAl Viro 
set_close_on_exec(unsigned int fd,int flag)1080fe17f22dSAl Viro void set_close_on_exec(unsigned int fd, int flag)
1081fe17f22dSAl Viro {
1082fe17f22dSAl Viro 	struct files_struct *files = current->files;
1083fe17f22dSAl Viro 	struct fdtable *fdt;
1084fe17f22dSAl Viro 	spin_lock(&files->file_lock);
1085fe17f22dSAl Viro 	fdt = files_fdtable(files);
1086fe17f22dSAl Viro 	if (flag)
1087fe17f22dSAl Viro 		__set_close_on_exec(fd, fdt);
1088fe17f22dSAl Viro 	else
1089fe17f22dSAl Viro 		__clear_close_on_exec(fd, fdt);
1090fe17f22dSAl Viro 	spin_unlock(&files->file_lock);
10918280d161SAl Viro }
10928280d161SAl Viro 
get_close_on_exec(unsigned int fd)1093e983094dSAl Viro bool get_close_on_exec(unsigned int fd)
10948280d161SAl Viro {
10958280d161SAl Viro 	struct files_struct *files = current->files;
10968280d161SAl Viro 	struct fdtable *fdt;
10978280d161SAl Viro 	bool res;
10988280d161SAl Viro 	rcu_read_lock();
10998280d161SAl Viro 	fdt = files_fdtable(files);
11008280d161SAl Viro 	res = close_on_exec(fd, fdt);
11018280d161SAl Viro 	rcu_read_unlock();
11028280d161SAl Viro 	return res;
11038280d161SAl Viro }
11048280d161SAl Viro 
do_dup2(struct files_struct * files,struct file * file,unsigned fd,unsigned flags)11058280d161SAl Viro static int do_dup2(struct files_struct *files,
11068280d161SAl Viro 	struct file *file, unsigned fd, unsigned flags)
11078280d161SAl Viro __releases(&files->file_lock)
11088280d161SAl Viro {
11098280d161SAl Viro 	struct file *tofree;
11108280d161SAl Viro 	struct fdtable *fdt;
11118280d161SAl Viro 
11128280d161SAl Viro 	/*
11138280d161SAl Viro 	 * We need to detect attempts to do dup2() over allocated but still
11148280d161SAl Viro 	 * not finished descriptor.  NB: OpenBSD avoids that at the price of
11158280d161SAl Viro 	 * extra work in their equivalent of fget() - they insert struct
11168280d161SAl Viro 	 * file immediately after grabbing descriptor, mark it larval if
11178280d161SAl Viro 	 * more work (e.g. actual opening) is needed and make sure that
11188280d161SAl Viro 	 * fget() treats larval files as absent.  Potentially interesting,
11198280d161SAl Viro 	 * but while extra work in fget() is trivial, locking implications
11208280d161SAl Viro 	 * and amount of surgery on open()-related paths in VFS are not.
11218280d161SAl Viro 	 * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
11228280d161SAl Viro 	 * deadlocks in rather amusing ways, AFAICS.  All of that is out of
11238280d161SAl Viro 	 * scope of POSIX or SUS, since neither considers shared descriptor
11248280d161SAl Viro 	 * tables and this condition does not arise without those.
11258280d161SAl Viro 	 */
11268280d161SAl Viro 	fdt = files_fdtable(files);
11278280d161SAl Viro 	tofree = fdt->fd[fd];
11288280d161SAl Viro 	if (!tofree && fd_is_open(fd, fdt))
11298280d161SAl Viro 		goto Ebusy;
11308280d161SAl Viro 	get_file(file);
11318280d161SAl Viro 	rcu_assign_pointer(fdt->fd[fd], file);
11328280d161SAl Viro 	__set_open_fd(fd, fdt);
11338280d161SAl Viro 	if (flags & O_CLOEXEC)
11348280d161SAl Viro 		__set_close_on_exec(fd, fdt);
11358280d161SAl Viro 	else
11368280d161SAl Viro 		__clear_close_on_exec(fd, fdt);
11378280d161SAl Viro 	spin_unlock(&files->file_lock);
11388280d161SAl Viro 
11398280d161SAl Viro 	if (tofree)
11408280d161SAl Viro 		filp_close(tofree, files);
11418760c909SEric W. Biederman 
11428280d161SAl Viro 	return fd;
11438280d161SAl Viro 
114408f05c49SAl Viro Ebusy:
11458280d161SAl Viro 	spin_unlock(&files->file_lock);
11468280d161SAl Viro 	return -EBUSY;
11478280d161SAl Viro }
11488280d161SAl Viro 
replace_fd(unsigned fd,struct file * file,unsigned flags)11498280d161SAl Viro int replace_fd(unsigned fd, struct file *file, unsigned flags)
11508280d161SAl Viro {
11518280d161SAl Viro 	int err;
11528280d161SAl Viro 	struct files_struct *files = current->files;
11538280d161SAl Viro 
11548280d161SAl Viro 	if (!file)
11558280d161SAl Viro 		return close_fd(fd);
11568280d161SAl Viro 
115766590610SKees Cook 	if (fd >= rlimit(RLIMIT_NOFILE))
115866590610SKees Cook 		return -EBADF;
115966590610SKees Cook 
116066590610SKees Cook 	spin_lock(&files->file_lock);
116166590610SKees Cook 	err = expand_files(files, fd);
116266590610SKees Cook 	if (unlikely(err < 0))
116366590610SKees Cook 		goto out_unlock;
1164deefa7f3SKees Cook 	return do_dup2(files, file, fd, flags);
1165deefa7f3SKees Cook 
116666590610SKees Cook out_unlock:
116766590610SKees Cook 	spin_unlock(&files->file_lock);
116866590610SKees Cook 	return err;
116966590610SKees Cook }
1170deefa7f3SKees Cook 
117166590610SKees Cook /**
117242eb0d54SChristoph Hellwig  * __receive_fd() - Install received file into file descriptor table
117366590610SKees Cook  * @file: struct file that was received from another process
117466590610SKees Cook  * @ufd: __user pointer to write new fd number to
117566590610SKees Cook  * @o_flags: the O_* flags to apply to the new fd entry
117666590610SKees Cook  *
117766590610SKees Cook  * Installs a received file into the file descriptor table, with appropriate
117866590610SKees Cook  * checks and count updates. Optionally writes the fd number to userspace, if
117966590610SKees Cook  * @ufd is non-NULL.
118066590610SKees Cook  *
118166590610SKees Cook  * This helper handles its own reference counting of the incoming
118266590610SKees Cook  * struct file.
118366590610SKees Cook  *
118466590610SKees Cook  * Returns newly install fd or -ve on error.
1185deefa7f3SKees Cook  */
__receive_fd(struct file * file,int __user * ufd,unsigned int o_flags)118666590610SKees Cook int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
118766590610SKees Cook {
118866590610SKees Cook 	int new_fd;
118966590610SKees Cook 	int error;
119066590610SKees Cook 
1191deefa7f3SKees Cook 	error = security_file_receive(file);
119266590610SKees Cook 	if (error)
119317381715SKees Cook 		return error;
119442eb0d54SChristoph Hellwig 
119542eb0d54SChristoph Hellwig 	new_fd = get_unused_fd_flags(o_flags);
119642eb0d54SChristoph Hellwig 	if (new_fd < 0)
119742eb0d54SChristoph Hellwig 		return new_fd;
119842eb0d54SChristoph Hellwig 
119942eb0d54SChristoph Hellwig 	if (ufd) {
120042eb0d54SChristoph Hellwig 		error = put_user(new_fd, ufd);
120142eb0d54SChristoph Hellwig 		if (error) {
120242eb0d54SChristoph Hellwig 			put_unused_fd(new_fd);
120342eb0d54SChristoph Hellwig 			return error;
120442eb0d54SChristoph Hellwig 		}
120517381715SKees Cook 	}
120617381715SKees Cook 
120717381715SKees Cook 	fd_install(new_fd, get_file(file));
120866590610SKees Cook 	__receive_sock(file);
1209deefa7f3SKees Cook 	return new_fd;
121066590610SKees Cook }
121166590610SKees Cook 
/*
 * receive_fd_replace() - install a received file at a specific descriptor
 * @new_fd: descriptor number to (re)use
 * @file: struct file that was received from another process
 * @o_flags: the O_* flags to apply to the fd entry
 *
 * Runs the security_file_receive() check, installs @file at @new_fd via
 * replace_fd() and then calls __receive_sock() on it.
 *
 * replace_fd() passes back do_dup2()'s result, which on success is the
 * (non-negative) descriptor number rather than zero.  Only a negative
 * value is therefore a failure; treating any non-zero value as an error
 * would bail out for every descriptor other than 0 and skip
 * __receive_sock().
 *
 * Returns @new_fd on success or a negative errno.
 */
int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
{
	int error;

	error = security_file_receive(file);
	if (error)
		return error;

	error = replace_fd(new_fd, file, o_flags);
	if (error < 0)
		return error;

	__receive_sock(file);
	return new_fd;
}
1225fe17f22dSAl Viro 
receive_fd(struct file * file,unsigned int o_flags)1226fe17f22dSAl Viro int receive_fd(struct file *file, unsigned int o_flags)
1227aed97647SRichard W.M. Jones {
1228aed97647SRichard W.M. Jones 	return __receive_fd(file, NULL, o_flags);
1229aed97647SRichard W.M. Jones }
1230fe17f22dSAl Viro EXPORT_SYMBOL_GPL(receive_fd);
123108f05c49SAl Viro 
ksys_dup3(unsigned int oldfd,unsigned int newfd,int flags)1232fe17f22dSAl Viro static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
1233fe17f22dSAl Viro {
1234fe17f22dSAl Viro 	int err = -EBADF;
1235120ce2b0SEric W. Biederman 	struct file *file;
1236fe17f22dSAl Viro 	struct files_struct *files = current->files;
1237fe17f22dSAl Viro 
1238fe17f22dSAl Viro 	if ((flags & ~O_CLOEXEC) != 0)
1239fe17f22dSAl Viro 		return -EINVAL;
1240fe17f22dSAl Viro 
1241fe17f22dSAl Viro 	if (unlikely(oldfd == newfd))
1242fe17f22dSAl Viro 		return -EINVAL;
12438280d161SAl Viro 
1244fe17f22dSAl Viro 	if (newfd >= rlimit(RLIMIT_NOFILE))
1245fe17f22dSAl Viro 		return -EBADF;
1246fe17f22dSAl Viro 
1247fe17f22dSAl Viro 	spin_lock(&files->file_lock);
1248fe17f22dSAl Viro 	err = expand_files(files, newfd);
1249fe17f22dSAl Viro 	file = files_lookup_fd_locked(files, oldfd);
1250fe17f22dSAl Viro 	if (unlikely(!file))
1251fe17f22dSAl Viro 		goto Ebadf;
1252c7248321SDominik Brodowski 	if (unlikely(err < 0)) {
1253c7248321SDominik Brodowski 		if (err == -EMFILE)
1254c7248321SDominik Brodowski 			goto Ebadf;
1255c7248321SDominik Brodowski 		goto out_unlock;
1256c7248321SDominik Brodowski 	}
1257fe17f22dSAl Viro 	return do_dup2(files, file, newfd, flags);
1258fe17f22dSAl Viro 
1259fe17f22dSAl Viro Ebadf:
1260fe17f22dSAl Viro 	err = -EBADF;
1261fe17f22dSAl Viro out_unlock:
1262fe17f22dSAl Viro 	spin_unlock(&files->file_lock);
1263fe17f22dSAl Viro 	return err;
1264f36c2943SEric W. Biederman }
1265fe17f22dSAl Viro 
/*
 * dup3() syscall entry point: forwards all three arguments to ksys_dup3()
 * unchanged and returns its result.
 */
SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
{
	return ksys_dup3(oldfd, newfd, flags);
}
1270fe17f22dSAl Viro 
/*
 * dup2() syscall entry point.
 *
 * When @newfd equals @oldfd nothing is duplicated: the descriptor is only
 * looked up under rcu_read_lock(), and the call returns @oldfd if a file is
 * found there or -EBADF if not.  ksys_dup3() rejects equal descriptors with
 * -EINVAL, so this case has to be handled before calling it.
 *
 * Every other case is forwarded to ksys_dup3() with flags set to 0.
 */
SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
	if (unlikely(newfd == oldfd)) { /* corner case */
		struct files_struct *files = current->files;
		int retval = oldfd;

		rcu_read_lock();
		if (!files_lookup_fd_rcu(files, oldfd))
			retval = -EBADF;
		rcu_read_unlock();
		return retval;
	}
	return ksys_dup3(oldfd, newfd, 0);
}
1285fe17f22dSAl Viro 
/*
 * dup() syscall entry point.
 *
 * Looks up @fildes with fget_raw(); if that yields no file the call returns
 * -EBADF.  Otherwise a descriptor is requested via get_unused_fd_flags(0).
 * On success the file obtained from fget_raw() is passed to fd_install()
 * and the new descriptor number is returned.  On failure the file is
 * released with fput() and the negative value from get_unused_fd_flags()
 * is returned.
 */
SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
	int ret = -EBADF;
	struct file *file = fget_raw(fildes);

	if (file) {
		ret = get_unused_fd_flags(0);
		if (ret >= 0)
			fd_install(ret, file);
		else
			fput(file);
	}
	return ret;
}
1300c3c073f8SAl Viro 
f_dupfd(unsigned int from,struct file * file,unsigned flags)1301c3c073f8SAl Viro int f_dupfd(unsigned int from, struct file *file, unsigned flags)
1302c3c073f8SAl Viro {
1303c3c073f8SAl Viro 	unsigned long nofile = rlimit(RLIMIT_NOFILE);
1304c3c073f8SAl Viro 	int err;
1305c3c073f8SAl Viro 	if (from >= nofile)
1306c3c073f8SAl Viro 		return -EINVAL;
1307c3c073f8SAl Viro 	err = alloc_fd(from, nofile, flags);
1308c3c073f8SAl Viro 	if (err >= 0) {
1309c3c073f8SAl Viro 		get_file(file);
1310a77cfcb4SAl Viro 		fd_install(err, file);
1311a77cfcb4SAl Viro 	}
1312a77cfcb4SAl Viro 	return err;
1313a77cfcb4SAl Viro }
1314a77cfcb4SAl Viro 
iterate_fd(struct files_struct * files,unsigned n,int (* f)(const void *,struct file *,unsigned),const void * p)1315c3c073f8SAl Viro int iterate_fd(struct files_struct *files, unsigned n,
1316a77cfcb4SAl Viro 		int (*f)(const void *, struct file *, unsigned),
1317a77cfcb4SAl Viro 		const void *p)
1318c3c073f8SAl Viro {
1319c3c073f8SAl Viro 	struct fdtable *fdt;
1320c3c073f8SAl Viro 	int res = 0;
1321c3c073f8SAl Viro 	if (!files)
1322c3c073f8SAl Viro 		return 0;
1323 	spin_lock(&files->file_lock);
1324 	for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
1325 		struct file *file;
1326 		file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
1327 		if (!file)
1328 			continue;
1329 		res = f(p, file, n);
1330 		if (res)
1331 			break;
1332 	}
1333 	spin_unlock(&files->file_lock);
1334 	return res;
1335 }
1336 EXPORT_SYMBOL(iterate_fd);
1337